In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import csv

In [28]:
moisture_data = pd.read_csv('temparature.csv', parse_dates=['Date'])
temperature_data = pd.read_csv('moisture.csv', parse_dates=['Date'])
diameter_data = pd.read_csv('diameter.csv', parse_dates=['Date'])

  diameter_data = pd.read_csv('diameter.csv', parse_dates=['Date'])


In [29]:
# Merge the datasets
moisture_data.columns = ['Date', 'Moisture_I0', 'Moisture_I3', 'Moisture_I5']
temperature_data.columns = ['Date', 'Temp_I0', 'Temp_I3', 'Temp_I5']
diameter_data.columns = ['Date', 'Diameter_I0', 'Diameter_I3', 'Diameter_I5']

merged_data = pd.merge(moisture_data, temperature_data, on='Date', how='inner')
merged_data = pd.merge(merged_data, diameter_data, on='Date', how='inner')

In [34]:
# Fill missing values using forward fill
merged_data = merged_data.ffill()


In [35]:
data.head

<bound method NDFrame.head of          Date  DayOfYear     Variable  Value
0  2022-03-31         90  Moisture_I0  31.10
1  2022-04-18        108  Moisture_I0  32.65
2  2022-04-29        119  Moisture_I0  29.70
3  2022-03-31         90      Temp_I0  17.65
4  2022-04-18        108      Temp_I0  16.71
5  2022-04-29        119      Temp_I0  16.91
6  2022-03-31         90  Moisture_I3  31.23
7  2022-04-18        108  Moisture_I3  31.27
8  2022-04-29        119  Moisture_I3  31.20
9  2022-03-31         90      Temp_I3   5.73
10 2022-04-18        108      Temp_I3  18.12
11 2022-04-29        119      Temp_I3   1.98
12 2022-03-31         90  Moisture_I5  32.10
13 2022-04-18        108  Moisture_I5  31.53
14 2022-04-29        119  Moisture_I5  31.03
15 2022-03-31         90      Temp_I5   7.03
16 2022-04-18        108      Temp_I5   6.53
17 2022-04-29        119      Temp_I5   6.26
18 2022-03-31         90  Diameter_I0   5.02
19 2022-04-18        108  Diameter_I0   5.80
20 2022-04-29        119 

In [36]:
# Feature engineering
merged_data['DayOfYear'] = merged_data['Date'].dt.dayofyear

In [39]:
# Reshape data for ML
data_list = []
for loc in ['I0', 'I3', 'I5']:
    temp_df = merged_data[['Moisture_' + loc, 'Temp_' + loc, 'Diameter_' + loc, 'DayOfYear']].copy()
    temp_df.columns = ['Moisture', 'Temperature', 'Diameter', 'DayOfYear']
    temp_df['Location'] = loc
    data_list.append(temp_df)

final_data = pd.concat(data_list, axis=0)

In [43]:
final_data

Unnamed: 0,Moisture,Temperature,Diameter,DayOfYear,Location
0,31.1,17.65,5.02,90,I0
1,32.65,16.71,5.8,108,I0
2,29.7,16.91,6.66,119,I0
0,31.23,5.73,4.93,90,I3
1,31.27,18.12,5.56,108,I3
2,31.2,1.98,6.31,119,I3
0,32.1,7.03,4.96,90,I5
1,31.53,6.53,5.43,108,I5
2,31.03,6.26,6.06,119,I5


In [44]:
# Separate features and target
X = final_data[['Moisture', 'Temperature', 'DayOfYear', 'Location']]
y = final_data['Diameter']


In [45]:
# Encode categorical data
X = pd.get_dummies(X, columns=['Location'], drop_first=True)

In [46]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [48]:
# Define models to evaluate
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [49]:
# Train and evaluate models
results = []
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Predict
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append([model_name, mae, mse, r2])

    print(f"Model: {model_name}")
    print(f"  Mean Absolute Error (MAE): {mae}")
    print(f"  Mean Squared Error (MSE): {mse}")
    print(f"  R-Squared (R2): {r2}")
    print()


Model: Linear Regression
  Mean Absolute Error (MAE): 0.1454503580552564
  Mean Squared Error (MSE): 0.029529467868597996
  R-Squared (R2): 0.13719597169910946

Model: Random Forest
  Mean Absolute Error (MAE): 0.17194999999999938
  Mean Squared Error (MSE): 0.032370505000000126
  R-Squared (R2): 0.05418539079619844

Model: Gradient Boosting
  Mean Absolute Error (MAE): 0.4204869618816631
  Mean Squared Error (MSE): 0.20743533765132374
  R-Squared (R2): -5.060930245473298



In [50]:
# Save results to CSV
results_df = pd.DataFrame(results, columns=['Model', 'MAE', 'MSE', 'R2'])
results_df.to_csv('model_evaluation_results.csv', index=False)

In [51]:
# Select the best model (based on R²)
best_model_name = results_df.sort_values(by='R2', ascending=False).iloc[0]['Model']
best_model = models[best_model_name]
print(f"The best model is: {best_model_name}")

The best model is: Linear Regression


In [52]:
# Predict using the best model for sample data
sample_data = pd.DataFrame({
    'Moisture': [16.0, 15.8],
    'Temperature': [31.5, 32.1],
    'DayOfYear': [90, 91],
    'Location_I3': [0, 1],
    'Location_I5': [1, 0]
})


In [53]:
# Scale the sample data
sample_scaled = scaler.transform(sample_data)

In [54]:
sample_scaled

array([[-22.97390328,   3.3424987 ,  -1.11274775,  -0.8660254 ,
          1.58113883],
       [-23.27839504,   3.43811574,  -1.03856456,   1.15470054,
         -0.63245553]])

In [55]:
# Predict
sample_predictions = best_model.predict(sample_scaled)
sample_data['Predicted_Diameter'] = sample_predictions

In [56]:
# Save sample predictions to CSV
sample_data.to_csv('sample_predictions.csv', index=False)
print("Sample predictions saved to 'sample_predictions.csv'.")

Sample predictions saved to 'sample_predictions.csv'.


In [60]:


# Calculate accuracy as a percentage
def calculate_accuracy(y_true, y_pred):
    # Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    accuracy = 100 - mape
    return accuracy


# Predicting with the test data
y_test_pred = best_model.predict(X_test_scaled)

# Calculate accuracy in percentage
accuracy_percentage = calculate_accuracy(y_test, y_test_pred)

# Print the accuracy
print(f"Model Accuracy: {accuracy_percentage:.2f}%")

# Save the accuracy result to a CSV file
accuracy_results = pd.DataFrame({
    "Metric": ["Accuracy (%)"],
    "Value": [accuracy_percentage]
})
accuracy_results.to_csv('accuracy_percentage_results.csv', index=False)
print("Accuracy percentage saved to 'accuracy_percentage_results.csv'.")


Model Accuracy: 97.35%
Accuracy percentage saved to 'accuracy_percentage_results.csv'.
