# **Load the dataset**

In [None]:
import pandas as pd
import os

In [None]:
# !unzip -q /content/Weather_Data.zip -d /content/data/
# Folder that holds the input CSV files. Set the AGRIWEATHER_DATA_DIR
# environment variable to run the notebook on another machine; the original
# local Windows path is kept as the fallback so existing setups keep working.
data_directory = os.environ.get(
    'AGRIWEATHER_DATA_DIR',
    r'D:\Code Stuff\Agriweather Wizard\Data',
)



In [None]:
def _read_table(file_name):
    """Read one CSV file located in data_directory into a DataFrame."""
    return pd.read_csv(os.path.join(data_directory, file_name))


# Four input tables: crop yield, NDVI, Sentinel-1 and weather.
data1 = _read_table('Crop_Yield_Data_challenge_2.csv')
data2 = _read_table('ndvi.csv')
data3 = _read_table('Sentinel_1.csv')
# Three columns are removed from the weather table before it is merged;
# the labels are spelled exactly as in the drop call (presumably matching the CSV header).
data4 = _read_table('Weather_Data.csv').drop(columns=['Lattitude', 'Longtitude', 'Season'])

In [None]:
# Place the four tables side by side (column-wise); rows are aligned on the index.
data = pd.concat((data1, data2, data3, data4), axis='columns')

In [None]:
# Display the column labels of the concatenated frame.
data.columns


One-hot encoding

In [None]:
# Categorical columns to expand into indicator columns, encoded one at a time
# in this order.
categorical_columns = [
    'District',
    'Season(SA = Summer Autumn, WS = Winter Spring)',
    'Rice Crop Intensity(D=Double, T=Triple)',
]
for categorical_column in categorical_columns:
    data = pd.get_dummies(data, columns=[categorical_column])

Convert the harvest date into calendar feature columns

In [None]:
import pandas as pd

# Parse the harvest date, which is stored as day-month-year text.
data['Date of Harvest'] = pd.to_datetime(data['Date of Harvest'], format='%d-%m-%Y')

# Derive one numeric column per calendar component of the harvest date.
when = data['Date of Harvest'].dt
calendar_features = {
    'Year': when.year,
    'Quarter': when.quarter,
    'Month': when.month,
    'Day of Year': when.dayofyear,
    'Day of Month': when.day,
    'Day of Week': when.dayofweek,
    'Week of Year': when.isocalendar().week,
}
for feature_name, feature_values in calendar_features.items():
    data[feature_name] = feature_values

# The raw date is no longer needed once its components are extracted.
data = data.drop(columns=['Date of Harvest'])

# **Replace missing values** (**KNNImputer**)

In [None]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import numpy as np


In [None]:
# Count the missing values in every column of `data`.
print(data.isna().sum())

In [None]:
# Fill missing values with KNN imputation (5 neighbours), fitted on every column of `data`.
# NOTE(review): `data` still contains the target column 'Rice Yield (kg/ha)' and has not
# been split into train/test yet, so the imputer sees the target and the future test rows.
# Consider fitting the imputer on the training features only.
imputer = KNNImputer(n_neighbors=5)
df_imputed = imputer.fit_transform(data)
# fit_transform returns a plain NumPy array. Rebuild the frame with the original column
# labels AND the original row index, because predictions are later written back into
# `data` using the index of the test split taken from this frame.
df = pd.DataFrame(df_imputed, columns=data.columns, index=data.index)

In [None]:
# Separate the imputed frame into the target series and the feature matrix.
target_column = 'Rice Yield (kg/ha)'
y = df[target_column]
X = df.drop(columns=[target_column])


In [None]:
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Two-step pipeline: standardise the features, then fit a KNN regressor.
# The step names are referenced by the hyperparameter grid ('knn__...').
knn_steps = [
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
]
pipe = Pipeline(steps=knn_steps)

In [None]:
# Hyperparameter grid for the 'knn' pipeline step:
# neighbour counts 1..29 combined with both weighting schemes.
grid = dict(
    knn__n_neighbors=range(1, 30),
    knn__weights=['uniform', 'distance'],
)

In [None]:
# Exhaustive search over `grid` with 5-fold cross-validation on the training split.
grid_cv = GridSearchCV(estimator=pipe, param_grid=grid, cv=5)
grid_cv.fit(X_train, y_train)


In [None]:
# Report the hyperparameter combination selected by the KNN grid search.
knn_best_params = grid_cv.best_params_
print("Best Parameters: ", knn_best_params)


In [None]:
# Predict the held-out test rows through the fitted grid search object.
y_pred = grid_cv.predict(X=X_test)

In [None]:
# Root-mean-squared error of the KNN predictions on the test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE: ", rmse)

In [None]:
# Store the test-set predictions in `data`, matching rows by the index of X_test.
held_out_rows = X_test.index
data.loc[held_out_rows, 'Predicted Yield'] = y_pred

print(data)

In [None]:
# Re-run the grid search on the full dataset, then predict those same rows.
# NOTE(review): these are in-sample predictions (the model has seen every row
# it predicts) and this refit replaces the search fitted on the training split.
y_pred_all = grid_cv.fit(X, y).predict(X)

# Overwrite the 'Predicted Yield' column with the full-dataset predictions.
data['Predicted Yield'] = y_pred_all

print(data)

In [None]:
# Count the missing values per column of `data` after adding the predictions.
print(data.isna().sum())

In [None]:
# Fill the gaps in the 'ndvi' column of `data` with a KNN imputer fitted on
# that single column only.
imputer = KNNImputer(n_neighbors=5)
ndvi_frame = data[['ndvi']]
data['ndvi'] = imputer.fit(ndvi_frame).transform(ndvi_frame)


In [None]:
# Re-check the per-column missing-value counts after imputing 'ndvi'.
print(data.isna().sum())

# **Extra Trees model - GridSearch**

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Candidate hyperparameter values for the tree-ensemble grid search
# (3 * 4 * 3 * 3 = 108 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [None]:
# Initialize the Extra Trees regression model.
# The estimator is randomised, so fix the seed (42, the same value used for the
# train/test split) to make the grid-search results repeatable between runs.
et_model = ExtraTreesRegressor(random_state=42)

In [None]:
# 5-fold cross-validated search over `param_grid` for the Extra Trees model,
# fitted on the training split.
grid_search = GridSearchCV(et_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the refitted winning estimator and the hyperparameters that produced it.
best_model, best_params = grid_search.best_estimator_, grid_search.best_params_

In [None]:
# Report the hyperparameter combination selected by the grid search above.
print("Best Parameters: ", best_params)
# Presumably the output of an earlier run, kept for reference (TODO confirm it is still current):
# Best Parameters:  {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

In [None]:
# Predict on validation data with the best model.
# The notebook only creates X_train/X_test, so there is no separate validation
# split. Use out-of-fold predictions on the training set instead: every training
# row is predicted by a clone of best_model fitted on the other folds, so the
# model never scores a row it was trained on. best_model itself is not refitted.
# NOTE(review): the hyperparameters were also selected on this training set, so
# these validation scores may still be somewhat optimistic; the test set remains
# the unbiased estimate.
from sklearn.model_selection import cross_val_predict

X_val, y_val = X_train, y_train
y_pred_et_val = cross_val_predict(best_model, X_val, y_val, cv=5)

In [None]:
import matplotlib.pyplot as plt

# The test-set predictions are needed for the plot, so compute them in this
# cell rather than relying on a cell that runs later in the notebook.
y_pred_et_test = best_model.predict(X_test)

fig, ax = plt.subplots(figsize=(10, 6))

# Predicted and actual test-set yields, both drawn against the row position.
ax.plot(range(len(y_pred_et_test)), y_pred_et_test, label='Predicted', color='purple', linewidth=2)
ax.plot(range(len(y_test)), y_test, label='Actual', color='green', linewidth=2)

# Labels, title and legend
ax.set_xlabel('Data Points', fontsize=12)
ax.set_ylabel('Rice Yield (kg/ha)', fontsize=12)
ax.set_title('Actual vs. Predicted Values (Extra Trees)', fontsize=14)
ax.legend(fontsize=12)

# Light dashed gridlines; hide the top and right frame lines
ax.grid(True, linestyle='--', alpha=0.5)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this cell repeats the Extra Trees actual-vs-predicted plot from
# the cell directly above; consider removing one of the two.
# Compute the test-set predictions here so the cell does not depend on a
# variable that is only assigned further down the notebook.
y_pred_et_test = best_model.predict(X_test)

plt.figure(figsize=(10, 6))

# Plotting the predicted values
plt.plot(range(len(y_pred_et_test)), y_pred_et_test, label='Predicted', color='purple', linewidth=2)

# Plotting the actual values
plt.plot(range(len(y_test)), y_test, label='Actual', color='green', linewidth=2)

# Customizing the plot
plt.xlabel('Data Points', fontsize=12)
plt.ylabel('Rice Yield (kg/ha)', fontsize=12)
plt.title('Actual vs. Predicted Values (Extra Trees)', fontsize=14)
plt.legend(fontsize=12)

# Adding gridlines
plt.grid(True, linestyle='--', alpha=0.5)

# Removing top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Show the plot
plt.tight_layout()
plt.show()


# **Metrics**

In [None]:
# ---------------------EXTRA_TREES----------------------
print('---------------------EXTRA_TREES----------------------')


def _rmse_and_r2(actual, predicted):
    """Return the pair (RMSE, R^2) for one set of predictions."""
    return np.sqrt(mean_squared_error(actual, predicted)), r2_score(actual, predicted)


# Validation scores. This cell does not create y_val or y_pred_et_val;
# both must already exist in the kernel.
rmse_val_et, r2_val_et = _rmse_and_r2(y_val, y_pred_et_val)
print("Validation RMSE (Extra Trees): ", rmse_val_et)
print("Validation R^2 (Extra Trees): ", r2_val_et)

# Test scores, using predictions from the best model on the held-out split.
y_pred_et_test = best_model.predict(X_test)
rmse_test_et, r2_test_et = _rmse_and_r2(y_test, y_pred_et_test)
print("Test RMSE (Extra Trees): ", rmse_test_et)
print("Test R^2 (Extra Trees): ", r2_test_et)

In [None]:
import matplotlib.pyplot as plt

# Overlay the actual test-set yields and the Extra Trees predictions so the
# figure shows both series named in its "Actual vs. Predicted" title.
plt.figure(figsize=(10, 6))
plt.plot(range(len(y_test)), y_test, label='Actual')
plt.plot(range(len(y_pred_et_test)), y_pred_et_test, label='Extra Trees')
plt.xlabel('Data Points')
plt.ylabel('Rice Yield (kg/ha)')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_predict

import matplotlib.pyplot as plt

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest regression model.
# The seed (same value as the train/test split) makes the search repeatable.
rf_model = RandomForestRegressor(random_state=42)

# Perform grid search with 5-fold cross-validation on the training split.
# NOTE(review): grid_search, best_model and best_params are reused from the
# Extra Trees section, so from here on they refer to the Random Forest.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and best parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the best parameters
print("Best Parameters: ", best_params)
# NOTE(review): the recorded output below is identical to the one noted under the
# Extra Trees search; confirm it really comes from a Random Forest run.
# Best Parameters:  {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}

# Validation predictions: the notebook has no separate validation split, so use
# out-of-fold predictions on the training set (each row is predicted by a clone
# of best_model fitted on the other folds; best_model itself is not refitted).
y_pred_rf_val = cross_val_predict(best_model, X_train, y_train, cv=5)

# Test-set predictions, needed by the plot below.
y_pred_rf_test = best_model.predict(X_test)

plt.figure(figsize=(10, 6))

# Plotting the predicted values
plt.plot(range(len(y_pred_rf_test)), y_pred_rf_test, label='Predicted', color='purple', linewidth=2)

# Plotting the actual values
plt.plot(range(len(y_test)), y_test, label='Actual', color='green', linewidth=2)

# Customizing the plot
plt.xlabel('Data Points', fontsize=12)
plt.ylabel('Rice Yield (kg/ha)', fontsize=12)
plt.title('Actual vs. Predicted Values (Random Forest)', fontsize=14)
plt.legend(fontsize=12)

# Adding gridlines
plt.grid(True, linestyle='--', alpha=0.5)

# Removing top and right spines
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# Show the plot
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Score both models on the held-out test set (R^2) instead of plotting
# hand-typed placeholder numbers.
# y_pred_et_test holds the Extra Trees test predictions. best_model refers to
# the Random Forest estimator at this point, because the Random Forest grid
# search reassigns it after the Extra Trees evaluation.
et_accuracy = r2_score(y_test, y_pred_et_test)
rf_accuracy = r2_score(y_test, best_model.predict(X_test))

# Bar plot comparing the two scores
plt.figure(figsize=(6, 5))
plt.bar(['Extra Trees', 'Random Forest'], [et_accuracy, rf_accuracy], color=['blue', 'green'])

# Adding labels and title; the y-axis names the metric actually plotted
plt.xlabel('Models', fontsize=12)
plt.ylabel('Test R^2', fontsize=12)
plt.title('Comparison of Model Accuracies', fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()
