## This creates a dataframe using which we can build a model for yield prediction, this is a haryana based model with all the districts coming from haryana

In [15]:
import pandas as pd

In [17]:
yield_data = pd.read_csv('Yield Prediction Data.csv')

In [18]:
yield_data.head()

Unnamed: 0,Dist Name,Crop,Area,Production,Yield
0,Ambala,BARLEY,1.760385,1.880769,678.461154
1,Ambala,CHICKPEA,14.906731,10.798077,719.603462
2,Ambala,COTTON,1.41,0.260962,97.376731
3,Ambala,FRUITS,0.978269,7.292692,8.270962
4,Ambala,GROUNDNUT,3.529231,3.275385,716.369231


In [19]:
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, r2_score
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
X = yield_data.drop('Yield', axis = 1)
y = yield_data['Yield']

In [21]:
encoders = {}
for col in ['Dist Name', 'Crop']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

encoders_dist = dict(zip(encoders['Dist Name'].classes_, range(len(encoders['Dist Name'].classes_))))
encoders_crop = dict(zip(encoders['Crop'].classes_, range(len(encoders['Crop'].classes_))))

print(encoders_dist)
print(encoders_crop)

{'Ambala': 0, 'Gurgaon': 1, 'Hissar': 2, 'Jind': 3, 'Karnal': 4, 'Mahendragarh / Narnaul': 5, 'Rohtak': 6}
{'BARLEY': 0, 'CHICKPEA': 1, 'COTTON': 2, 'FRUITS': 3, 'GROUNDNUT': 4, 'KHARIF': 5, 'MAIZE': 6, 'MINOR': 7, 'OILSEEDS': 8, 'PEARL': 9, 'PIGEONPEA': 10, 'POTATOES': 11, 'RAPESEED': 12, 'RICE': 13, 'SESAMUM': 14, 'SORGHUM': 15, 'SUGARCANE': 16, 'WHEAT': 17}


In [22]:
X

Unnamed: 0,Dist Name,Crop,Area,Production
0,0,0,1.760385,1.880769
1,0,1,14.906731,10.798077
2,0,2,1.410000,0.260962
3,0,3,0.978269,7.292692
4,0,4,3.529231,3.275385
...,...,...,...,...
121,6,13,70.526923,144.948269
122,6,14,0.204231,0.055385
123,6,15,65.110192,17.164231
124,6,16,32.017885,151.055769


In [23]:
y

0       678.461154
1       719.603462
2        97.376731
3         8.270962
4       716.369231
          ...     
121    1830.817115
122     249.962308
123     288.270000
124    4839.843654
125    3160.012115
Name: Yield, Length: 126, dtype: float64

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [25]:
sc = StandardScaler()

In [26]:
sc.fit_transform(X_train)
sc.fit(X_test)

In [27]:
y_train

68      310.888654
12      831.091923
110     284.550769
124    4839.843654
27     1034.860385
          ...     
106    4189.777115
14      273.581923
92      315.296346
51      271.111923
102    1098.780962
Name: Yield, Length: 100, dtype: float64

In [28]:
y_test

73      565.125962
19      961.612115
116     846.183654
67     2178.710192
94      537.337308
77      189.724615
31     2181.710962
53     3352.926923
117    1066.200385
44      850.995385
4       716.369231
104     271.220385
36     2131.141538
97      580.739038
78     1270.757308
18     2226.972308
91      702.526346
10      845.619231
55      615.706731
11       32.788269
45      951.373654
26      887.083462
76      366.025577
0       678.461154
56      333.722500
40      963.124808
Name: Yield, dtype: float64

In [29]:
X_train

Unnamed: 0,Dist Name,Crop,Area,Production
68,3,14,0.402500,0.155769
12,0,12,5.830000,4.682692
110,6,2,13.602692,4.066154
124,6,16,32.017885,151.055769
27,1,9,85.193269,81.057692
...,...,...,...,...
106,5,16,0.848654,3.441346
14,0,14,0.321346,0.108077
92,5,2,14.741731,5.613462
51,2,15,8.944808,2.077500


In [30]:
X_test

Unnamed: 0,Dist Name,Crop,Area,Production
73,4,1,21.845192,19.228846
19,1,1,26.818269,22.967308
116,6,8,46.920577,51.733269
67,3,13,78.532308,192.869231
94,5,4,0.131538,0.105192
77,4,5,5.573846,1.458077
31,1,13,22.110385,57.157692
53,2,17,511.074423,1964.578846
117,6,9,80.059231,77.463077
44,2,8,167.941346,184.4525


#### importing all regression models and building a gridsearchcv to check which model is the best

In [32]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

In [33]:
pipeline = Pipeline([
    ('regressor', LinearRegression())  # Placeholder
])

# Define hyperparameter grid
param_grid = [
    {
        'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20]
    },
    {
        'regressor': [SVR()],
        'regressor__C': [0.1, 1, 10],
        'regressor__kernel': ['linear', 'rbf']
    },
    {
        'regressor': [LinearRegression()]
    },
    {
        'regressor': [GradientBoostingRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 10]
    }
]

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

# Best model and parameters
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best R^2 Score:", grid_search.best_score_)


Fitting 5 folds for each of 43 candidates, totalling 215 fits
Best Model: Pipeline(steps=[('regressor', GradientBoostingRegressor(n_estimators=200))])
Best Parameters: {'regressor': GradientBoostingRegressor(), 'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 200}
Best R^2 Score: 0.9068202328012805


#### Best regression model is GradientBoostingRegressor() with the learning_rate: 0.2, max_depth: 3, n_estimators: 19; r2_score: 0.799

In [35]:
best_model = GradientBoostingRegressor(learning_rate=0.2, max_depth=3, n_estimators=15)

In [36]:
best_model.fit(X_train, y_train)

In [37]:
y_pred = best_model.predict(X_test)

In [38]:
r2_score(y_test, y_pred)

0.7942782343771266

In [93]:
X_test

Unnamed: 0,Dist Name,Crop,Area,Production
73,4,1,21.845192,19.228846
19,1,1,26.818269,22.967308
116,6,8,46.920577,51.733269
67,3,13,78.532308,192.869231
94,5,4,0.131538,0.105192
77,4,5,5.573846,1.458077
31,1,13,22.110385,57.157692
53,2,17,511.074423,1964.578846
117,6,9,80.059231,77.463077
44,2,8,167.941346,184.4525


In [40]:
y_train

68      310.888654
12      831.091923
110     284.550769
124    4839.843654
27     1034.860385
          ...     
106    4189.777115
14      273.581923
92      315.296346
51      271.111923
102    1098.780962
Name: Yield, Length: 100, dtype: float64

In [91]:
y_test

73      565.125962
19      961.612115
116     846.183654
67     2178.710192
94      537.337308
77      189.724615
31     2181.710962
53     3352.926923
117    1066.200385
44      850.995385
4       716.369231
104     271.220385
36     2131.141538
97      580.739038
78     1270.757308
18     2226.972308
91      702.526346
10      845.619231
55      615.706731
11       32.788269
45      951.373654
26      887.083462
76      366.025577
0       678.461154
56      333.722500
40      963.124808
Name: Yield, dtype: float64

## creating a user input function 

In [85]:
def user_input(dist_name, crop, input_area, input_production):
    input_dist = encoders_dist[dist_name]
    input_crop = encoders_crop[crop]
    input_df = pd.DataFrame({
        'Dist Name': [input_dist],
        'Crop': [input_crop],
        'Area': [input_area],
        'Production': [input_production]
    }
    )
    #input_df = sc.transform(input_df)
    y_pred = best_model.predict(input_df)
    return y_pred[0]

In [43]:
print(encoders_dist)
print(encoders_crop)

{'Ambala': 0, 'Gurgaon': 1, 'Hissar': 2, 'Jind': 3, 'Karnal': 4, 'Mahendragarh / Narnaul': 5, 'Rohtak': 6}
{'BARLEY': 0, 'CHICKPEA': 1, 'COTTON': 2, 'FRUITS': 3, 'GROUNDNUT': 4, 'KHARIF': 5, 'MAIZE': 6, 'MINOR': 7, 'OILSEEDS': 8, 'PEARL': 9, 'PIGEONPEA': 10, 'POTATOES': 11, 'RAPESEED': 12, 'RICE': 13, 'SESAMUM': 14, 'SORGHUM': 15, 'SUGARCANE': 16, 'WHEAT': 17}


In [87]:
predicted_yield = user_input('Hissar', 'SESAMUM', 0.402500, 0.155769)

In [89]:
predicted_yield

318.2677303701476