In [69]:
import pandas as pd
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Yield_Prediction_Data.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Dist Name,Area,Production,Yield,Crop
0,0,Hissar,12.3,19.0,1544.72,RICE
1,1,Hissar,14.0,21.0,1500.0,RICE
2,2,Hissar,13.0,21.0,1615.38,RICE
3,3,Hissar,16.2,30.0,1851.85,RICE
4,4,Hissar,19.7,32.0,1624.37,RICE


In [4]:
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [5]:
X = df.drop('Yield', axis = 1)
y = df['Yield']

In [6]:
encoders = {}
for col in ['Dist Name', 'Crop']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

encoders_dist = dict(zip(encoders['Dist Name'].classes_, range(len(encoders['Dist Name'].classes_))))
encoders_crop = dict(zip(encoders['Crop'].classes_, range(len(encoders['Crop'].classes_))))

print(encoders_dist)
print(encoders_crop)

{'Ambala': 0, 'Gurgaon': 1, 'Hissar': 2, 'Jind': 3, 'Karnal': 4, 'Mahendragarh / Narnaul': 5, 'Rohtak': 6}
{'BARLEY': 0, 'CHICKPEA': 1, 'COTTON': 2, 'FRUITS': 3, 'GROUNDNUT': 4, 'KHARIF': 5, 'MAIZE': 6, 'MINOR': 7, 'OILSEEDS': 8, 'PEARL': 9, 'PIGEONPEA': 10, 'POTATOES': 11, 'RAPESEED': 12, 'RICE': 13, 'SESAMUM': 14, 'SORGHUM': 15, 'SUGARCANE': 16, 'WHEAT': 17}


In [7]:
sc = StandardScaler()

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
X_train

Unnamed: 0,Dist Name,Area,Production,Crop
1553,1,68.50,139.00,9
4338,6,0.40,0.20,14
107,3,10.70,12.20,13
29,2,82.07,250.00,13
2879,6,72.00,67.00,1
...,...,...,...,...
3772,3,0.00,0.00,4
5191,1,3.10,16.20,16
5226,3,10.16,46.30,16
5390,4,34.17,221.78,16


In [10]:
X_test

Unnamed: 0,Dist Name,Area,Production,Crop
1183,1,21.90,6.00,15
756,2,3.87,1.42,5
3880,0,0.00,0.00,4
763,2,2.06,0.71,5
5893,1,0.72,4.48,3
...,...,...,...,...
3317,2,19.38,7.64,7
3235,6,0.00,0.00,10
1945,3,1.00,1.00,6
3749,3,0.00,0.00,4


In [31]:
X_train_scaled = sc.fit_transform(X_train)
X_test_transform = sc.transform(X_test)

In [33]:
X_train

Unnamed: 0,Dist Name,Area,Production,Crop
1553,1,68.50,139.00,9
4338,6,0.40,0.20,14
107,3,10.70,12.20,13
29,2,82.07,250.00,13
2879,6,72.00,67.00,1
...,...,...,...,...
3772,3,0.00,0.00,4
5191,1,3.10,16.20,16
5226,3,10.16,46.30,16
5390,4,34.17,221.78,16


In [35]:
X_train_scaled

array([[-0.99240387,  0.24064878,  0.13927713,  0.09868765],
       [ 1.50097517, -0.44891625, -0.29469987,  1.06437473],
       [ 0.00494775, -0.3446208 , -0.25718024,  0.87123732],
       ...,
       [ 0.00494775, -0.35008872, -0.15056197,  1.45064956],
       [ 0.50362356, -0.10696895,  0.39810001,  1.45064956],
       [ 0.00494775, -0.44415714, -0.29219856, -0.67386201]])

In [37]:
X_test_transform

array([[-0.99240387, -0.23121216, -0.27656538,  1.25751215],
       [-0.49372806, -0.41377982, -0.29088537, -0.67386201],
       [-1.49107968, -0.45296656, -0.2953252 , -0.86699942],
       ...,
       [ 0.00494775, -0.44284079, -0.29219856, -0.48072459],
       [ 0.00494775, -0.45296656, -0.2953252 , -0.86699942],
       [-0.99240387, -0.45296656, -0.2943872 , -0.86699942]])

In [139]:
pipeline = Pipeline([
    ('regressor', GradientBoostingRegressor())  # Placeholder
])

# Define hyperparameter grid
param_grid = [
    {
        'regressor': [RandomForestRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20]
    },
    {
        'regressor': [GradientBoostingRegressor()],
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 10]
    },
    {
        'regressor': [LinearRegression()],
        'regressor__fit_intercept': [True, False],
        'regressor__positive': [True, False]
    },
    {
        'regressor': [Ridge()],
        'regressor__alpha': [0.1, 1.0, 10.0],
        'regressor__fit_intercept': [True, False]
    },
    {
        'regressor': [Lasso()],
        'regressor__alpha': [0.01, 0.1, 1.0],
        'regressor__fit_intercept': [True, False]
    },
    {
        'regressor': [SVR()],
        'regressor__kernel': ['rbf', 'linear'],
        'regressor__C': [0.1, 1, 10],
        'regressor__epsilon': [0.01, 0.1, 0.2]
    },
    {
        'regressor': [KNeighborsRegressor()],
        'regressor__n_neighbors': [3, 5, 7],
        'regressor__weights': ['uniform', 'distance']
    }
]

# Perform GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error', verbose=1)
grid_search.fit(X_train_scaled, y_train)

# Best model and parameters
print("Best Model:", grid_search.best_estimator_)
print("Best Parameters:", grid_search.best_params_)
print("Best R^2 Score:", grid_search.best_score_)

Fitting 5 folds for each of 76 candidates, totalling 380 fits
Best Model: Pipeline(steps=[('regressor',
                 GradientBoostingRegressor(learning_rate=0.2, max_depth=5,
                                           n_estimators=200))])
Best Parameters: {'regressor': GradientBoostingRegressor(), 'regressor__learning_rate': 0.2, 'regressor__max_depth': 5, 'regressor__n_estimators': 200}
Best R^2 Score: -266.7388220810925


In [85]:
best_model = GradientBoostingRegressor(learning_rate=0.2, max_depth=5, n_estimators=200)

In [87]:
best_model.fit(X_train_scaled, y_train)

In [129]:
y_pred = best_model.predict(X_test_transform)
mse(y_test, y_pred)

73592.982559079

In [119]:
def user_input(dist_name, crop, input_area, input_production):
    input_dist = encoders_dist[dist_name]
    input_crop = encoders_crop[crop]
    input_df = pd.DataFrame([{
        'Dist Name': input_dist,
        'Crop': input_crop,
        'Area': input_area,
        'Production': input_production
    }]
    )
    input_df = sc.transform(input_df)
    y_pred = best_model.predict(input_df)
    return y_pred[0]

In [121]:
predicted_yield = user_input('Ambala', 'BARLEY', 1.76038, 1.88076)

In [123]:
predicted_yield

2019.772702047398