In [126]:
import pandas as pd
import numpy as np
import pickle
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as split
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

In [65]:
import warnings
warnings.filterwarnings("ignore")

In [268]:
df = pd.read_csv("synthetic_harvest_risk_score_dataset.csv")
df = df.drop(columns = "farmer_id")
df

Unnamed: 0,crop_type,state,season,date,risk_score
0,Cassava,Oyo,Rainy,10/09/2022,0
1,Rice,Taraba,Rainy,04/15/2022,2
2,Cassava,Benue,Rainy,08/28/2023,1
3,Groundnut,Oyo,Rainy,04/06/2022,2
4,Rice,Taraba,Rainy,07/29/2023,2
...,...,...,...,...,...
698,Maize,Kano,Rainy,02/22/2022,2
699,Sorghum,Kaduna,Rainy,03/16/2022,3
700,Rice,Kano,Rainy,05/06/2023,0
701,Rice,Plateau,Dry,01/16/2022,7


In [270]:
df["month"] = df["date"].str.split("/").str[0]

In [272]:
df = df.drop(columns = "date")

0      10
1       4
2       8
3       4
4       7
       ..
698     2
699     3
700     5
701     1
702     5
Name: month, Length: 703, dtype: int64

In [288]:
df["month"] = df["month"].apply(lambda x: int(x))
df["month"] = df["month"].apply(lambda x: str(x))

In [290]:
df_1 = pd.get_dummies(df)
df_1 = df_1.replace(True, 1).replace(False, 0)
df_1

Unnamed: 0,risk_score,crop_type_Cassava,crop_type_Groundnut,crop_type_Maize,crop_type_Pepper,crop_type_Rice,crop_type_Sorghum,crop_type_Yam,state_Benue,state_Kaduna,...,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,2,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,2,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
699,3,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
700,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
701,7,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [315]:
saved_columns = df_1.columns.tolist()

In [292]:
X = df_1.drop(columns = "risk_score")
y = df_1.risk_score

In [294]:
x_train, x_test, y_train, y_test = split(X, y, test_size = 0.3, random_state = 42)

In [282]:
x_train.shape

(492, 16)

In [296]:
lr = LinearRegression()
lr.fit(x_train, y_train)

In [298]:
lr.score(x_test, y_test)

0.824287770195353

In [300]:
RFR = RandomForestClassifier()

In [302]:
RFR.fit(x_train, y_train)

In [304]:
RFR.score(x_test, y_test)

0.3127962085308057

In [218]:
ada = AdaBoostClassifier()

In [220]:
ada.fit(x_train, y_train)

In [222]:
ada.score(x_test, y_test)

0.22748815165876776

In [242]:
pipeline = make_pipeline(PolynomialFeatures(degree= 5), LinearRegression())

In [244]:
pipeline.fit(x_train, y_train)

In [246]:
pipeline.score(x_test, y_test)

0.7050338842362458

In [122]:
degrees = [1, 2, 3, 4, 5, 7, 10, 15]

In [248]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error

In [250]:
alphas = [0.001, 0.01, 0.1, 1, 10, 100]

for a in alphas:
    ridge = Ridge(alpha=a)
    ridge.fit(x_train, y_train)
    y_pred = ridge.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Ridge alpha={a},{ridge.score(x_test, y_test)}")

Ridge alpha=0.001,0.8242864318215692
Ridge alpha=0.01,0.8242743776006747
Ridge alpha=0.1,0.8241529627460762
Ridge alpha=1,0.8228555690353344
Ridge alpha=10,0.8044322032265482
Ridge alpha=100,0.6044638894209884


In [252]:
for a in alphas:
    lasso = Lasso(alpha=a, max_iter=10000)
    lasso.fit(x_train, y_train)
    y_pred = lasso.predict(x_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Lasso alpha={a}, {lasso.score(x_test, y_test)}")


Lasso alpha=0.001, 0.8236851964559573
Lasso alpha=0.01, 0.8145744207246342
Lasso alpha=0.1, 0.6155872653644128
Lasso alpha=1, -0.00041155000002479625
Lasso alpha=10, -0.00041155000002479625
Lasso alpha=100, -0.00041155000002479625


In [254]:
df["state_label"] = LabelEncoder().fit_transform(df["state"])
df["season_label"] = LabelEncoder().fit_transform(df["season"])
dummy = pd.get_dummies(df["crop_type"])
dummy = dummy.replace(True, 1).replace(False, 0)
df_3 = df.join(dummy)

In [256]:
df_3 = df_3.drop(columns = ["crop_type", "state", "season"])
df_3

Unnamed: 0,risk_score,month,state_label,season_label,Cassava,Groundnut,Maize,Pepper,Rice,Sorghum,Yam
0,0,10,3,1,1,0,0,0,0,0,0
1,2,04,5,1,0,0,0,0,1,0,0
2,1,08,0,1,1,0,0,0,0,0,0
3,2,04,3,1,0,1,0,0,0,0,0
4,2,07,5,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
698,2,02,2,1,0,0,1,0,0,0,0
699,3,03,1,1,0,0,0,0,0,1,0
700,0,05,2,1,0,0,0,0,1,0,0
701,7,01,4,0,0,0,0,0,1,0,0


In [258]:
u = df_3.drop(columns = "risk_score")
v = df_3.risk_score

In [260]:
u_train, u_test, v_train, v_test = split(u, v, test_size = 0.3, random_state = 42)

In [262]:
lr = LinearRegression()
lr.fit(u_train, v_train)

In [264]:
lr.score(u_test, v_test)

0.7663561859843167

In [306]:
model = Ridge(alpha= 0.01)

In [308]:
model.fit(X, y)

In [312]:
model.score(x_train, y_train)

0.7792442893920172

In [327]:
model.predict(x_train.iloc[100:125])

array([2.37711423, 4.2114719 , 2.89141202, 3.12069006, 6.96180983,
       3.06890086, 0.18412531, 4.2925023 , 4.16416829, 1.90983529,
       6.13172449, 3.91424451, 7.0261575 , 3.20831127, 5.10968734,
       3.88230329, 0.26746589, 5.97722258, 1.36819209, 2.15962775,
       2.10059547, 5.82027772, 3.3122488 , 2.21276153, 4.00159567])

In [329]:
with open("risk_score_columns.pkl", "wb") as f:
    pickle.dump(saved_columns, f)

In [332]:
with open("risk_score_model.pkl", "wb") as f:
    pickle.dump(model, f)