In [90]:
import pandas as pd
import pickle
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split as split
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

In [75]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [9]:
# Load the harvest records from the CSV in the working directory; the bare
# `df` on the last line renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer="harvest_prediction_dataset_nigeria.csv")
df

Unnamed: 0,farmer_id,crop_type,planting_date,harvesting_date,duration_days,state,season
0,FARM0001,Cassava,2023-05-21,2024-02-18,273,Gombe,Dry
1,FARM0002,Tomato,2023-02-22,2023-05-16,83,Katsina,Rainy
2,FARM0003,Rice,2023-02-17,2023-05-29,101,Ekiti,Dry
3,FARM0004,Carrot,2023-01-14,2023-04-03,79,Edo,Rainy
4,FARM0005,Sorghum,2023-10-29,2024-03-01,124,Jigawa,Dry
...,...,...,...,...,...,...,...
822,FARM0823,Okra,2023-12-04,2024-02-06,64,Benue,Rainy
823,FARM0824,Maize,2023-01-25,2023-05-11,106,Sokoto,Rainy
824,FARM0825,Carrot,2023-12-21,2024-03-02,72,Ebonyi,Dry
825,FARM0826,Cassava,2023-12-03,2024-10-17,319,Kwara,Rainy


In [11]:
# List the distinct crop labels present in the data.
df.loc[:, "crop_type"].unique()

array(['Cassava', 'Tomato', 'Rice', 'Carrot', 'Sorghum', 'Pepper',
       'Cowpea', 'Yam', 'Okra', 'Groundnut', 'Onion', 'Millet',
       'Cucumber', 'Sweet Potato', 'Cocoa', 'Maize', 'Oil Palm',
       'Soybean'], dtype=object)

In [152]:
# Characters 5-6 of the planting_date string are kept as the planting month
# and coerced to str so the column is treated as a categorical label.
df["planting_month"] = df["planting_date"].str.slice(5, 7).map(str)

In [23]:
# Inspect the raw planting dates. The column is named `planting_date`;
# the previous `df.planting_dat` raised AttributeError on a fresh run.
df["planting_date"]

0      2023-05-21
1      2023-02-22
2      2023-02-17
3      2023-01-14
4      2023-10-29
          ...    
822    2023-12-04
823    2023-01-25
824    2023-12-21
825    2023-12-03
826    2023-12-21
Name: planting_date, Length: 827, dtype: object

In [154]:
# Drop the farmer identifier and both raw date columns before encoding.
df_1 = df.drop(
    ["farmer_id", "harvesting_date", "planting_date"],
    axis="columns",
)

In [156]:
# One-hot encode the non-numeric columns of df_1. `dtype=int` makes
# get_dummies emit 0/1 integers directly, which replaces the two whole-frame
# replace(True -> 1) / replace(False -> 0) passes and avoids relying on
# pandas' deprecated silent downcasting inside `replace`.
df_2 = pd.get_dummies(df_1, dtype=int)
df_2

Unnamed: 0,duration_days,crop_type_Carrot,crop_type_Cassava,crop_type_Cocoa,crop_type_Cowpea,crop_type_Cucumber,crop_type_Groundnut,crop_type_Maize,crop_type_Millet,crop_type_Oil Palm,...,planting_month_03,planting_month_04,planting_month_05,planting_month_06,planting_month_07,planting_month_08,planting_month_09,planting_month_10,planting_month_11,planting_month_12
0,273,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,83,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,101,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,79,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,124,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,64,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
823,106,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
824,72,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
825,319,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [158]:
# Remember the encoded column layout; this list is pickled further down.
# NOTE(review): taken from df_2, so it still contains the target column
# "duration_days" alongside the feature columns - confirm the consumer of
# the pickle expects that.
saved_columns = list(df_2.columns)

In [160]:
# Target: duration_days. Features: every other encoded column.
y = df_2["duration_days"]
X = df_2.drop(columns=["duration_days"])

In [162]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [164]:
# Plain linear regression fitted on the training split.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

In [166]:
# Score of the linear model on the held-out split.
lr.score(X=x_test, y=y_test)

0.950331265170593

In [168]:
# Base learners for the stacked ensemble. Every stochastic estimator is seeded
# so that re-running the notebook reproduces the same scores and the same
# pickled model; AdaBoost previously had no seed. 42 matches the seed already
# used for the random forest and the train/test split.
base_learners = [
    ("LR", LinearRegression()),
    ("RFR", RandomForestRegressor(n_estimators=50, max_depth=None, random_state=42)),
    ("Ridge", Ridge(max_iter=50)),
    ("Ada", AdaBoostRegressor(n_estimators=50, learning_rate=0.1, random_state=42)),
    ("KNN", KNeighborsRegressor(n_neighbors=5)),
]

# A linear regression combines the base learners' predictions.
stacking_model = StackingRegressor(
    estimators=base_learners,
    final_estimator=LinearRegression(),
)
stacking_model.fit(x_train, y_train)

In [170]:
# Score of the stacked model on the held-out split.
stacking_model.score(X=x_test, y=y_test)

0.9459461242028392

In [172]:
# Score of the stacked model on the training split, for comparison with the
# held-out score.
stacking_model.score(X=x_train, y=y_train)

0.9332077606049652

In [174]:
# Spot-check predictions for training rows at positions 5 through 9.
stacking_model.predict(X=x_train.iloc[5:10])

array([ 53.62931907, 110.32092248,  67.84288157,  67.64941294,
       306.32532312])

In [176]:
# Show the feature rows at positions 5 through 9 of the training split.
x_train.iloc[5:10, :]

Unnamed: 0,crop_type_Carrot,crop_type_Cassava,crop_type_Cocoa,crop_type_Cowpea,crop_type_Cucumber,crop_type_Groundnut,crop_type_Maize,crop_type_Millet,crop_type_Oil Palm,crop_type_Okra,...,planting_month_03,planting_month_04,planting_month_05,planting_month_06,planting_month_07,planting_month_08,planting_month_09,planting_month_10,planting_month_11,planting_month_12
132,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
462,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
730,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
442,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
41,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [179]:
# Refit the stacked model on the full dataset (train and test rows together).
# The scores computed before this call describe the previous fit on x_train.
stacking_model.fit(X=X, y=y)

In [181]:
# Persist the recorded column layout next to the notebook.
with open("harvest_model_columns.pkl", mode="wb") as columns_file:
    pickle.dump(saved_columns, columns_file)

In [183]:
# Persist the fitted stacking model next to the notebook.
with open("harvest_day_model.pkl", mode="wb") as model_file:
    pickle.dump(stacking_model, model_file)