In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot") 

In [None]:
# Load the crop-yield dataset; the path is relative to the current working directory.
df = pd.read_csv("yield_df.csv")

In [None]:
# Preview the first rows of the frame as loaded.
df.head()

## Data Preprocessing

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding `df` with errors='ignore' instead of inplace=True keeps this cell
# safe to re-run: a second execution no longer raises KeyError because the
# column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Confirm the 'Unnamed: 0' column is no longer present.
df.head()

In [None]:
# Inspect the last rows as well.
df.tail()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row (original index labels are kept).
df = df.drop_duplicates()

In [None]:
# Re-check after de-duplication; this should now be 0.
df.duplicated().sum()

In [None]:
# (rows, columns) remaining after de-duplication.
df.shape

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Pairwise correlation between numeric columns; non-numeric columns are skipped.
df.corr(numeric_only=True)


## Data Visualization

In [None]:
# Number of distinct Area values (a missing value counts as its own entry).
df['Area'].nunique(dropna=False)

In [None]:
# Number of distinct Item values (a missing value counts as its own entry).
df['Item'].nunique(dropna=False)

In [None]:
# Row count per Area, one horizontal bar per distinct value.
fig, ax = plt.subplots(figsize=(15, 20))
sns.countplot(data=df, y="Area", ax=ax)
plt.show()

In [None]:
# Row count per Item, one horizontal bar per distinct value.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df, y="Item", ax=ax)
plt.show()

In [None]:
# How many Area values appear in fewer than 400 rows.
df['Area'].value_counts().lt(400).sum()

In [None]:
# Total hg/ha_yield per Area, computed in one grouped pass instead of filtering
# the whole frame once per country. sort=False keeps the groups in
# first-appearance order, the same order df['Area'].unique() produces, so
# `country` and `yield_per_country` stay aligned element by element.
# NOTE: groupby skips rows whose Area is missing; the previous loop would have
# listed a NaN entry with a total of 0 instead.
_yield_by_country = df.groupby('Area', sort=False)['hg/ha_yield'].sum()
country = _yield_by_country.index.to_numpy()
yield_per_country = _yield_by_country.tolist()

In [None]:
# Grand total of hg/ha_yield over all rows.
df['hg/ha_yield'].sum()

In [None]:
# Per-Area totals, in the same order as `country`.
yield_per_country

In [None]:
# Horizontal bars: one per country, bar length = summed hg/ha_yield.
fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=yield_per_country, y=country, ax=ax)
plt.show()

In [None]:
# Total hg/ha_yield per Item, computed in one grouped pass instead of filtering
# the whole frame once per crop. sort=False keeps the groups in
# first-appearance order, the same order df['Item'].unique() produces, so
# `crops` and `yield_per_crop` stay aligned element by element.
# NOTE: groupby skips rows whose Item is missing; the previous loop would have
# listed a NaN entry with a total of 0 instead.
_yield_by_crop = df.groupby('Item', sort=False)['hg/ha_yield'].sum()
crops = _yield_by_crop.index.to_numpy()
yield_per_crop = _yield_by_crop.tolist()

In [None]:
# Horizontal bars: one per crop, bar length = summed hg/ha_yield.
fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x=yield_per_crop, y=crops, ax=ax)
plt.show()

In [None]:
# Current column names, before the frame is reordered.
df.columns

In [None]:
# Column order used from here on: the six predictors first, the target
# ('hg/ha_yield') last, so col[:-1] is exactly the list of predictors.
col = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
    'Area',
    'Item',
    'hg/ha_yield',
]

In [None]:
# Keep only the columns listed in `col`, in that order (target last).
df = df[col]

In [None]:
# Check the new column order.
df.head()

In [None]:
# Separate the target column from the predictors.
target_column = 'hg/ha_yield'
y = df[target_column]
x = df.drop(columns=target_column)

In [None]:
# (rows, predictor columns).
x.shape

In [None]:
# Target length; should match the number of rows in x.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffled
# split the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

## One-Hot Encoding


In [None]:
# Reminder of the column layout before building the encoder.
df.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Column groups handed to the ColumnTransformer.
categorical_features = ['Area', 'Item']
numeric_features = [
    'Year',
    'average_rain_fall_mm_per_year',
    'pesticides_tonnes',
    'avg_temp',
]

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded. With handle_unknown='ignore', a category that was not seen during
# fit is encoded as all zeros instead of raising an error.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# NOTE(review): this pipeline is never fitted in the notebook; the name `model`
# is rebound to a DecisionTreeRegressor later on.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state=42)),
    ]
)


In [None]:
# Show the (still unfitted) transformer.
preprocessor

In [None]:
# Learn the encoding (the one-hot category lists) from the training split only,
# then apply that same fitted transformer to the test split. Re-fitting on
# x_test would rebuild the category lists from the test rows, so the encoded
# columns could stop lining up with what the models are trained on, and every
# later preprocessor.transform() call would use the test-fitted encoder.
x_train_d = preprocessor.fit_transform(x_train)
x_test_d = preprocessor.transform(x_test)

In [None]:
# Names of the encoded output columns; the input names are taken from the
# DataFrame the transformer was fitted on.
preprocessor.get_feature_names_out()

## Model Training


In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# Candidate regressors, each fitted on the encoded training split and scored on
# the encoded test split. The decision tree gets a fixed seed: scikit-learn
# permutes the features at every split, so an unseeded tree can score
# differently from one run to the next.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'KNN': KNeighborsRegressor(),
}

for name, md in models.items():
    md.fit(x_train_d, y_train)
    y_pred = md.predict(x_test_d)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}:Mean Absolute Error: {mae} Score: {r2}")

## Model Selection


In [None]:
# Fit the decision tree that the prediction and pickling cells use. The fixed
# random_state makes the fitted tree, its predictions and the saved model
# reproducible across runs.
# Note: this rebinds `model`, replacing the Pipeline defined earlier.
model = DecisionTreeRegressor(random_state=0)
model.fit(x_train_d, y_train)
model.predict(x_test_d)

## Prediction


In [None]:
# Column order that prediction() must follow when building its input row.
df.columns

In [None]:
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item):
    """Predict hg/ha_yield for a single observation.

    Uses the notebook-level `preprocessor` (the fitted ColumnTransformer) and
    `model` (the fitted regressor).

    Parameters
    ----------
    Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp
        Numeric features; the preprocessor passes them through unchanged.
    Area, Item
        Categorical features (country and crop name); one-hot encoded by the
        preprocessor.

    Returns
    -------
    The model's prediction for this one row, as a scalar.

    Warns
    -----
    UserWarning
        If `Area` or `Item` is not a category the encoder was fitted on. The
        encoder uses handle_unknown='ignore', so such a value is encoded as all
        zeros and the returned number does not reflect it.
    """
    import warnings

    features = pd.DataFrame(
        [[Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item]],
        columns=['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp', 'Area', 'Item'],
    )

    # categories_ holds one array per encoded column, in the order the columns
    # were given to the 'cat' transformer: Area first, then Item.
    encoder = preprocessor.named_transformers_['cat']
    for column, known in zip(['Area', 'Item'], encoder.categories_):
        value = features.at[0, column]
        if value not in set(known):
            warnings.warn(
                f"{column}={value!r} was not seen when the encoder was fitted; "
                "it is encoded as all zeros, so the prediction ignores it.",
                UserWarning,
                stacklevel=2,
            )

    transform_features = preprocessor.transform(features)
    predicted_yield = model.predict(transform_features).reshape(-1, 1)
    return predicted_yield[0][0]


In [None]:
# First ten rows, used as reference inputs for the prediction calls below.
df.head(10)

In [None]:
# The item string previously began with a stray tab character, which made it an
# unknown category: the encoder (handle_unknown='ignore') encoded it as all
# zeros, so the prediction was not actually for Sorghum.
result = prediction(1990, 1485.0, 121.0, 16.37, 'Albania', 'Sorghum')
result

In [None]:
# Albania / Maize, 1991.
result = prediction(
    Year=1991,
    average_rain_fall_mm_per_year=1485.0,
    pesticides_tonnes=121.0,
    avg_temp=15.36,
    Area="Albania",
    Item="Maize",
)
result

In [None]:
# Albania / Potatoes, 1990.
result = prediction(
    Year=1990,
    average_rain_fall_mm_per_year=1485.0,
    pesticides_tonnes=121.00,
    avg_temp=16.37,
    Area="Albania",
    Item="Potatoes",
)
result

In [None]:
# 82865: presumably the recorded hg/ha_yield for this row, kept here to compare
# against the prediction (TODO confirm against the dataset).
result = prediction(1999, 1010.0, 40.0, 24.37, "Angola", "Sweet potatoes")
result

In [None]:
# Last six rows, used as reference inputs for the prediction calls below.
df.tail(6)

In [None]:
# Zimbabwe / Sorghum, 2013.
res = prediction(
    Year=2013,
    average_rain_fall_mm_per_year=657,
    pesticides_tonnes=2550,
    avg_temp=19.76,
    Area="Zimbabwe",
    Item="Sorghum",
)
res

In [None]:
# Zimbabwe / Potatoes, 2013.
res = prediction(
    Year=2013,
    average_rain_fall_mm_per_year=657.0,
    pesticides_tonnes=2550.07,
    avg_temp=19.76,
    Area="Zimbabwe",
    Item="Potatoes",
)
res

## Save the Model

In [None]:
import pickle
# Persist the fitted regressor and the preprocessor it expects as input.
# Context managers make sure each file is flushed and closed, even if pickling
# raises; the bare open() calls used before left both handles open.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
with open("preprocessor.pkl", "wb") as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)