In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")

: 

In [67]:
df = pd.read_csv("yield_df.csv")

In [None]:
#The `df.head()` method in pandas returns the first 5 rows of the DataFrame `df`.
df.head()

In [69]:
# Remove the 'Unnamed: 0' column (presumably an index that was written into the
# CSV -- confirm against the file). Rebinding instead of inplace=True, together
# with errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
df.head()

In [None]:
#displays a concise summary of the DataFrame, including the index dtype, column dtypes, non-null values, and memory usage.
df.info()

In [None]:
#Count of Null values in each column of DF 
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row
# (duplicated() flags each repeat, sum() counts the flags).
df.duplicated().sum()

In [74]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

In [None]:
#generates descriptive statistics.
df.describe()

In [None]:
# Pairwise Pearson correlation of the numeric columns only. The frame still
# contains the string columns 'Area' and 'Item'; since pandas 2.0 a bare
# df.corr() raises on non-numeric data instead of silently skipping it, so the
# numeric subset is selected explicitly (this form also works on older pandas).
df.select_dtypes(include='number').corr()

In [79]:
# Data Visualization

In [None]:
# Number of distinct values in 'Area' (missing values, if any, count as one value).
df['Area'].nunique(dropna=False)

In [None]:
# Number of distinct values in 'Item' (missing values, if any, count as one value).
df['Item'].nunique(dropna=False)

In [None]:
# Horizontal bar per area showing how many rows it contributes, on a 15x20 canvas.
fig, ax = plt.subplots(figsize=(15, 20))
sns.countplot(data=df, y='Area', ax=ax)
plt.show()

In [None]:
# Horizontal bar per crop item showing how many rows it contributes, on a 15x20 canvas.
fig, ax = plt.subplots(figsize=(15, 20))
sns.countplot(data=df, y='Item', ax=ax)
plt.show()

In [None]:
# How many distinct areas appear in fewer than 400 rows.
rows_per_area = df['Area'].value_counts()
rows_per_area.lt(400).sum()

In [85]:
# Total yield per area, computed in one groupby pass instead of re-scanning the
# whole frame with a boolean mask once per area.
# sort=False keeps groups in order of first appearance, which is the same order
# df['Area'].unique() produced, so `country` and `yield_per_country` stay aligned
# position by position for the plot that consumes them.
# NOTE(review): groupby skips rows whose 'Area' is missing; the null check earlier
# in the notebook should confirm there are none.
yield_by_area = df.groupby('Area', sort=False)['hg/ha_yield'].sum()
country = yield_by_area.index.to_numpy()
yield_per_country = yield_by_area.tolist()

In [None]:
# Grand total of the 'hg/ha_yield' column across every row of the frame.
df['hg/ha_yield'].sum()

In [None]:
# Summed 'hg/ha_yield' for each area, listed in the same order as `country`.
yield_per_country

In [None]:
# One horizontal bar per area, bar length = summed yield for that area.
fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=yield_per_country, y=country, ax=ax)
plt.show()

In [89]:
# Total yield per crop item, computed in one groupby pass instead of re-scanning
# the whole frame with a boolean mask once per crop.
# sort=False keeps groups in order of first appearance, which is the same order
# df['Item'].unique() produced, so `crops` and `yield_per_crop` stay aligned
# position by position for the plot that consumes them.
# NOTE(review): groupby skips rows whose 'Item' is missing; the null check earlier
# in the notebook should confirm there are none.
yield_by_item = df.groupby('Item', sort=False)['hg/ha_yield'].sum()
crops = yield_by_item.index.to_numpy()
yield_per_crop = yield_by_item.tolist()

In [None]:
# One horizontal bar per crop item, bar length = summed yield for that crop.
fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=yield_per_crop, y=crops, ax=ax)
plt.show()

In [None]:
df.head()

In [None]:
df.columns

In [93]:
# Column order used for modelling. The order is load-bearing: the preprocessing
# ColumnTransformer addresses features by position (0-3 scaled, 4-5 one-hot
# encoded), so the four numeric columns must come first, then 'Area' and 'Item'.
# The target 'hg/ha_yield' is last and is dropped before the features are built.
col = ['Year','average_rain_fall_mm_per_year','pesticides_tonnes', 'avg_temp','Area', 'Item', 'hg/ha_yield']

In [94]:
# Keep only the columns listed in `col`, in that order.
df = df.loc[:, col]

In [None]:
df.head()

In [96]:
# Target is the yield column; every remaining column is a predictor.
y = df['hg/ha_yield']
X = df.drop(columns=['hg/ha_yield'])

In [None]:
X.shape

In [None]:
y.shape

In [126]:
from sklearn.model_selection import train_test_split

# Shuffled 80/20 hold-out split; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [127]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Per-column preprocessing, addressed by column position in the feature matrix:
# positions 0-3 are standardized and positions 4-5 are one-hot encoded, with the
# first category of each encoded column dropped. With X built from `col`, those
# are the four numeric columns followed by 'Area' and 'Item'.
scale = StandardScaler()
ohe = OneHotEncoder(drop='first')

scaled_positions = [0, 1, 2, 3]
encoded_positions = [4, 5]

preprocesser = ColumnTransformer(
    transformers=[
        ('StandardScale', scale, scaled_positions),
        ('OneHotEncode', ohe, encoded_positions),
    ],
    # Any column not listed above would be forwarded unchanged.
    remainder='passthrough',
)

In [128]:
# Learn the scaling statistics and category vocabulary from the training split
# only, then apply that same fitted transform to the test split.
# Calling fit_transform on X_test would re-fit the transformer on test data:
# that leaks test information, can yield a different scaling / one-hot layout
# than the models were trained on, and leaves `preprocesser` fitted on the test
# split for everything that uses it afterwards (prediction helper, pickle).
X_train_dummy = preprocesser.fit_transform(X_train)
X_test_dummy = preprocesser.transform(X_test)

In [None]:
preprocesser.get_feature_names_out(col[:-1])

In [130]:
from sklearn.linear_model import LinearRegression,Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score

In [None]:
# Candidate regressors, all with default hyper-parameters. The decision tree is
# seeded because scikit-learn permutes features at every split, so an unseeded
# tree can pick different tie-breaking splits (and report different scores) on
# each run of the notebook.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso' : Lasso(),
    'Ridge' : Ridge(),
    'Decision Tree': DecisionTreeRegressor(random_state=0),
    'KNN': KNeighborsRegressor(),
}

# Fit each model on the preprocessed training split and report mean absolute
# error and R^2 on the preprocessed test split.
for name, model in models.items():
    model.fit(X_train_dummy, y_train)
    y_pred = model.predict(X_test_dummy)
    print(f"{name}: mae : {mean_absolute_error(y_test, y_pred)} score : {r2_score(y_test, y_pred)}")

In [None]:
# Final model used by the prediction helper and persisted to disk below.
# Seeded so that re-running the notebook rebuilds the same tree; an unseeded
# DecisionTreeRegressor may break ties between equally good splits differently
# on every run, making the pickled artifact non-reproducible.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train_dummy, y_train)
# Displayed as the cell output: predictions for the held-out split.
dtr.predict(X_test_dummy)

In [None]:
df.columns

In [None]:
df.head()

In [135]:
# Predictive System

In [136]:
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item):
    """Predict the yield for a single observation.

    The six arguments are packed, in this order, into a one-row object array,
    passed through the module-level fitted ``preprocesser`` and then through
    the module-level fitted ``dtr`` model. The argument order must match the
    column order the preprocessor was fitted with, because the row carries no
    column names.

    Returns:
        The first (and only) value predicted by ``dtr`` for that row.
    """
    row = [Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item]
    # dtype=object keeps the numeric and string fields together in one array.
    features = np.array([row], dtype=object)
    encoded_features = preprocesser.transform(features)
    predicted_yield = dtr.predict(encoded_features)
    return predicted_yield.ravel()[0]

In [None]:
result = prediction(1990,1485.0,121.0,16.37,'Albania','Maize')

In [None]:
result

In [139]:
import pickle

# Persist the fitted model and the fitted preprocessor side by side.
# The files are opened with context managers so each handle is flushed and
# closed as soon as its dump finishes; the previous inline open(...) calls
# left closing to garbage collection, which can leave a truncated file behind
# if the kernel is interrupted.
with open("dtr.pkl", "wb") as model_file:
    pickle.dump(dtr, model_file)

with open("preprocesser.pkl", "wb") as preprocesser_file:
    pickle.dump(preprocesser, preprocesser_file)