##### importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
pd.options.mode.chained_assignment = None  # default='warn'

##### importing dataset

In [2]:
df=pd.read_csv('crop_production_predictionML.csv')
df

Unnamed: 0,Year,C02039V02469,Area_under_Crops_Hectares,Crop_Yield_per_Hectare_in_Tonnes,crop_production_in_Tonnes,crop__Fodder beet,crop__Kale and field cabbage,crop__Oilseed rape,crop__Potatoes,crop__Spring barley,...,crop__Spring wheat,crop__Sugar beet,crop__Total barley,crop__Total oats,crop__Total wheat,"crop__Total wheat, oats and barley",crop__Turnips,crop__Winter barley,crop__Winter oats,crop__Winter wheat
0,2008,1,320.7,7.7,2461.3,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2008,11,110.7,9.0,992.8,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2008,111,87.5,9.6,839.9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,2008,112,23.2,6.6,153.0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,2008,12,22.9,7.6,174.3,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,2007,131,18.7,7.6,142.4,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
546,2007,132,148.8,6.6,982.2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
547,2007,2,2.9,4.4,12.8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
548,2007,3,8.2,3.9,31.9,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### splitting in dependant and independant data

In [3]:
X=df.drop(['crop_production_in_Tonnes'],axis=1)
X.shape

(550, 20)

In [4]:
y=df['crop_production_in_Tonnes']

### feature scalling

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[ 0.58211472, -1.00907364,  3.02939687, ..., -0.26856053,
        -0.26856053, -0.26856053],
       [ 0.58211472, -0.82827142,  0.55963168, ..., -0.26856053,
        -0.26856053, -0.26856053],
       [ 0.58211472,  0.97975081,  0.28678143, ..., -0.26856053,
        -0.26856053,  3.72355541],
       ...,
       [ 0.48822525, -0.99099342, -0.70818111, ..., -0.26856053,
        -0.26856053, -0.26856053],
       [ 0.48822525, -0.9729132 , -0.64584894, ..., -0.26856053,
        -0.26856053, -0.26856053],
       [ 0.48822525, -0.95483297, -0.60468619, ..., -0.26856053,
        -0.26856053, -0.26856053]])

### dimentional reduction using PCA

In [9]:
from sklearn.decomposition import PCA

pca = PCA(0.95)
X_pca = pca.fit_transform(X_scaled)
X_pca.shape

(550, 16)

In [10]:
pca.explained_variance_ratio_

array([0.1175678 , 0.11047456, 0.08033868, 0.05692333, 0.05360624,
       0.05360624, 0.05360624, 0.05360624, 0.05360624, 0.05360624,
       0.05360624, 0.05360624, 0.05198806, 0.05192557, 0.05157712,
       0.04107943])

### train_test_split

In [11]:
from sklearn.model_selection import train_test_split
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=30)

### Model Building

In [12]:
from sklearn.linear_model import LinearRegression
LR= LinearRegression()
LR.fit(X_train_pca, y_train)

LinearRegression()

In [13]:
from sklearn.tree import DecisionTreeRegressor
DTR= DecisionTreeRegressor()
DTR.fit(X_train_pca, y_train)

DecisionTreeRegressor()

In [14]:
from sklearn.ensemble import RandomForestRegressor
RFR= RandomForestRegressor()
RFR.fit(X_train_pca, y_train)

RandomForestRegressor()

### accuarcy score

In [16]:
LR.score(X_test_pca, y_test)

0.9501566595154014

In [17]:
DTR.score(X_test_pca, y_test)

0.9571615588315115

In [18]:
RFR.score(X_test_pca, y_test)

0.974447322614494

### cross validation

In [19]:
from sklearn.model_selection import cross_val_score
model_scoring={}
def all_model_scores(model,X,y):
    scores=cross_val_score(model,X,y,cv=10)
    mean_score=scores.mean()
    model_scoring.update({model:mean_score})
    return model_scoring

Model_list=[LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor()]
for model in Model_list:
    score_dict=all_model_scores(model,X_pca,y)
df_Models_scores=pd.DataFrame(score_dict,index=[0])
df_Models_scores

Unnamed: 0,LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor()
0,0.934236,0.966176,0.976309


### Table for Accuracy score ,MAE and RMSE

In [32]:

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

Models_names=['Linear Regresser','Decision Tree Regressor','Random Forest Regressor']
Model_list=[LinearRegression(),DecisionTreeRegressor(),RandomForestRegressor()]
Accuracy_Scores=[]
MAE_Scores=[]
RMSE_Scores=[]

for model in Model_list:
    model.fit(X_train_pca,y_train)
    y_pred=model.predict(X_test_pca)
    Accuracy_Scores.append(model.score(X_test_pca, y_test))
    MAE_Scores.append(mean_absolute_error(y_test,y_pred))
    RMSE_Scores.append(np.sqrt(mean_squared_error(y_test,y_pred)))
    
Table=pd.DataFrame(list(zip(Models_names,Accuracy_Scores,MAE_Scores,RMSE_Scores)),columns =['Machine Learning Models','Accuracy Scores','Mean_Absolute_Error','root_mean_squared_error'])

In [33]:
Table

Unnamed: 0,Machine Learning Models,Accuracy Scores,Mean_Absolute_Error,root_mean_squared_error
0,Linear Regresser,0.950157,110.016822,148.049836
1,Decision Tree Regressor,0.962281,74.646364,128.790875
2,Random Forest Regressor,0.974919,64.697618,105.02063
