In [1]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Load the crop-yield dataset into the notebook's working frame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer='Agricultural_Yield_Prediction.csv')

In [3]:
# Preview the first ten records to sanity-check column names and value formats.
df.head(n=10)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909
5,Dry chillies,1997,Whole Year,Assam,13587.0,9073,2051.4,1293074.79,4211.97,0.643636
6,Gram,1997,Rabi,Assam,2979.0,1507,2051.4,283511.43,923.49,0.465455
7,Jute,1997,Kharif,Assam,94520.0,904095,2051.4,8995468.4,29301.2,9.919565
8,Linseed,1997,Rabi,Assam,10098.0,5158,2051.4,961026.66,3130.38,0.461364
9,Maize,1997,Kharif,Assam,19216.0,14721,2051.4,1828786.72,5956.96,0.615652


In [4]:
# Schema check: column names, non-null counts and dtypes.
# Useful here to confirm there are no missing values before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output the means sit far above the medians and the
# minima of Production and Yield are 0, so the distributions look heavily
# right-skewed -- worth keeping in mind for the log-scaled plots and the linear model.
df.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
mean,2009.127584,179926.6,16435940.0,1437.755177,24103310.0,48848.35,79.954009
std,6.498099,732828.7,263056800.0,816.909589,94946000.0,213287.4,878.306193
min,1997.0,0.5,0.0,301.3,54.17,0.09,0.0
25%,2004.0,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,2010.0,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,2015.0,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889
max,2020.0,50808100.0,6326000000.0,6552.7,4835407000.0,15750510.0,21105.0


In [6]:
# Pairwise Pearson correlation between the numeric columns only
# (the text columns Crop / Season / State are skipped via numeric_only).
correlation = df.corr(method='pearson', numeric_only=True)
correlation


Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
Crop_Year,1.0,-0.035686,0.003366,-0.011187,0.011169,-0.004657,0.002539
Area,-0.035686,1.0,0.037441,-0.106054,0.973255,0.973479,0.001858
Production,0.003366,0.037441,1.0,0.029879,0.039799,0.035171,0.570809
Annual_Rainfall,-0.011187,-0.106054,0.029879,1.0,-0.109734,-0.097657,0.020761
Fertilizer,0.011169,0.973255,0.039799,-0.109734,1.0,0.954991,0.002862
Pesticide,-0.004657,0.973479,0.035171,-0.097657,0.954991,1.0,0.001782
Yield,0.002539,0.001858,0.570809,0.020761,0.002862,0.001782,1.0


In [7]:
# Render the correlation matrix as an annotated heatmap
# (each cell is labelled with its coefficient to three decimals).
fig = px.imshow(
    correlation,
    text_auto='.3f',
    aspect='auto',
    title='<b>Correlation Matrix</b>',
    color_continuous_scale='tropic',
)
# Colour the title separately from the rest of the figure text.
fig.update_layout(
    title=dict(font=dict(color='olive')),
    font=dict(color='fuchsia'),
)
fig.show()


In [8]:
# One scatter panel per candidate driver of Yield, stacked vertically.
# Each entry: (DataFrame column, display label, marker styling for that panel).
panels = [
    ('Production', 'Production',
     dict(line=dict(color='darkblue', width=1))),
    ('Area', 'Area',
     dict(color='coral', line=dict(color='darkslategray', width=1))),
    ('Annual_Rainfall', 'Annual Rainfall',
     dict(color='beige', line=dict(color='coral', width=1))),
    ('Fertilizer', 'Fertilizer',
     dict(line=dict(width=1))),
    ('Pesticide', 'Pesticide',
     dict(line=dict(width=1))),
]

fig = make_subplots(
    rows=len(panels),
    cols=1,
    subplot_titles=[f'<b>{label} vs Yield</b>' for _, label, _ in panels],
)

for row, (column, label, marker) in enumerate(panels, start=1):
    # Scattergl (WebGL) is used for every panel, as each plots the full column.
    fig.add_trace(
        go.Scattergl(x=df[column], y=df['Yield'], mode='markers',
                     marker=marker, name=label),
        row=row, col=1,
    )
    fig.update_xaxes(type='log', title=label, row=row, col=1)

# Both axes are logarithmic because the variables span many orders of magnitude.
# NOTE(review): df.describe() shows minima of 0 for Production and Yield; such
# points cannot be placed on a log axis and will not be visible -- confirm that
# this is acceptable.
fig.update_yaxes(type='log', title='Yield')
fig.update_layout(
    title=dict(text='<b>Impact of Various Factors on Crop Yield</b>',
               font=dict(color='sienna')),
    legend=dict(font=dict(color='saddlebrown')),
    height=1700,
)
fig.show()

In [9]:
# Features: every column except the categorical/time fields, the target itself
# and Production (presumably excluded because Yield is closely tied to it --
# TODO confirm the intent). What remains is Area, Annual_Rainfall, Fertilizer
# and Pesticide.
X = df.drop(columns=['Crop', 'Crop_Year', 'Season', 'State', 'Yield', 'Production'])
y = df['Yield']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: feature/target row counts must agree within each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


(15751, 4) (15751,)
(3938, 4) (3938,)


In [10]:
# Baseline model: ordinary least squares on the four numeric features.
# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X_train, y_train)
lr


In [18]:
# Score the baseline on the held-out rows.
pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_true=y_test, y_pred=pred_lr)
# NOTE(review): the recorded R^2 is ~0.0005, i.e. these features explain almost
# none of the variance in Yield with a linear model.
print("r2 Score of Linear Regression: ", r2_lr)


r2 Score of Linear Regression:  0.0005353688048722605


In [26]:
# Placeholder results table for model metrics (no rows or columns yet).
comparison = pd.DataFrame()


In [27]:
# Results table: one row per model with its test-set R^2.
# Built in a single constructor call so this cell is self-contained and can be
# re-run safely; assigning length-1 columns into an existing frame would raise
# a length-mismatch error if `comparison` already held a different number of rows.
comparison = pd.DataFrame({
    'Type': ['Linear Regression'],
    'r2 Score': [r2_lr],
})
comparison



Unnamed: 0,Type,r2 Score
0,Linear Regression,0.000535


In [28]:
# Show the predicted yields for the test split
# (NumPy abbreviates long arrays with "..." when printed).
print(pred_lr)


[ 67.0526911   80.29278596 133.85452223 ...  83.31126157  79.98834397
  71.50052441]
