In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,GradientBoostingRegressor
from sklearn.model_selection import train_test_split,learning_curve,ShuffleSplit,cross_val_score,KFold,GridSearchCV
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn import neighbors
from sklearn.svm import SVR 

In [57]:
data = pd.read_csv('crop_yield.csv')

In [58]:
data.head()

Unnamed: 0,Temperature,Rainfall,Humidity,Prediction,Crop,Yield
0,29.02,161,78,0.691,Aus,35.7
1,29.42,130,77,0.806,Aus,35.7
2,30.0,90,76,0.814,Aus,35.7
3,29.45,142,76,1.043,Aus,35.7
4,29.05,132,76,0.93,Aus,35.7


In [59]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
Temperature    60 non-null float64
Rainfall       60 non-null int64
Humidity       60 non-null int64
Prediction     60 non-null float64
Crop           60 non-null object
Yield          60 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 2.9+ KB


In [60]:
data.isnull().sum()

Temperature    0
Rainfall       0
Humidity       0
Prediction     0
Crop           0
Yield          0
dtype: int64

In [61]:
data.describe()

Unnamed: 0,Temperature,Rainfall,Humidity,Prediction,Yield
count,60.0,60.0,60.0,60.0,60.0
mean,25.229167,97.966667,78.683333,3.020133,51.166667
std,4.256946,109.063031,3.422219,2.905228,24.008732
min,20.36,1.0,72.0,0.679,17.5
25%,20.8575,11.25,77.0,0.95875,35.7
50%,25.51,70.0,78.0,1.308,47.05
75%,29.2275,132.0,80.0,5.51,71.4
max,30.47,479.0,86.0,10.088,88.3


In [62]:
data.corr()

Unnamed: 0,Temperature,Rainfall,Humidity,Prediction,Yield
Temperature,1.0,0.736964,-0.050174,-0.198074,-0.337564
Rainfall,0.736964,1.0,0.549719,-0.292852,-0.097176
Humidity,-0.050174,0.549719,1.0,-0.216558,0.266062
Prediction,-0.198074,-0.292852,-0.216558,1.0,0.160262
Yield,-0.337564,-0.097176,0.266062,0.160262,1.0


In [63]:
le=LabelEncoder()
data['Crop']=le.fit_transform(data['Crop'])

In [64]:
data.head()

Unnamed: 0,Temperature,Rainfall,Humidity,Prediction,Crop,Yield
0,29.02,161,78,0.691,1,35.7
1,29.42,130,77,0.806,1,35.7
2,30.0,90,76,0.814,1,35.7
3,29.45,142,76,1.043,1,35.7
4,29.05,132,76,0.93,1,35.7


In [65]:
data.describe()

Unnamed: 0,Temperature,Rainfall,Humidity,Prediction,Crop,Yield
count,60.0,60.0,60.0,60.0,60.0,60.0
mean,25.229167,97.966667,78.683333,3.020133,2.5,51.166667
std,4.256946,109.063031,3.422219,2.905228,1.722237,24.008732
min,20.36,1.0,72.0,0.679,0.0,17.5
25%,20.8575,11.25,77.0,0.95875,1.0,35.7
50%,25.51,70.0,78.0,1.308,2.5,47.05
75%,29.2275,132.0,80.0,5.51,4.0,71.4
max,30.47,479.0,86.0,10.088,5.0,88.3


In [66]:
y = data['Yield'].values
x = data.drop(columns = ['Yield'])

In [67]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99)

In [68]:
cv_k = 5
cv_scoring = 'neg_mean_squared_error'
cv_scoring = 'r2'

In [69]:
kf = KFold(n_splits=cv_k, shuffle=True)

In [70]:
# Linear regression
est = linear_model.LinearRegression()
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.52 (+/- 0.47)


In [71]:
# Linear regression with L2 regularization
est = linear_model.Ridge(alpha = 1.0)
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.56 (+/- 0.47)


In [72]:
# Polynomial regression
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train)
est = linear_model.LinearRegression()
scores = cross_val_score(est, x_train_poly, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.99 (+/- 0.02)


In [73]:
# Polynomial regression with regularization
poly = PolynomialFeatures(degree=2)
x_train_poly = poly.fit_transform(x_train)
est = linear_model.Ridge(alpha = 1.0)
scores = cross_val_score(est, x_train_poly, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.99 (+/- 0.03)


In [74]:
# Random forest regression
est = RandomForestRegressor(n_estimators=10, n_jobs=-1)
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.9978 (+/- 0.0062)


In [75]:
# Gradient boosted regression
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=0.3, random_state=0, loss='ls')
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.0084 (+/- 0.0176)


In [76]:
# Nearest neighbor regression
n_neighbors = 5
weight = 'uniform' # 'distance'
est = neighbors.KNeighborsRegressor(n_neighbors, weights=weight, n_jobs=-1)
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.18 (+/- 0.67)


In [77]:
# Support vector regression - linear kernel
est = SVR(kernel='linear', C=1)
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.36 (+/- 0.60)


In [78]:
# Support vector regression - RBF kernel
est = SVR(kernel='rbf', C=1, gamma=0.1)
scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: -0.45 (+/- 0.65)


In [79]:
## Support vector regression - polynomial kernel
## This takes much longer than the previous two SVR methods.
#est = SVR(kernel='poly', C=1e3, degree=2)
#scores = cross_val_score(est, x_train, y_train, cv=kf, scoring=cv_scoring)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [80]:
# Random forest regression - with hyperparameter tuning
# Set the parameters by cross-validation

tuned_parameters = [{'n_estimators': [10,100,1000], 
                     'min_samples_split': [2, 5, 10, 20], 
                    'min_samples_leaf': [10,20,50],
                    'max_features': ['auto','sqrt']}]


est_base = RandomForestRegressor(random_state=0,  n_jobs=-1)

est = GridSearchCV(est_base, tuned_parameters, cv=kf, scoring=cv_scoring, n_jobs=-1)

est.fit(x_train, y_train)

means = est.cv_results_['mean_test_score']
stds = est.cv_results_['std_test_score']

for mean, std, params in zip(means, stds, est.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()



0.531 (+/-0.112) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 10}
0.539 (+/-0.138) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 100}
0.549 (+/-0.118) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 1000}
0.531 (+/-0.112) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 10}
0.539 (+/-0.138) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 100}
0.549 (+/-0.118) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 5, 'n_estimators': 1000}
0.531 (+/-0.112) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 10}
0.539 (+/-0.138) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 100}
0.549 (+/-0.118) for {'max_features': 'auto', 'min_samples_leaf': 10, 'min_samp