In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

In [7]:
ds = pd.read_csv("./crop_production.csv")
ds = ds.drop(['Crop_Year'], axis=1)
ds = ds.drop(['State_Name'], axis=1)

In [9]:
ds

Unnamed: 0,District_Name,Season,Crop,Area,Production
0,NICOBARS,Kharif,Arecanut,1254.0,2000.0
1,NICOBARS,Kharif,Other Kharif pulses,2.0,1.0
2,NICOBARS,Kharif,Rice,102.0,321.0
3,NICOBARS,Whole Year,Banana,176.0,641.0
4,NICOBARS,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...
246086,PURULIA,Summer,Rice,306.0,801.0
246087,PURULIA,Summer,Sesamum,627.0,463.0
246088,PURULIA,Whole Year,Sugarcane,324.0,16250.0
246089,PURULIA,Winter,Rice,279151.0,597899.0


In [11]:
ds.shape

(246091, 5)

In [13]:
ds.columns

Index(['District_Name', 'Season', 'Crop', 'Area', 'Production'], dtype='object')

In [15]:
ds.describe()

Unnamed: 0,Area,Production
count,246091.0,242361.0
mean,12002.82,582503.4
std,50523.4,17065810.0
min,0.04,0.0
25%,80.0,88.0
50%,582.0,729.0
75%,4392.0,7023.0
max,8580100.0,1250800000.0


In [17]:
ds.isnull().sum()

District_Name       0
Season              0
Crop                0
Area                0
Production       3730
dtype: int64

In [19]:
ds = ds.dropna()

In [21]:
ds

Unnamed: 0,District_Name,Season,Crop,Area,Production
0,NICOBARS,Kharif,Arecanut,1254.0,2000.0
1,NICOBARS,Kharif,Other Kharif pulses,2.0,1.0
2,NICOBARS,Kharif,Rice,102.0,321.0
3,NICOBARS,Whole Year,Banana,176.0,641.0
4,NICOBARS,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...
246086,PURULIA,Summer,Rice,306.0,801.0
246087,PURULIA,Summer,Sesamum,627.0,463.0
246088,PURULIA,Whole Year,Sugarcane,324.0,16250.0
246089,PURULIA,Winter,Rice,279151.0,597899.0


In [23]:
ds.isnull().values.any()

False

In [25]:
ds.District_Name.unique()

array(['NICOBARS', 'NORTH AND MIDDLE ANDAMAN', 'SOUTH ANDAMANS',
       'ANANTAPUR', 'CHITTOOR', 'EAST GODAVARI', 'GUNTUR', 'KADAPA',
       'KRISHNA', 'KURNOOL', 'PRAKASAM', 'SPSR NELLORE', 'SRIKAKULAM',
       'VISAKHAPATANAM', 'VIZIANAGARAM', 'WEST GODAVARI', 'ANJAW',
       'CHANGLANG', 'DIBANG VALLEY', 'EAST KAMENG', 'EAST SIANG',
       'KURUNG KUMEY', 'LOHIT', 'LONGDING', 'LOWER DIBANG VALLEY',
       'LOWER SUBANSIRI', 'NAMSAI', 'PAPUM PARE', 'TAWANG', 'TIRAP',
       'UPPER SIANG', 'UPPER SUBANSIRI', 'WEST KAMENG', 'WEST SIANG',
       'BAKSA', 'BARPETA', 'BONGAIGAON', 'CACHAR', 'CHIRANG', 'DARRANG',
       'DHEMAJI', 'DHUBRI', 'DIBRUGARH', 'DIMA HASAO', 'GOALPARA',
       'GOLAGHAT', 'HAILAKANDI', 'JORHAT', 'KAMRUP', 'KAMRUP METRO',
       'KARBI ANGLONG', 'KARIMGANJ', 'KOKRAJHAR', 'LAKHIMPUR', 'MARIGAON',
       'NAGAON', 'NALBARI', 'SIVASAGAR', 'SONITPUR', 'TINSUKIA',
       'UDALGURI', 'ARARIA', 'ARWAL', 'AURANGABAD', 'BANKA', 'BEGUSARAI',
       'BHAGALPUR', 'BHOJPUR', 'B

In [27]:
ds.Season.unique()

array(['Kharif     ', 'Whole Year ', 'Autumn     ', 'Rabi       ',
       'Summer     ', 'Winter     '], dtype=object)

In [29]:
ds.Crop.unique()

array(['Arecanut', 'Other Kharif pulses', 'Rice', 'Banana', 'Cashewnut',
       'Coconut ', 'Dry ginger', 'Sugarcane', 'Sweet potato', 'Tapioca',
       'Black pepper', 'Dry chillies', 'other oilseeds', 'Turmeric',
       'Maize', 'Moong(Green Gram)', 'Urad', 'Arhar/Tur', 'Groundnut',
       'Sunflower', 'Bajra', 'Castor seed', 'Cotton(lint)', 'Horse-gram',
       'Jowar', 'Korra', 'Ragi', 'Tobacco', 'Gram', 'Wheat', 'Masoor',
       'Sesamum', 'Linseed', 'Safflower', 'Onion', 'other misc. pulses',
       'Samai', 'Small millets', 'Coriander', 'Potato',
       'Other  Rabi pulses', 'Soyabean', 'Beans & Mutter(Vegetable)',
       'Bhindi', 'Brinjal', 'Citrus Fruit', 'Cucumber', 'Grapes', 'Mango',
       'Orange', 'other fibres', 'Other Fresh Fruits', 'Other Vegetables',
       'Papaya', 'Pome Fruit', 'Tomato', 'Mesta', 'Cowpea(Lobia)',
       'Lemon', 'Pome Granet', 'Sapota', 'Cabbage', 'Rapeseed &Mustard',
       'Peas  (vegetable)', 'Niger seed', 'Bottle Gourd', 'Varagu',
       'Garl

In [31]:
X = ds.drop(['Production'], axis=1)
Y = ds['Production']

In [33]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [37]:
categorical_cols = ['District_Name', 'Season', 'Crop']

In [39]:
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(x_train[categorical_cols])

x_train_categorical = ohe.transform(x_train[categorical_cols])
x_test_categorical = ohe.transform(x_test[categorical_cols])

x_train_final = np.hstack((x_train_categorical.toarray(), x_train.drop(categorical_cols, axis=1)))
x_test_final = np.hstack((x_test_categorical.toarray(), x_test.drop(categorical_cols, axis=1)))

In [41]:
model = RandomForestRegressor(n_estimators=15, random_state=0)
model.fit(x_train_final, y_train)

In [42]:
RF_predict = model.predict(x_test_final)
r = r2_score(y_test, RF_predict)
print("R2 score: ", r)

R2 score:  0.9057992052789764


In [48]:
user_input = pd.DataFrame([['BAGALKOT', 'Kharif', 'Rice', 197]], columns=['District_Name', 'Season', 'Crop', 'Area'])
user_input_categorical = ohe.transform(user_input[categorical_cols])
user_input_final = np.hstack((user_input_categorical.toarray(), user_input.drop(categorical_cols, axis=1)))
prediction = model.predict(user_input_final)
print("Prediction: ", prediction)

Prediction:  [323.06666667]


In [50]:
import pickle
with open('./YieldPreModel.pkl', 'wb') as f:
    pickle.dump(model, f)

In [52]:
import pickle
with open('./YieldPreModel.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

In [54]:
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(x_train[categorical_cols])
user_input = pd.DataFrame([['BAGALKOT', 'Kharif', 'Rice', 197]], columns=['District_Name', 'Season', 'Crop', 'Area'])
user_input_categorical = ohe.transform(user_input[categorical_cols])
user_input_final = np.hstack((user_input_categorical.toarray(), user_input.drop(categorical_cols, axis=1)))
prediction = loaded_model.predict(user_input_final)
print("Prediction: ", prediction)

Prediction:  [323.06666667]
