In [1]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import linear_model

In [2]:
# Load the crop production records. A comma is already read_csv's default
# delimiter, so it does not need to be spelled out.
df = pd.read_csv('./crop_production.csv')
# Preview the first ten rows.
df.head(10)

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
5,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Coconut,18168.0,65100000.0
6,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Dry ginger,36.0,100.0
7,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Sugarcane,1.0,2.0
8,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Sweet potato,5.0,15.0
9,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Tapioca,40.0,169.0


In [3]:
# Show the distinct values of every categorical column, one line per column.
for column_name in ('State_Name', 'District_Name', 'Crop', 'Season'):
    print(column_name + ':', df[column_name].unique())



State_Name: ['Andaman and Nicobar Islands' 'Andhra Pradesh' 'Arunachal Pradesh'
 'Assam' 'Bihar' 'Chandigarh' 'Chhattisgarh' 'Dadra and Nagar Haveli'
 'Goa' 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu and Kashmir '
 'Jharkhand' 'Karnataka' 'Kerala' 'Madhya Pradesh' 'Maharashtra' 'Manipur'
 'Meghalaya' 'Mizoram' 'Nagaland' 'Odisha' 'Puducherry' 'Punjab'
 'Rajasthan' 'Sikkim' 'Tamil Nadu' 'Telangana ' 'Tripura' 'Uttar Pradesh'
 'Uttarakhand' 'West Bengal']
District_Name: ['NICOBARS' 'NORTH AND MIDDLE ANDAMAN' 'SOUTH ANDAMANS' 'ANANTAPUR'
 'CHITTOOR' 'EAST GODAVARI' 'GUNTUR' 'KADAPA' 'KRISHNA' 'KURNOOL'
 'PRAKASAM' 'SPSR NELLORE' 'SRIKAKULAM' 'VISAKHAPATANAM' 'VIZIANAGARAM'
 'WEST GODAVARI' 'ANJAW' 'CHANGLANG' 'DIBANG VALLEY' 'EAST KAMENG'
 'EAST SIANG' 'KURUNG KUMEY' 'LOHIT' 'LONGDING' 'LOWER DIBANG VALLEY'
 'LOWER SUBANSIRI' 'NAMSAI' 'PAPUM PARE' 'TAWANG' 'TIRAP' 'UPPER SIANG'
 'UPPER SUBANSIRI' 'WEST KAMENG' 'WEST SIANG' 'BAKSA' 'BARPETA'
 'BONGAIGAON' 'CACHAR' 'CHIRANG' 'DARRANG' 'D

In [4]:
# Distinct values of each categorical column, as plain Python lists.
state_name = df['State_Name'].unique().tolist()
district_name = df['District_Name'].unique().tolist()
crop = df['Crop'].unique().tolist()
season = df['Season'].unique().tolist()

# Report how many distinct categories each column holds.
for label, categories in (
    ('no states:', state_name),
    ('no districts:', district_name),
    ('no crops:', crop),
    ('no seasons:', season),
):
    print(label, len(categories))

no states: 33
no districts: 646
no crops: 124
no seasons: 6


In [5]:
# Categorical columns that are expanded into one indicator column per category.
columnsToEncode = ['State_Name', 'District_Name', 'Season', 'Crop']

# Keep the encoder's default sparse output and densify explicitly below.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing either spelling ties this cell to one version
# range; `.toarray()` on the sparse result works on old and new releases alike.
myEncoder = OneHotEncoder(handle_unknown='ignore')
myEncoder.fit(df[columnsToEncode])

# Replace the categorical columns with their indicator columns (labelled
# 0..n-1 by pandas).
# - `columns=` is used because pandas 2.0 no longer accepts `axis` positionally.
# - `index=df.index` keeps the indicator rows aligned with df during concat.
enc_df = pd.concat(
    [
        df.drop(columns=columnsToEncode),
        pd.DataFrame(
            myEncoder.transform(df[columnsToEncode]).toarray(),
            index=df.index,
        ),
    ],
    axis=1,
)
# Discard rows that still have a missing value in any remaining column.
enc_df = enc_df.dropna()
enc_df


Unnamed: 0,Crop_Year,Area,Production,0,1,2,3,4,5,6,...,799,800,801,802,803,804,805,806,807,808
0,2000,1254.0,2000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2000,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2000,102.0,321.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2000,176.0,641.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2000,720.0,165.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246086,2014,306.0,801.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246087,2014,627.0,463.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246088,2014,324.0,16250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
246089,2014,279151.0,597899.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Move the three named columns to the end of the frame (Production last),
# leaving all other columns in their existing order.
trailing_columns = ['Crop_Year', 'Area', 'Production']
leading_columns = [col for col in enc_df.columns if col not in trailing_columns]
enc_df = enc_df[leading_columns + trailing_columns]
enc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,802,803,804,805,806,807,808,Crop_Year,Area,Production
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000,1254.0,2000.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000,2.0,1.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000,102.0,321.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000,176.0,641.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000,720.0,165.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,306.0,801.0
246087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,627.0,463.0
246088,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,324.0,16250.0
246089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014,279151.0,597899.0


In [7]:
# Work on the underlying NumPy array: every column except the last is a
# feature, and the last column is the regression target.
dataset = enc_df.values
X, Y = dataset[:, :-1], dataset[:, -1]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=1
)
dataset



array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+03,
        1.25400e+03, 2.00000e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+03,
        2.00000e+00, 1.00000e+00],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+03,
        1.02000e+02, 3.21000e+02],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01400e+03,
        3.24000e+02, 1.62500e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01400e+03,
        2.79151e+05, 5.97899e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.01400e+03,
        1.75000e+02, 8.80000e+01]])

In [8]:
# Report the (features, target) shapes of each split as a sanity check.
train_shapes = (X_train.shape, Y_train.shape)
test_shapes = (X_test.shape, Y_test.shape)
print('Train_data : {}, {}'.format(*train_shapes))
print('Test_data : {}, {}'.format(*test_shapes))

Train_data : (181770, 811), (181770,)
Test_data : (60591, 811), (60591,)


In [9]:
# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the construction and the fit can be chained.
regr = linear_model.LinearRegression().fit(X_train, Y_train)
# Display the fitted estimator as the cell output.
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Print the fitted coefficients, one per feature column of X_train.
coefficients = regr.coef_
print(coefficients)

[-7.83159394e+05  1.82943186e+05  8.07318459e+04 -1.59144058e+06
 -4.19602718e+05 -1.17059448e+05 -5.22179075e+05 -1.12012950e+06
 -6.99420215e+05 -6.44806402e+05 -5.78442793e+05 -3.18835722e+05
 -5.60858063e+05 -1.75426581e+05 -2.53969464e+06  2.17378844e+07
 -5.64492782e+05 -8.43342029e+05  3.69579632e+05  2.60240961e+05
 -3.91472296e+05 -4.93295487e+05 -9.99849108e+04 -3.81918706e+06
 -8.02098350e+05 -5.24569714e+05 -2.72618992e+05 -3.34244218e+05
 -8.96170958e+05 -6.48670286e+05 -5.80152160e+05 -4.71751106e+05
 -1.81827452e+06 -1.73039333e+05  4.23248735e+04  2.27715971e+05
 -1.44993499e+05 -1.64021306e+05 -5.84756605e+04 -2.01112399e+05
 -4.53069641e+05 -1.25519256e+05 -1.14331627e+05 -4.23231416e+06
 -2.43179083e+05  6.45351427e+02 -2.69915266e+05 -1.02819159e+05
 -1.84371910e+05  6.59163977e+04 -2.04685206e+05 -3.73277535e+05
 -1.94426630e+05 -2.08447255e+05 -2.44643700e+05  8.19416843e+04
  8.38806857e+04 -1.95100460e+06 -5.80343762e+04 -4.59278887e+04
  6.90868077e+04  1.65604

In [15]:
# Average of the model's predictions over the held-out rows.
regr.predict(X_test).mean()

512950.51369917166

In [16]:
# Score the model on the held-out split. For scikit-learn regressors, score()
# is documented to return the coefficient of determination (R^2).
test_score = regr.score(X_test, Y_test)
test_score

0.1631318976661137