In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df=pd.read_csv('indiancrop_dataset.csv')
df.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Andaman and Nicobar,7000,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Andaman and Nicobar,5000,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Andaman and Nicobar,7000,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Andaman and Nicobar,7000,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Andaman and Nicobar,120000,Rice


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N_SOIL       2200 non-null   int64  
 1   P_SOIL       2200 non-null   int64  
 2   K_SOIL       2200 non-null   int64  
 3   TEMPERATURE  2200 non-null   float64
 4   HUMIDITY     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   RAINFALL     2200 non-null   float64
 7   STATE        2200 non-null   object 
 8   CROP_PRICE   2200 non-null   int64  
 9   CROP         2200 non-null   object 
dtypes: float64(4), int64(4), object(2)
memory usage: 172.0+ KB


In [11]:
df["STATE"].nuni

0       Andaman and Nicobar
1       Andaman and Nicobar
2       Andaman and Nicobar
3       Andaman and Nicobar
4       Andaman and Nicobar
               ...         
2195            West Bengal
2196            West Bengal
2197            West Bengal
2198            West Bengal
2199            West Bengal
Name: STATE, Length: 2200, dtype: object

In [5]:
df.drop(['STATE'],axis=1,inplace=True)

In [6]:
df

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,CROP_PRICE,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,7000,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,5000,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,7000,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,7000,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,120000,Rice
...,...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,1000,Coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,800,Coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,560,Coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,1500,Coffee


# **SKEWNESS**

In [11]:
df.skew()

  """Entry point for launching an IPython kernel.


N_SOIL          0.509721
P_SOIL          1.010773
K_SOIL          2.375167
TEMPERATURE     0.184933
HUMIDITY       -1.091708
ph              0.283929
RAINFALL        0.965756
CROP_PRICE     15.926745
dtype: float64

In [12]:
df.P_SOIL=np.log(df['P_SOIL'])

In [13]:
df.K_SOIL=np.log(df['K_SOIL'])

In [14]:
df['HUMIDITY']=np.log(df['HUMIDITY'])

In [15]:
iq1=df['CROP_PRICE'].quantile(0.25)
iq2=df['CROP_PRICE'].quantile(0.75)
iq1

950.0

In [16]:
iq2

3500.0

In [17]:
df['CROP_PRICE']=np.where(df['CROP_PRICE']<iq1,iq1,df['CROP_PRICE'])
df['CROP_PRICE']=np.where(df['CROP_PRICE']>iq2,iq2,df['CROP_PRICE'])

In [18]:
df.skew()

  """Entry point for launching an IPython kernel.


N_SOIL         0.509721
P_SOIL        -0.782108
K_SOIL         0.804010
TEMPERATURE    0.184933
HUMIDITY      -1.900877
ph             0.283929
RAINFALL       0.965756
CROP_PRICE     0.283069
dtype: float64

# **ENCODING CATEGORICAL INTO NUMERICAL**

In [19]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['CROP']=le.fit_transform(df['CROP'])


# **SPLITING OF FEATURES**

In [20]:
x=df.iloc[:,0:8]
y=df.iloc[:,8]
x.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,CROP_PRICE
0,90,3.73767,3.7612,20.879744,4.406753,6.502985,202.935536,3500.0
1,85,4.060443,3.713572,21.770462,4.386014,7.038096,226.655537,3500.0
2,60,4.007333,3.78419,23.004459,4.410623,7.840207,263.964248,3500.0
3,74,3.555348,3.688879,26.491096,4.384004,6.980401,242.864034,3500.0
4,78,3.73767,3.73767,20.130175,4.401889,7.628473,262.71734,3500.0


In [21]:
y.unique()

array([20, 11,  3,  9, 18, 13, 14,  2, 10, 19,  1, 12,  7, 21, 15,  0, 16,
       17,  4,  6,  8,  5])

###Training a model

**TRAIN-TEST-SPLIT**

In [22]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7,random_state=1)

In [23]:
print("Training data",x_train.shape)

Training data (1540, 8)


In [24]:
print("Training data",x_test.shape)

Training data (660, 8)


# **IMPORTING ALGORITHM**

**NAIVE BAYES**

In [25]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [26]:
model.fit(x_train,y_train)

GaussianNB()

**PREDICTION OF CROP**

In [27]:
y_prediction=model.predict(x_test)

# **MODEL METRICS**

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_prediction)

0.9954545454545455

# **SAVING THE MODEL**

In [29]:
import pickle

In [30]:
filename='trained_model.sav'
pickle.dump(model,open(filename,'wb'))

In [31]:
loaded_model=pickle.load(open('trained_model.sav','rb'))

In [32]:
input_data=(49,4.234107,4.406719,18.315615,2.731860,7.263119,81.787105,3500.0)
input_array=np.asarray(input_data)
data_reshape=input_array.reshape(1,-1)
prediction=model.predict(data_reshape)
print(prediction)


[3]


  "X does not have valid feature names, but"
