In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [37]:
# Read Crop_recommendation.csv (path is relative to the working directory) into a DataFrame.
crop = pd.read_csv(filepath_or_buffer="Crop_recommendation.csv")

In [38]:
# Preview the first five rows of the loaded frame.
crop.head(n=5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [39]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
crop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [40]:
# True if at least one cell anywhere in the frame is missing.
crop.isnull().to_numpy().any()

False

In [41]:
# Number of rows per crop label, most frequent first (checks class balance).
crop['label'].value_counts(sort=True, ascending=False)

label
rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: count, dtype: int64

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

In [43]:
# Split into train/test partitions with a fixed seed (default split proportions).
# Features are every column except 'label'; the target is the 'label' column itself.
X_train, X_test, y_train, y_test = train_test_split(
    crop.drop(columns='label'),
    crop['label'],
    random_state=42,
)

In [44]:
# Standardise every predictor column (all columns of `crop` except 'label');
# any column not listed would be passed through unchanged.
transformer = make_column_transformer(
    (StandardScaler(), crop.drop(columns='label').columns.tolist()),
    remainder='passthrough',
)
# Two-step pipeline: scale the features, then classify with a default KNN model.
knn_pipe = Pipeline(steps=[
    ('transform', transformer),
    ('knn', KNeighborsClassifier()),
])

In [45]:
# Hyper-parameter grid for the 'knn' pipeline step:
#   n_neighbors - odd values 1..21
#   weights     - uniform vs. distance-weighted voting
#   p           - Minkowski power (1 = Manhattan, 2 = Euclidean)
params = dict(
    knn__n_neighbors=[k for k in range(1, 22) if k % 2 == 1],
    knn__weights=['uniform', 'distance'],
    knn__p=[1, 2],
)

In [46]:
# Peek at the first five training targets (still string crop names at this point).
y_train.head(n=5)

564       mothbeans
916     pomegranate
1700         papaya
436      pigeonpeas
1555          apple
Name: label, dtype: object

In [47]:
# Search the KNN hyper-parameter grid with GridSearchCV's default
# cross-validation, fitting on the training split only.
knn_grid = GridSearchCV(knn_pipe, param_grid=params)
knn_grid.fit(X_train, y_train)

# Look the winning k up by its parameter name. Taking the first of the dict's
# values by position would silently report a different hyper-parameter as soon
# as a key is added to or renamed in the grid.
best_k = knn_grid.best_params_['knn__n_neighbors']
# Score of the refitted best pipeline on the held-out test split.
best_acc = knn_grid.score(X_test, y_test)
print("done")
print(best_k)
print(best_acc)
print(knn_grid.best_params_)

done
1
0.9618181818181818
{'knn__n_neighbors': 1, 'knn__p': 2, 'knn__weights': 'uniform'}


In [48]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [50]:
# Map each crop label to an integer code; with sort=False the codes are
# assigned in order of first appearance in the column.
y_encoded = pd.factorize(values=crop['label'], sort=False)[0]
print(y_encoded)

[ 0  0  0 ... 21 21 21]


In [52]:
# Same seeded split as before, but with the integer-encoded target.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the KNN
# cells above also use; re-running those cells after this one would train on
# the integer codes instead of the string labels.
X_train, X_test, y_train, y_test = train_test_split(
    crop.drop(columns='label'),
    y_encoded,
    random_state=42,
)

In [53]:
# Learn the scaling statistics from the training features only, then apply the
# same transform to both splits so the test split does not influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [54]:
# Fit an L1-regularised linear regression (alpha=0.01) on the standardised
# training features, then predict for the standardised test features.
# NOTE(review): y_train holds the integer codes from pd.factorize, so this
# regression treats the crop classes as positions on a numeric scale whose
# order is just the order labels first appear - confirm that is intended.
lasso = Lasso(alpha=0.01).fit(X_train_scaled, y_train)
y_pred = lasso.predict(X_test_scaled)
print(y_pred)

[16.32356511 15.95042403 15.4799117  13.59354791 12.87116941  8.141445
 12.81797255  6.8083466  10.38255444  4.79903712  8.0193321  13.2417287
 12.83348151 12.92157251 14.48414434 14.12477911 13.46190523 10.86914357
 13.28231658 16.49963237 11.9280098  14.54602219 10.59844488 13.0049492
  5.03653395 13.31036487  9.80688259 12.90813023  5.27150289 12.03591682
 12.09304005  7.15106115 13.09471135  4.21850938 13.68754155  8.25260677
 14.01774289  2.82605379  3.7526341   1.34710784 13.11332995  8.90113819
  7.67907198  8.79384465 11.33616316  7.46752743  6.34928417 15.57399997
 15.11866449 10.00019738  8.36775087 10.40912625 12.34784702 10.82627388
 12.14616656  0.84163577 12.15547346 12.78861217 11.66044281  7.12441883
 16.12320638 14.13060463  8.08496947 12.37398977 12.59249487  7.68268972
 14.10270905 13.83087369 16.28749355  5.48532731 15.89304669 15.192363
 12.04800748 10.60560092  7.60662007 15.53234506 10.87760208 12.93420362
 15.17332553 11.15554438 12.69874422  3.27875469 13.48079

In [55]:
# Mean squared error between the encoded test targets and the Lasso predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [56]:
# Coefficient of determination of the Lasso predictions on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both regression metrics, one per line.
print(f"Mean Squared Error: {mse}", f"R^2 Score: {r2}", sep="\n")

Mean Squared Error: 25.038191185166134
R^2 Score: 0.39827341795532323


In [57]:
# Fitted Lasso weights, one per (standardised) input feature.
coefficients = lasso.coef_

In [58]:
# Show the raw coefficient array (same order as the feature columns).
print(coefficients)

[ 1.0961919  -2.1419019   2.42699297  0.76047823  2.15749895  0.46246361
  0.49345327]


In [62]:
# Names of the predictor columns: every column of `crop` except the 'label' target.
features = crop.drop(columns='label').columns

In [63]:
# Pair each feature name with its Lasso coefficient, then order the rows from
# the largest coefficient down to the smallest.
coef_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})
coef_df = coef_df.sort_values(by='Coefficient', ascending=False)

In [64]:
# Plain-text dump of the coefficient table, sorted by coefficient (descending).
print(coef_df)

       Feature  Coefficient
2            K     2.426993
4     humidity     2.157499
0            N     1.096192
3  temperature     0.760478
6     rainfall     0.493453
5           ph     0.462464
1            P    -2.141902
