In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder


In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the crop dataset from the mounted Drive folder into a DataFrame.
# The path points at a .zip archive and is passed to read_csv unchanged.
DATASET_PATH = '/content/drive/MyDrive/ML_VAP__project/archive (6).zip'
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first five rows to sanity-check the parsed columns and values.
df.head()

Unnamed: 0,State,District,Crop,Crop_Year,Season,Area,Production,Yield
0,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Kharif,2439.6,3415.0,1.4
1,Andaman and Nicobar Island,NICOBARS,Arecanut,2007,Rabi,1626.4,2277.0,1.4
2,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Autumn,4147.0,3060.0,0.74
3,Andaman and Nicobar Island,NICOBARS,Arecanut,2008,Summer,4147.0,2660.0,0.64
4,Andaman and Nicobar Island,NICOBARS,Arecanut,2009,Autumn,4153.0,3120.0,0.75


In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(345336, 8)

In [None]:
# Per-column dtype and non-null count; a non-null count below the row total
# means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345336 entries, 0 to 345335
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   State       345336 non-null  object 
 1   District    345336 non-null  object 
 2   Crop        345327 non-null  object 
 3   Crop_Year   345336 non-null  int64  
 4   Season      345336 non-null  object 
 5   Area        345336 non-null  float64
 6   Production  340388 non-null  float64
 7   Yield       345336 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 21.1+ MB


In [None]:
# Check whether the dataset contains any null/missing value at all
# (True means at least one cell is missing).
df.isnull().values.any()

True

In [None]:
df.isnull().sum() # Count the missing values in each column to see where they occur.

State            0
District         0
Crop             9
Crop_Year        0
Season           0
Area             0
Production    4948
Yield            0
dtype: int64

In [None]:
# Preview of mode imputation for 'Crop'.
# 'Crop' holds strings, so an average cannot be computed; the most frequently
# occurring value (the mode) is used as the fill value instead.
# NOTE: fillna returns a new Series here and the result is not assigned back,
# so df itself is left unchanged - this cell only displays the filled column.
most_common_crop = df['Crop'].mode()[0]
df['Crop'].fillna(most_common_crop)

0         Arecanut
1         Arecanut
2         Arecanut
3         Arecanut
4         Arecanut
            ...   
345331       Wheat
345332       Wheat
345333       Wheat
345334       Wheat
345335       Wheat
Name: Crop, Length: 345336, dtype: object

In [None]:
# Preview of mean imputation for 'Production'.
# For numerical values the mean, the median or the mode can all serve as the
# fill value; the mean is used here.
# NOTE: the filled Series is only displayed, not stored, so df is unchanged.
production_mean = df['Production'].mean()
df['Production'].fillna(production_mean)

0         3415.0
1         2277.0
2         3060.0
3         2660.0
4         3120.0
           ...  
345331    1241.0
345332    2415.0
345333    2145.0
345334    2114.0
345335     931.0
Name: Production, Length: 345336, dtype: float64

In [None]:
# Re-check for missing values. The fillna calls above were not assigned back
# to df, so missing values are expected to still be present.
df.isnull().values.any()

True

In [None]:
# Per-column count of the values that are still missing in df.
df.isnull().sum()

State            0
District         0
Crop             9
Crop_Year        0
Season           0
Area             0
Production    4948
Yield            0
dtype: int64

In [None]:
# Drop any rows with missing values for simplicity
df.dropna(inplace=True)

# Encode categorical variables.
# Each column gets its OWN LabelEncoder, stored in `label_encoders`, so every
# integer code can be mapped back to the original string later, e.g.
#     label_encoders['Crop'].inverse_transform(y_pred)
# Refitting one shared encoder on every column would keep only the mapping of
# the last column fitted and make the crop predictions impossible to decode.
# NOTE: 'District ' is spelled with a trailing space, matching the column name
# in the loaded frame.
label_encoders = {}
for column in ['State', 'District ', 'Crop', 'Season']:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` still refers to the encoder fitted on
# 'Season', which is what this name held before the per-column dict existed.

# Features (X) and target variable (y)
X = df.drop("Crop", axis=1)
y = df["Crop"]

In [None]:
# Hold out a fixed fraction of the rows for evaluation; the seed keeps the
# partition identical from run to run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

# k-NN classifies by distance between feature vectors. Without scaling, the
# columns with large magnitudes (Area, Production) dominate that distance and
# the small label-encoded columns barely contribute. Standardising inside a
# pipeline fits the scaler on the training split only and applies the same
# transformation automatically whenever `knn.predict` is called.
# NOTE: `knn` is now a Pipeline; the underlying classifier is its last step.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Evaluate on the held-out split. average=None reports one score per class.
print(confusion_matrix(y_test,y_pred_knn))
print("acuuracy",accuracy_score(y_test,y_pred_knn))
print("precison",precision_score(y_test,y_pred_knn,average=None))
print("recall",recall_score(y_test,y_pred_knn,average=None))
print("f1:",f1_score(y_test,y_pred_knn,average=None))

[[100  33  22 ...  11   0   0]
 [ 39 517  59 ... 124  16   1]
 [ 26 122 241 ...  44  31   0]
 ...
 [ 43 280  73 ... 543  11   6]
 [ 21  81  60 ...   8 572   0]
 [  4  14   5 ...  10   3  13]]
acuuracy 0.25358050442880853
precison [0.12048193 0.11882326 0.11261682 0.25668449 0.19404916 0.26869159
 0.43859649 0.19825708 0.19055794 0.97508897 0.2171753  0.14190476
 0.15942029 0.20494532 0.27176781 0.27166276 0.15665064 0.15750774
 0.27393617 0.15211268 0.1290107  0.36013986 0.09259259 0.21351351
 0.25808458 0.19541376 0.38509317 0.20399765 0.23566879 0.2192029
 0.21818182 0.34276851 0.14193548 0.11428571 0.16729089 0.
 0.2245098  0.41594925 0.22916667 0.17480315 0.34358868 0.09770115
 0.27428571 0.26276217 0.25081788 0.28415301 0.72857143 0.21046229
 0.36786469 0.36226415 0.31563422 0.34322581 0.22382523 0.34108527
 0.22033898]
recall [0.23419204 0.23813911 0.14526823 0.24090339 0.25380711 0.37953795
 0.71428571 0.2826087  0.24       0.9148581  0.3011811  0.11797308
 0.1981982  0.24227094

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score

# Fix the seed: the tree builder shuffles candidate features at every split,
# so without random_state the fitted tree (and every metric, rule listing and
# feature importance derived from it) can differ between runs.
# 42 matches the seed already used for the train/test split.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)
y_pred_dt = dtree.predict(X_test)

# Evaluate on the held-out split. average=None reports one score per class.
print(confusion_matrix(y_test,y_pred_dt))
print("acuuracy",accuracy_score(y_test,y_pred_dt))
print("precison",precision_score(y_test,y_pred_dt,average=None))
print("recall",recall_score(y_test,y_pred_dt,average=None))
print("f1:",f1_score(y_test,y_pred_dt,average=None))

[[ 309    2    2 ...    9   11    1]
 [   0  971   82 ...  115   14    4]
 [   0   84  789 ...   46    5    2]
 ...
 [   8  113   33 ... 1532    5    3]
 [   5    9    1 ...    7 1813    1]
 [   1    6    0 ...    3    1  102]]
acuuracy 0.6204885644197012
precison [0.71527778 0.45845137 0.48945409 0.81851852 0.71356784 0.82698962
 0.85714286 0.71604938 0.42857143 0.99832215 0.72482552 0.51517572
 0.45142857 0.66989738 0.73249027 0.69767442 0.53578067 0.50380457
 0.80339806 0.4361603  0.42910053 0.80662983 0.34017595 0.59190853
 0.68081366 0.48801237 0.77568134 0.51732502 0.38576779 0.46917808
 0.7        0.80571429 0.52234359 0.28214286 0.43514259 0.23076923
 0.58959944 0.82742752 0.50782609 0.57858456 0.81445179 0.33012821
 0.53954306 0.55354994 0.40864714 0.4720314  0.91824197 0.46368715
 0.78373847 0.81176471 0.6744186  0.70701169 0.51617251 0.82709854
 0.47663551]
recall [0.7236534  0.44725933 0.4755877  0.83186951 0.72081218 0.78877888
 0.91428571 0.72049689 0.39891892 0.9933222  

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_text

# Render the fitted decision tree as indented text rules.
# export_text returns a string (it does not draw a figure); the feature names
# are taken from the columns of X so the rules read by column name.
feature_labels = X.columns.tolist()
tree_rules = export_text(dtree, feature_names=feature_labels, spacing=3)
print("Decision Tree Rules:\n", tree_rules)

Decision Tree Rules:
 |--- Yield <= 5.12
|   |--- Yield <= 1.22
|   |   |--- Season <= 1.50
|   |   |   |--- Yield <= 0.61
|   |   |   |   |--- State <= 21.50
|   |   |   |   |   |--- Production <= 1018.50
|   |   |   |   |   |   |--- State <= 3.50
|   |   |   |   |   |   |   |--- State <= 2.50
|   |   |   |   |   |   |   |   |--- Yield <= 0.31
|   |   |   |   |   |   |   |   |   |--- Yield <= 0.07
|   |   |   |   |   |   |   |   |   |   |--- Yield <= 0.04
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 6
|   |   |   |   |   |   |   |   |   |   |--- Yield >  0.04
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 7
|   |   |   |   |   |   |   |   |   |--- Yield >  0.07
|   |   |   |   |   |   |   |   |   |   |--- Crop_Year <= 2002.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 14
|   |   |   |   |   |   |   |   |   |   |--- Crop_Year >  2002.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated br

In [None]:
# Raw importance scores of the fitted tree, printed as a bare array
# (one score per feature, unlabeled).
feature_importances = dtree.feature_importances_
importances_header = "Feature Importances:\n"
print(importances_header, feature_importances)


Feature Importances:
 [0.14451434 0.16072666 0.13767345 0.06006129 0.13239462 0.12238472
 0.24224492]


In [None]:
# Label each importance score with its feature name and list the features
# from most to least important.
feature_importances = dtree.feature_importances_

# Feature name -> importance score, paired in the column order of X.
feature_importance_dict = {
    name: score for name, score in zip(X.columns, feature_importances)
}

# (name, score) pairs ordered by descending score.
sorted_features = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# One "name: score" line per feature, scores rounded to four decimals.
print("Feature Importances:")
for name, score in sorted_features:
    print(f"{name}: {score:.4f}")


Feature Importances:
Yield: 0.2422
District : 0.1607
State: 0.1445
Crop_Year: 0.1377
Area : 0.1324
Production: 0.1224
Season: 0.0601
