In [10]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [4]:
# Load the dataset
file_path = '/Users/sivagar/Downloads/Crop_Dataset.csv'
data = pd.read_csv(file_path)


In [6]:
# Display basic information about the dataset and the first few rows to understand its structure
data.info(), data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   N                     2200 non-null   int64  
 1   P                     2200 non-null   int64  
 2   K                     2200 non-null   int64  
 3   temperature           2200 non-null   float64
 4   humidity              2200 non-null   float64
 5   ph                    2200 non-null   float64
 6   rainfall              2200 non-null   float64
 7   Total_Nutrients       2200 non-null   int64  
 8   Temperature_Humidity  2200 non-null   float64
 9   Log_Rainfall          2200 non-null   float64
 10  Label                 2200 non-null   object 
 11  Label_Encoded         2200 non-null   int64  
dtypes: float64(6), int64(5), object(1)
memory usage: 206.4+ KB


(None,
     N   P   K  temperature   humidity        ph    rainfall  Total_Nutrients  \
 0  90  42  43    20.879744  82.002744  6.502985  202.935536              175   
 1  85  58  41    21.770462  80.319644  7.038096  226.655537              184   
 2  60  55  44    23.004459  82.320763  7.840207  263.964248              159   
 3  74  35  40    26.491096  80.158363  6.980401  242.864034              149   
 4  78  42  42    20.130175  81.604873  7.628473  262.717340              162   
 
    Temperature_Humidity  Log_Rainfall  Label  Label_Encoded  
 0           1712.196283      5.317804  wheat              0  
 1           1748.595734      5.427834  wheat              0  
 2           1893.744627      5.579595  wheat              0  
 3           2123.482908      5.496611  wheat              0  
 4           1642.720357      5.574878  wheat              0  )

In [8]:
# Check for any missing values
missing_data = data.isnull().sum()

In [9]:
# Scale the numeric features (excluding encoded labels)
features_to_scale = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall',
                     'Total_Nutrients', 'Temperature_Humidity', 'Log_Rainfall']
scaler = StandardScaler()
data_scaled = data.copy()
data_scaled[features_to_scale] = scaler.fit_transform(data[features_to_scale])

missing_data, data_scaled.head()


(N                       0
 P                       0
 K                       0
 temperature             0
 humidity                0
 ph                      0
 rainfall                0
 Total_Nutrients         0
 Temperature_Humidity    0
 Log_Rainfall            0
 Label                   0
 Label_Encoded           0
 dtype: int64,
           N         P         K  temperature  humidity        ph  rainfall  \
 0  1.068797 -0.344551 -0.101688    -0.935587  0.472666  0.043302  1.810361   
 1  0.933329  0.140616 -0.141185    -0.759646  0.397051  0.734873  2.242058   
 2  0.255986  0.049647 -0.081939    -0.515898  0.486954  1.771510  2.921066   
 3  0.635298 -0.556811 -0.160933     0.172807  0.389805  0.660308  2.537048   
 4  0.743673 -0.344551 -0.121436    -1.083647  0.454792  1.497868  2.898373   
 
    Total_Nutrients  Temperature_Humidity  Log_Rainfall  Label  Label_Encoded  
 0         0.287062             -0.203138      1.483789  wheat              0  
 1         0.399702      

In [12]:
# Splitting the dataset into training and testing sets
X = data_scaled[features_to_scale]
y = data_scaled['Label_Encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting the test set results
y_pred = rf_classifier.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

accuracy


0.990909090909091

In [13]:
classification_rep

'              precision    recall  f1-score   support\n\n           0       0.94      0.89      0.92        19\n           1       1.00      1.00      1.00        21\n           2       1.00      1.00      1.00        26\n           3       1.00      1.00      1.00        20\n           4       1.00      1.00      1.00        23\n           5       1.00      0.96      0.98        24\n           6       1.00      1.00      1.00        19\n           7       1.00      1.00      1.00        20\n           8       0.92      1.00      0.96        11\n           9       1.00      1.00      1.00        23\n          10       1.00      1.00      1.00        21\n          11       1.00      1.00      1.00        19\n          12       1.00      1.00      1.00        14\n          13       1.00      1.00      1.00        19\n          14       1.00      1.00      1.00        17\n          15       1.00      1.00      1.00        23\n          16       1.00      1.00      1.00        14\n       

In [14]:
import joblib

# Save the model to a .joblib file
model_filename ='/Users/sivagar/Desktop/ucsc/task1/random_forest_classifier.joblib'
joblib.dump(rf_classifier, model_filename)

# Load the model to confirm it's saved correctly
loaded_model = joblib.load(model_filename)

# Just for demonstration, let's predict using the first few entries of the test set
sample_data = X_test.head()
predicted_labels = loaded_model.predict(sample_data)
predicted_label_names = [data_scaled['Label'].unique()[label] for label in predicted_labels]

model_filename, predicted_label_names


('/Users/sivagar/Desktop/ucsc/task1/random_forest_classifier.joblib',
 ['raspberries', 'plums', 'strawberries', 'strawberries', 'pears'])