In [1]:
# Importing required libraries
import pandas as pd  # For data manipulation
import seaborn as sns  # For data visualization
import matplotlib.pyplot as plt  # For plotting graphs
from sklearn.model_selection import train_test_split  # For splitting the data
from sklearn.ensemble import RandomForestClassifier  # Machine learning model
from sklearn.metrics import accuracy_score  # For model evaluation
import pickle  # To save and load models

# To display plots inline in Jupyter
%matplotlib inline


In [11]:
# Load the dataset
df = pd.read_csv('Crop_recommendation.csv')

# Display basic information about the dataset
print("First 5 rows of the dataset:")
print(df.head())  # Display the first 5 rows

print("\nDataset Info:")
print(df.info())  # Display data types and missing value information


First 5 rows of the dataset:
   Nitrogen  phosphorus  potassium  temperature   humidity        ph  \
0        90          42         43    20.879744  82.002744  6.502985   
1        85          58         41    21.770462  80.319644  7.038096   
2        60          55         44    23.004459  82.320763  7.840207   
3        74          35         40    26.491096  80.158363  6.980401   
4        78          42         42    20.130175  81.604873  7.628473   

     rainfall label  Unnamed: 8  Unnamed: 9  
0  202.935536  rice         NaN         NaN  
1  226.655537  rice         NaN         NaN  
2  263.964248  rice         NaN         NaN  
3  242.864034  rice         NaN         NaN  
4  262.717340  rice         NaN         NaN  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Nitrogen     2200 non-null   int64  
 1   phosphoru

In [9]:
# Check for missing values in each column
print("Missing values in each column:")
print(df.isnull().sum())

# Check for duplicate rows
print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

# Remove duplicate rows (if any)
df = df.drop_duplicates()

# Confirm changes
print("\nAfter cleaning:")
print(f"Dataset shape: {df.shape}")


Missing values in each column:
Nitrogen          0
phosphorus        0
potassium         0
temperature       0
humidity          0
ph                0
rainfall          0
label             0
Unnamed: 8     2200
Unnamed: 9     2200
dtype: int64

Number of duplicate rows: 0

After cleaning:
Dataset shape: (2200, 10)


In [17]:
# Check if the DataFrame has loaded correctly
print(f"Dataset Shape: {df.shape}")  # Ensure rows and columns exist
print(df.head())  # Preview the dataset

# Generate a heatmap for feature correlation
try:
    plt.figure(figsize=(10, 8))  # Set the figure size
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Heatmap')
    plt.show()
except Exception as e:
    print("An error occurred:", e)


Dataset Shape: (2200, 10)
   Nitrogen  phosphorus  potassium  temperature   humidity        ph  \
0        90          42         43    20.879744  82.002744  6.502985   
1        85          58         41    21.770462  80.319644  7.038096   
2        60          55         44    23.004459  82.320763  7.840207   
3        74          35         40    26.491096  80.158363  6.980401   
4        78          42         42    20.130175  81.604873  7.628473   

     rainfall label  Unnamed: 8  Unnamed: 9  
0  202.935536  rice         NaN         NaN  
1  226.655537  rice         NaN         NaN  
2  263.964248  rice         NaN         NaN  
3  242.864034  rice         NaN         NaN  
4  262.717340  rice         NaN         NaN  
An error occurred: could not convert string to float: 'rice'


<Figure size 1000x800 with 0 Axes>

In [37]:
# Check the actual column names
print("Column Names in Dataset:")
print(df.columns)


Column Names in Dataset:
Index(['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph',
       'rainfall', 'label', 'Unnamed: 8', 'Unnamed: 9'],
      dtype='object')


In [39]:
# After checking the column names, update this with the correct full column names.
X = df[['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph',
       'rainfall', 'label', 'Unnamed: 8', 'Unnamed: 9']]  # Features
y = df['label']  # Target (crop name)

# Split the data into training (80%) and testing (20%) sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the training and testing data
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, {y_test.shape}")


Training Data Shape: (1760, 10), (1760,)
Testing Data Shape: (440, 10), (440,)


In [47]:
# Check if any columns in X_train are non-numeric
print(X_train.dtypes)


Nitrogen         int64
phosphorus       int64
potassium        int64
temperature    float64
humidity       float64
ph             float64
rainfall       float64
label           object
Unnamed: 8     float64
Unnamed: 9     float64
dtype: object


In [57]:
from sklearn.preprocessing import LabelEncoder

# Assume df is your DataFrame and the 'label' column is categorical

# Initialize the label encoder
label_encoder = LabelEncoder()

# Convert the 'label' column to numeric (integers)
df['label'] = label_encoder.fit_transform(df['label'])

# Convert the 'label' column to float
df['label'] = df['label'].astype('float')

# Check the result
print(df['label'])


0       20.0
1       20.0
2       20.0
3       20.0
4       20.0
        ... 
2195     5.0
2196     5.0
2197     5.0
2198     5.0
2199     5.0
Name: label, Length: 2200, dtype: float64


In [61]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Convert the 'label' column to numeric (integers) and then to float
df['label'] = label_encoder.fit_transform(df['label']).astype('float')

# Check the result
print(df['label'])


0       20.0
1       20.0
2       20.0
3       20.0
4       20.0
        ... 
2195     5.0
2196     5.0
2197     5.0
2198     5.0
2199     5.0
Name: label, Length: 2200, dtype: float64


In [63]:
# Convert 'Unnamed: 8' and 'Unnamed: 9' to float (if necessary)
df['Unnamed: 8'] = df['Unnamed: 8'].astype('float')
df['Unnamed: 9'] = df['Unnamed: 9'].astype('float')

# Check the result
print(df[['Unnamed: 8', 'Unnamed: 9']])


      Unnamed: 8  Unnamed: 9
0            NaN         NaN
1            NaN         NaN
2            NaN         NaN
3            NaN         NaN
4            NaN         NaN
...          ...         ...
2195         NaN         NaN
2196         NaN         NaN
2197         NaN         NaN
2198         NaN         NaN
2199         NaN         NaN

[2200 rows x 2 columns]


In [65]:
# Convert all integer columns to float (if needed)
df[['Nitrogen', 'phosphorus', 'potassium']] = df[['Nitrogen', 'phosphorus', 'potassium']].astype('float')

# Check the result
print(df[['Nitrogen', 'phosphorus', 'potassium']].dtypes)


Nitrogen      float64
phosphorus    float64
potassium     float64
dtype: object


In [67]:
from sklearn.preprocessing import LabelEncoder

# Convert 'label' column (categorical) to numeric and then to float
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label']).astype('float')

# Convert 'Unnamed: 8' and 'Unnamed: 9' to float
df['Unnamed: 8'] = df['Unnamed: 8'].astype('float')
df['Unnamed: 9'] = df['Unnamed: 9'].astype('float')

# Convert integer columns ('Nitrogen', 'phosphorus', 'potassium') to float (if needed)
df[['Nitrogen', 'phosphorus', 'potassium']] = df[['Nitrogen', 'phosphorus', 'potassium']].astype('float')

# Check the updated data types
print(df.dtypes)


Nitrogen       float64
phosphorus     float64
potassium      float64
temperature    float64
humidity       float64
ph             float64
rainfall       float64
label          float64
Unnamed: 8     float64
Unnamed: 9     float64
dtype: object


In [69]:
# Select features and target
X = df[['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['label']  # Target: the 'label' column


In [71]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting splits
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, {y_test.shape}")


Training Data Shape: (1760, 7), (1760,)
Testing Data Shape: (440, 7), (440,)


In [73]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [75]:
# Evaluate the model on the testing set
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 99.32%


In [77]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Print the first few predictions
print(f"Predictions: {y_pred[:5]}")


Predictions: [15. 21. 17. 17.  0.]


In [79]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Step 1: Split the data into features (X) and target (y)
X = df[['Nitrogen', 'phosphorus', 'potassium', 'temperature', 'humidity', 'ph', 'rainfall']]
y = df['label']

# Step 2: Split into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shapes of the resulting splits
print(f"Training Data Shape: {X_train.shape}, {y_train.shape}")
print(f"Testing Data Shape: {X_test.shape}, {y_test.shape}")

# Step 3: Train the Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Evaluate the model's accuracy
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Step 5: (Optional) Make predictions
y_pred = model.predict(X_test)
print(f"Predictions: {y_pred[:5]}")


Training Data Shape: (1760, 7), (1760,)
Testing Data Shape: (440, 7), (440,)
Model Accuracy: 99.32%
Predictions: [15. 21. 17. 17.  0.]


In [81]:
import pickle

# Save the trained model to a file using pickle
with open('crop_yield_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("Model saved successfully using pickle.")


Model saved successfully using pickle.
