In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

C:\Users\Imahv\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll
C:\Users\Imahv\anaconda3\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
Farming = pd.read_csv("C:\\Users\\Imahv\\MLCourse\\crop_Data.csv") #reading the csv file

In [3]:
Farming.head(1) #printing the first 30 rows of the dataset

Unnamed: 0,temperature,humidity,ph,water availability,harvest season,label,Country
0,20.879744,82.002744,6.502985,202.935536,rainy,rice,Nigeria


In [4]:
Farming.tail(5) #printing the last 30 rows of the dataset

Unnamed: 0,temperature,humidity,ph,water availability,harvest season,label,Country
1395,23.874845,86.792613,6.718725,177.514731,rainy,jute,South Africa
1396,23.928879,88.071123,6.880205,154.660874,rainy,jute,South Africa
1397,24.814412,81.686889,6.861069,190.788639,rainy,jute,Nigeria
1398,24.447439,82.286484,6.769346,190.968489,rainy,jute,Nigeria
1399,26.574217,73.819949,7.261581,159.322307,rainy,jute,South Africa


In [5]:
Farming.shape #printing the shape of the dataset

(1400, 7)

In [6]:
Farming.isnull().sum() #checking for null values

temperature           0
humidity              0
ph                    0
water availability    0
harvest season        0
label                 0
Country               0
dtype: int64

In [7]:
Farming.info() #printing the information of the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1400 entries, 0 to 1399
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   temperature         1400 non-null   float64
 1   humidity            1400 non-null   float64
 2   ph                  1400 non-null   float64
 3   water availability  1400 non-null   float64
 4   harvest season      1400 non-null   object 
 5   label               1400 non-null   object 
 6   Country             1400 non-null   object 
dtypes: float64(4), object(3)
memory usage: 76.7+ KB


In [8]:
Farming.describe() #describing the dataset

Unnamed: 0,temperature,humidity,ph,water availability
count,1400.0,1400.0,1400.0,1400.0
mean,24.971621,64.611062,6.565246,91.784651
std,4.081622,22.753785,0.835101,58.682258
min,15.330426,14.25804,3.504752,20.211267
25%,22.178239,56.824217,6.068795,51.546542
50%,25.140245,68.288321,6.524478,72.379183
75%,27.963227,82.710409,7.042343,107.428334
max,36.977944,94.962187,9.935091,298.560117


In [9]:
Farming.dtypes #printing the data types of the dataset

temperature           float64
humidity              float64
ph                    float64
water availability    float64
harvest season         object
label                  object
Country                object
dtype: object

In [10]:
categorical_columns = ['label','Country']
for column in categorical_columns:
    value_counts = Farming[column].value_counts()
    print(f"Value counts for {column}:")
    print(value_counts)

Value counts for label:
maize          200
rice           100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
watermelon     100
muskmelon      100
cotton         100
jute           100
Name: label, dtype: int64
Value counts for Country:
Nigeria         713
South Africa    468
Kenya           155
Sudan            64
Name: Country, dtype: int64


In [11]:
X = Farming[['temperature', 'humidity','ph', 'water availability','label', 'Country']].copy()
y = Farming['harvest season'].copy() #separating the dependent variable

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [13]:
#Temperature Level
Farming['temperature'] = pd.cut(Farming['temperature'], bins=[15, 19, 24, 29, float('inf')], labels=['Cool', 'Mild', 'Warm', 'Hot'])

#Humidity Level
Farming['humidity'] = pd.cut(Farming['humidity'], bins=[0, 20, 40, 60, 80, float('inf')], labels=['Low', 'Moderate', 'Average', 'High', 'Very High'])

# pH Level
Farming['ph'] = pd.cut(Farming['ph'], bins=[0, 2, 6, 7, 10, float('inf')], labels=['Strongly Acidic', 'Moderately Acidic', 'Neutral', 'Moderately Alkaline', 'Highly Alkaline'])

# Water Availability Level
Farming['water availability'] = pd.cut(Farming['water availability'], bins=[float('-inf'), 50, 100, float('inf')], labels=['Low', 'Moderate', 'High'])

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['ph', 'water availability']), #scaling the numerical columns
        ('cat', OneHotEncoder(), ['label', 'Country']) #one-hot encoding the categorical columns
    ])

# Create the pipeline with the preprocessing steps and the model
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', RandomForestClassifier())])

# Fit the pipeline to your data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.9
              precision    recall  f1-score   support

       rainy       0.85      0.92      0.88       113
      spring       1.00      1.00      1.00        20
      summer       1.00      1.00      1.00        57
      winter       0.89      0.79      0.84        90

    accuracy                           0.90       280
   macro avg       0.93      0.93      0.93       280
weighted avg       0.90      0.90      0.90       280



In [15]:
# Access the OneHotEncoder from the preprocessor
onehot_encoder = preprocessor.transformers_[1][1]

# Get the feature names for each category
categories = onehot_encoder.categories_
encoded_feature_names = [f"{col}={cat}" for col, cats in zip(categorical_columns, categories) for cat in cats]

# Create a DataFrame to display the encoded values
encoded_values = pd.DataFrame(onehot_encoder.transform(X_train[categorical_columns]).toarray(), columns=encoded_feature_names)

# Print the encoded values
print(encoded_values)

      label=blackgram  label=chickpea  label=cotton  label=jute  \
0                 1.0             0.0           0.0         0.0   
1                 0.0             0.0           0.0         1.0   
2                 0.0             0.0           0.0         0.0   
3                 0.0             0.0           0.0         0.0   
4                 0.0             0.0           0.0         1.0   
...               ...             ...           ...         ...   
1115              1.0             0.0           0.0         0.0   
1116              0.0             0.0           0.0         0.0   
1117              0.0             0.0           1.0         0.0   
1118              0.0             0.0           0.0         0.0   
1119              0.0             0.0           0.0         0.0   

      label=kidneybeans  label=lentil  label=maize  label=mothbeans  \
0                   0.0           0.0          0.0              0.0   
1                   0.0           0.0          0.0   

In [16]:
import pickle

# Make predictions on new data
new_data = pd.DataFrame({
    'temperature': [25],
    'humidity': [65],
    'ph': [6.5],
    'water availability': [80],
    'label': ['rice'],
    'Country': ['Nigeria']
})

# Use the pipeline to make predictions
prediction = pipeline.predict(new_data)

print("Prediction:", prediction)

# Save the pipeline as a pickle file
with open('crop_prediction_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

Prediction: ['rainy']
