In [22]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
pd.set_option("display.max_columns", None)
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import defaultdict
import pickle
from sklearn.preprocessing import MinMaxScaler

In [23]:
# Load the raw crop dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("data/indiancrop_dataset.csv")

In [24]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP
0,90,42,43,20.879744,82.002744,6.502985,202.935536,Andaman and Nicobar,7000,Rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,Andaman and Nicobar,5000,Rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,Andaman and Nicobar,7000,Rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,Andaman and Nicobar,7000,Rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,Andaman and Nicobar,120000,Rice


In [25]:
# (rows, columns) of the loaded frame.
data.shape

(2200, 10)

In [26]:
# Count missing values per column.
data.isnull().sum()

N_SOIL         0
P_SOIL         0
K_SOIL         0
TEMPERATURE    0
HUMIDITY       0
ph             0
RAINFALL       0
STATE          0
CROP_PRICE     0
CROP           0
dtype: int64

In [27]:
# Report how many distinct values each column holds.
for feature in data.columns:
    distinct_count = len(data[feature].unique())
    print(f"Feature: {feature:<20} | Number of unique data points: {distinct_count}")

Feature: N_SOIL               | Number of unique data points: 137
Feature: P_SOIL               | Number of unique data points: 117
Feature: K_SOIL               | Number of unique data points: 73
Feature: TEMPERATURE          | Number of unique data points: 2200
Feature: HUMIDITY             | Number of unique data points: 2200
Feature: ph                   | Number of unique data points: 2200
Feature: RAINFALL             | Number of unique data points: 2200
Feature: STATE                | Number of unique data points: 26
Feature: CROP_PRICE           | Number of unique data points: 502
Feature: CROP                 | Number of unique data points: 22


In [28]:
# Column dtypes, non-null counts and memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N_SOIL       2200 non-null   int64  
 1   P_SOIL       2200 non-null   int64  
 2   K_SOIL       2200 non-null   int64  
 3   TEMPERATURE  2200 non-null   float64
 4   HUMIDITY     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   RAINFALL     2200 non-null   float64
 7   STATE        2200 non-null   object 
 8   CROP_PRICE   2200 non-null   int64  
 9   CROP         2200 non-null   object 
dtypes: float64(4), int64(4), object(2)
memory usage: 172.0+ KB


In [29]:
# Columns stored with the generic object dtype ('O') are treated as categorical.
categorical_features = [
    column_name
    for column_name, column_dtype in data.dtypes.items()
    if column_dtype == 'O'
]
print(categorical_features)

['STATE', 'CROP']


In [30]:
# Look at the categorical columns before they are integer-encoded.
data[categorical_features].head(10)

Unnamed: 0,STATE,CROP
0,Andaman and Nicobar,Rice
1,Andaman and Nicobar,Rice
2,Andaman and Nicobar,Rice
3,Andaman and Nicobar,Rice
4,Andaman and Nicobar,Rice
5,Andaman and Nicobar,Rice
6,Andaman and Nicobar,Rice
7,Andaman and Nicobar,Rice
8,Andaman and Nicobar,Rice
9,Andaman and Nicobar,Rice


In [31]:
def label_encoder(df, c_features):
    """Integer-encode the given columns of ``df`` in place and pickle the mappings.

    For each column in ``c_features`` the values are cast to ``str``, the
    distinct labels are sorted and numbered from 0, and the column is replaced
    by those integer codes. The mapping is printed and written to
    ``models/<feature>_label_encoding_infos.pkl`` (the ``models`` directory is
    created if it does not exist).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are encoded. It is modified in place.
    c_features : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (previously the function
        returned the notebook-global ``data``, which only worked when the
        caller happened to pass that exact frame).
    """
    import os

    # Make the output directory explicit so a fresh checkout can run the cell.
    os.makedirs("models", exist_ok=True)

    dict_list = []
    for feature in c_features:
        df[feature] = df[feature].astype(str)
        labels = sorted(df[feature].unique().tolist())
        class_dict = {label: idx for idx, label in enumerate(labels)}

        df[feature] = df[feature].map(class_dict)
        print(f"Feature: {feature}\n{class_dict}\n")

        dict_list.append(class_dict)
        # NOTE(review): the *cumulative* list is pickled, so the file for the
        # n-th feature also contains the mappings of all earlier features.
        # Kept as-is because readers of these files may index into the list;
        # confirm whether a single per-feature dict was intended.
        with open(file=f"models/{feature}_label_encoding_infos.pkl", mode="wb") as file:
            pickle.dump(obj=dict_list, file=file)

    return df

In [32]:
# Encode the categorical columns, then shuffle the rows.
# NOTE: this cell is not idempotent — re-running it would re-encode the already
# encoded integers as strings; restart from the load cell before re-running.
data = label_encoder(data, categorical_features)
# A fixed seed makes the shuffle (and everything derived from the row order)
# reproducible across Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Feature: STATE
{'Andaman and Nicobar': 0, 'Andhra Pradesh': 1, 'Assam': 2, 'Chattisgarh': 3, 'Goa': 4, 'Gujarat': 5, 'Haryana': 6, 'Himachal Pradesh': 7, 'Jammu and Kashmir': 8, 'Karnataka': 9, 'Kerala': 10, 'Madhya Pradesh': 11, 'Maharashtra': 12, 'Manipur': 13, 'Meghalaya': 14, 'Nagaland': 15, 'Odisha': 16, 'Pondicherry': 17, 'Punjab': 18, 'Rajasthan': 19, 'Tamil Nadu': 20, 'Telangana': 21, 'Tripura': 22, 'Uttar Pradesh': 23, 'Uttrakhand': 24, 'West Bengal': 25}

Feature: CROP
{'Apple': 0, 'Banana': 1, 'Blackgram': 2, 'ChickPea': 3, 'Coconut': 4, 'Coffee': 5, 'Cotton': 6, 'Grapes': 7, 'Jute': 8, 'KidneyBeans': 9, 'Lentil': 10, 'Maize': 11, 'Mango': 12, 'MothBeans': 13, 'MungBean': 14, 'Muskmelon': 15, 'Orange': 16, 'Papaya': 17, 'PigeonPeas': 18, 'Pomegranate': 19, 'Rice': 20, 'Watermelon': 21}



Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP
0,31,29,26,28.223734,47.405191,5.024125,97.768323,18,5390,12
1,36,65,16,25.712698,64.112333,7.692014,50.170678,12,2900,10
2,39,52,53,32.512474,94.659041,6.704204,51.070481,23,700,17
3,32,43,22,31.999286,54.107746,5.270749,71.62667,10,4100,13
4,29,21,45,23.409815,93.13277,6.74926,105.224074,16,500,19


In [33]:
# Learn each column's min/max and rescale every column in one call.
scaler = MinMaxScaler()
transformed_data = scaler.fit_transform(data.values)

print(transformed_data)

[[0.22142857 0.17142857 0.105      ... 0.72       0.04490075 0.57142857]
 [0.25714286 0.42857143 0.055      ... 0.48       0.0241504  0.47619048]
 [0.27857143 0.33571429 0.24       ... 0.92       0.00581676 0.80952381]
 ...
 [0.71428571 0.49285714 0.235      ... 0.72       0.01665028 0.04761905]
 [0.00714286 0.17857143 0.025      ... 0.92       0.00540009 0.76190476]
 [0.57142857 0.09285714 0.23       ... 0.92       0.01873365 0.71428571]]


In [34]:
# Wrap the scaled array back into a DataFrame, reusing the original column names.
# `data` is rebound here, so from this point on every column (including the
# integer-encoded STATE and CROP) holds the scaler's output.
data = pd.DataFrame(transformed_data, columns=data.columns)
data.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP
0,0.221429,0.171429,0.105,0.556619,0.386674,0.236282,0.278633,0.72,0.044901,0.571429
1,0.257143,0.428571,0.055,0.484566,0.581569,0.651173,0.107633,0.48,0.02415,0.47619
2,0.278571,0.335714,0.24,0.679682,0.937907,0.497556,0.110865,0.92,0.005817,0.809524
3,0.228571,0.271429,0.085,0.664956,0.464861,0.274635,0.184716,0.4,0.034151,0.619048
4,0.207143,0.114286,0.2,0.418485,0.920103,0.504563,0.305418,0.64,0.00415,0.904762


In [35]:
# Persist the fitted scaler so the same scaling can be re-applied later.
with open("models/scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [36]:
class BisectingMinMaxDBSCAN:
    """DBSCAN with an optional min/max bisecting pre-partition step.

    For a set of rows the bounding-box diagonal (distance between the
    per-column minimum and maximum) decides what happens:

    * diagonal <= ``epsilon``: every row is put in one cluster;
    * ``epsilon`` < diagonal <= ``2 * epsilon + 1`` and depth budget left:
      rows are split by whether they are closer to the min or the max corner
      and each half is processed on its own;
    * otherwise: DBSCAN is run on the rows.

    Changes relative to the previous implementation (all bug fixes):

    * Cluster expansion is iterative, so large clusters no longer raise
      ``RecursionError``, and the neighbour list is no longer extended while
      being iterated.
    * Only core points (>= ``min_points`` neighbours) expand a cluster; border
      points are absorbed but not expanded, as in DBSCAN.
    * The bisect step tracks rows by their index in the *original* dataset, so
      the two halves no longer overwrite each other's labels or mutate the
      result dict while iterating it; ``max_depth`` is now actually consumed,
      and degenerate (empty) splits fall back to DBSCAN.
    * A tight bounding box yields a single cluster id instead of one id per row.
    * The returned mapping is ordered by row index, so ``list(result.values())``
      lines up with the input rows, and state is reset on every run.

    NOTE(review): the ``2 * epsilon + 1`` bisect threshold is kept from the
    original code; its origin is not visible in this file — confirm.

    Parameters
    ----------
    epsilon : float
        Neighbourhood radius (Euclidean distance, inclusive).
    min_points : int
        Minimum neighbourhood size (the point itself included) for a core point.
    max_depth : int
        Depth budget for bisecting; a split is only attempted while it is > 1.
    """

    def __init__(self, epsilon, min_points, max_depth):
        self.epsilon = epsilon
        self.min_points = min_points
        self.max_depth = max_depth
        self.visited = defaultdict(bool)   # row index -> already assigned to a cluster
        self.clusters = defaultdict(int)   # row index -> cluster id, -1 for noise
        self.current_cluster = 0

    def calculate_distance(self, x, y):
        """Return the Euclidean distance between two points."""
        diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
        return np.sqrt(np.sum(diff ** 2))

    @staticmethod
    def _as_points(dataset):
        """Return ``dataset`` as a float array with one row per point."""
        points = np.asarray(dataset, dtype=float)
        if points.ndim == 1:
            # A flat sequence is treated as one-dimensional points.
            points = points[:, np.newaxis]
        return points

    def _distances_from(self, query_point, points):
        """Return the distance from ``query_point`` to every row of ``points``."""
        diff = points - np.asarray(query_point, dtype=float)
        return np.sqrt(np.sum(diff ** 2, axis=tuple(range(1, diff.ndim))))

    def find_neighbors(self, query_point, dataset):
        """Return the indices of rows of ``dataset`` within ``epsilon`` of ``query_point``.

        The query point itself is included when it is part of ``dataset``.
        """
        points = self._as_points(dataset)
        within_radius = self._distances_from(query_point, points) <= self.epsilon
        return np.flatnonzero(within_radius).tolist()

    def split_dataset(self, dataset, min_value, max_value):
        """Split ``dataset`` by nearest corner.

        Returns ``(closer_to_min, closer_to_max)``; ties go to the first array.
        """
        dataset = np.asarray(dataset)
        points = self._as_points(dataset)
        near_min = (
            self._distances_from(min_value, points)
            <= self._distances_from(max_value, points)
        )
        return dataset[near_min], dataset[~near_min]

    def explore_density_reachability(self, point_index, neighbors, dataset):
        """Grow the current cluster from ``point_index`` over ``dataset``.

        ``neighbors`` are the indices already known to be within ``epsilon`` of
        the seed. Updates ``self.visited`` and ``self.clusters``; the caller is
        responsible for advancing ``self.current_cluster`` afterwards.
        """
        points = self._as_points(dataset)
        self._grow_cluster(point_index, neighbors, points, range(len(points)))

    def _grow_cluster(self, seed, neighbors, points, rows):
        """Expand the current cluster inside ``points``; ``rows[i]`` is the global id of local row ``i``."""
        seed_row = rows[seed]
        self.visited[seed_row] = True
        self.clusters[seed_row] = self.current_cluster

        pending = list(neighbors)
        queued = set(pending)  # keeps the work list O(n) instead of O(n^2)
        while pending:
            local = pending.pop()
            row = rows[local]
            if self.visited[row]:
                continue
            self.visited[row] = True
            self.clusters[row] = self.current_cluster

            reachable = self.find_neighbors(points[local], points)
            # Border points join the cluster but must not extend it.
            if len(reachable) >= self.min_points:
                for candidate in reachable:
                    if candidate not in queued:
                        queued.add(candidate)
                        pending.append(candidate)

    def _run_dbscan(self, subset, rows):
        """Run DBSCAN over ``subset`` whose rows carry the global ids ``rows``."""
        for local, row in enumerate(rows):
            if self.visited[row]:
                continue
            neighbors = self.find_neighbors(subset[local], subset)
            if len(neighbors) < self.min_points:
                # Provisional noise: not marked visited, so a later cluster
                # may still absorb it as a border point.
                self.clusters[row] = -1
            else:
                self._grow_cluster(local, neighbors, subset, rows)
                self.current_cluster += 1

    def perform_bisecting_min_max_dbscan(self, dataset):
        """Cluster ``dataset`` and return a mapping of row index to cluster id.

        Parameters
        ----------
        dataset : array-like, shape (n_rows, n_features)
            Points to cluster.

        Returns
        -------
        collections.defaultdict
            ``{row_index: label}`` with keys in ascending row order; ``-1``
            marks noise. The same object is stored on ``self.clusters``.
        """
        points = self._as_points(dataset)

        # Start every run from a clean slate so the method can be re-run.
        self.visited = defaultdict(bool)
        self.clusters = defaultdict(int)
        self.current_cluster = 0

        # Explicit work list instead of recursion: (global row ids, depth budget).
        work = [(list(range(len(points))), self.max_depth)]
        while work:
            rows, depth = work.pop()
            if not rows:
                continue

            subset = points[rows]
            min_value = subset.min(axis=0)
            max_value = subset.max(axis=0)
            span = self.calculate_distance(min_value, max_value)

            if span <= self.epsilon:
                # Everything fits in an epsilon-sized box: a single cluster.
                claimed = False
                for row in rows:
                    if not self.visited[row]:
                        self.visited[row] = True
                        self.clusters[row] = self.current_cluster
                        claimed = True
                if claimed:
                    self.current_cluster += 1
                continue

            if span <= 2 * self.epsilon + 1 and depth > 1:
                near_min = (
                    self._distances_from(min_value, subset)
                    <= self._distances_from(max_value, subset)
                ).tolist()
                first = [row for row, keep in zip(rows, near_min) if keep]
                second = [row for row, keep in zip(rows, near_min) if not keep]
                if first and second:
                    # Pushed in reverse so the min-side half is processed first.
                    work.append((second, depth - 1))
                    work.append((first, depth - 1))
                    continue
                # A one-sided split makes no progress; fall through to DBSCAN.

            self._run_dbscan(subset, rows)

        # Re-key in row order so `.values()` aligns with the input rows.
        ordered = defaultdict(int)
        for row in range(len(points)):
            ordered[row] = self.clusters.get(row, -1)
        self.clusters = ordered
        return self.clusters

In [37]:
# Clustering hyper-parameters (distances are measured on the min-max scaled features).
epsilon = 0.4        # neighbourhood radius used for the distance comparisons
min_points = 10      # minimum number of neighbours for a point to seed/extend a cluster
max_depth = 100000   # passed through as the model's max_depth setting
model = BisectingMinMaxDBSCAN(epsilon=epsilon, min_points=min_points, max_depth=max_depth)

In [38]:
# Cluster the scaled feature matrix; the result maps row index -> cluster id (-1 = noise).
result_clusters = model.perform_bisecting_min_max_dbscan(data.values)

In [39]:
# Look each label up by row index: a dict's `.values()` follows insertion
# order, which need not match the row order of the DataFrame.
data['class'] = [result_clusters[row] for row in range(len(data))]
# NOTE(review): the last row is discarded here for no reason visible in this
# notebook — confirm this is intentional.
data = data.drop(data.index.values[-1])

In [40]:
# Preview the frame with the new `class` column.
data.head()

Unnamed: 0,N_SOIL,P_SOIL,K_SOIL,TEMPERATURE,HUMIDITY,ph,RAINFALL,STATE,CROP_PRICE,CROP,class
0,0.221429,0.171429,0.105,0.556619,0.386674,0.236282,0.278633,0.72,0.044901,0.571429,0
1,0.257143,0.428571,0.055,0.484566,0.581569,0.651173,0.107633,0.48,0.02415,0.47619,0
2,0.278571,0.335714,0.24,0.679682,0.937907,0.497556,0.110865,0.92,0.005817,0.809524,0
3,0.228571,0.271429,0.085,0.664956,0.464861,0.274635,0.184716,0.4,0.034151,0.619048,0
4,0.207143,0.114286,0.2,0.418485,0.920103,0.504563,0.305418,0.64,0.00415,0.904762,0


In [41]:
# Write the scaled features plus cluster labels to disk, without the index column.
data.to_csv("clustered_data.csv", index=False)