## Import Classification Libraries and Load the Clustered, Processed Data

In [23]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

## Data Checking and Splitting for Training

In [11]:
# Load the clustered dataset; the path is relative to the notebook's working
# directory.
clustered_csv = "../data/clustered_Data.csv"
data = pd.read_csv(clustered_csv)

In [13]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# index instead of inserting it as a new column.
data = data.reset_index(drop=True)

In [14]:
# Preview the first rows to sanity-check the loaded columns and values.
data.head()

Unnamed: 0,PdDistrict,X,Y,Hour,Month,day,n_days,Clusters
0,5,-0.871797,-0.055218,0.241546,1.331962,-0.066074,-0.918547,0
1,4,-0.266478,1.280572,0.394109,-0.126598,-0.179966,1.447257,0
2,0,1.905681,-1.337436,-0.368703,1.331962,0.958949,1.570342,0
3,5,-1.939273,0.011803,0.546671,-1.001734,-0.749423,1.37401,0
4,7,1.152978,0.220264,-2.046887,-0.126598,-0.863315,0.06387,1


In [15]:
# Show the column names available in the loaded frame.
data.columns

Index(['PdDistrict', 'X', 'Y', 'Hour', 'Month', 'day', 'n_days', 'Clusters'], dtype='object')

In [16]:
# Separate the label column from the model inputs.
# NOTE: despite their names, `Test` is the full target vector ('Clusters') and
# `Train` is the full feature matrix (every other column) -- neither is a
# train/test partition yet.
Test = data.loc[:, 'Clusters']
Train = data.drop('Clusters', axis=1)

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    Train,
    Test,
    test_size=0.2,
    random_state=42,
)

## Classification Models

### First: Using a Decision Tree

In [19]:
from sklearn.model_selection import GridSearchCV

# Base estimator. The fixed seed makes the random feature subsampling used by
# max_features='sqrt'/'log2' repeatable between runs.
tree_clf = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid, searched exhaustively.
# 'log_loss' is deliberately not listed under 'criterion': scikit-learn treats
# it as an alias of 'entropy' (both are the Shannon information gain), so
# including it only repeats every 'entropy' fit without changing the outcome.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [10, 50, 100],
    'min_samples_leaf': [5, 20, 50],
    'max_features': [None, 'sqrt', 'log2'],
}

# 5-fold cross-validated search. Accuracy is already the default score for a
# classifier; it is spelled out (and the folds run in parallel) to match the
# random-forest search further down.
grid_search = GridSearchCV(
    tree_clf,
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(x_train, y_train)

In [32]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

# Score the tuned decision tree on the held-out split. GridSearchCV.predict
# delegates to the best estimator refitted on the whole training set.
predict_class = grid_search.predict(x_test)

accuracy = accuracy_score(y_test, predict_class)
print("Test Accuracy:", accuracy)

# Print predictions and ground truth side by side; the second label used to
# read "Predicted classes:" as well, although it prints the true labels.
print("Predicted classes:")
print(predict_class)
print("Actual classes:")
print(np.array(y_test))

Test Accuracy: 0.976875
Predicted classes:
[2 0 2 ... 1 1 1]
Predicted classes:
[2 0 2 ... 1 1 1]


### Second: Using a Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator; the fixed seed keeps bootstrap sampling and per-split feature
# sampling repeatable between runs.
rnd_clf = RandomForestClassifier(random_state=42)

# Hyper-parameter grid, searched exhaustively.
# 'auto' is deliberately not listed under 'max_features': for classifiers it
# was a synonym of 'sqrt', it was deprecated in scikit-learn 1.1 and removed in
# 1.3. Keeping it either doubles the search for no gain (old versions) or makes
# half of the candidate fits fail (new versions).
param_grid = {
    'n_estimators': [50, 100, 400],
    'max_depth': [None, 10, 50, 70],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False],
}

# 5-fold cross-validated search on accuracy, using every available core.
# NOTE: this rebinds `grid_search`, replacing the decision-tree search object.
grid_search = GridSearchCV(rnd_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Best random forest, refitted on the full training split. The historical name
# says "tree", but this is the forest selected by the search above.
best_tree_model = grid_search.best_estimator_

Best parameters: {'bootstrap': True, 'max_depth': 70, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 400}


In [34]:
# Score the tuned random forest on the held-out split. GridSearchCV.predict
# delegates to the best estimator refitted on the whole training set.
predict_class = grid_search.predict(x_test)

accuracy = accuracy_score(y_test, predict_class)
print("Test Accuracy:", accuracy)

# Print predictions and ground truth side by side; the second label used to
# read "Predicted classes:" as well, although it prints the true labels.
print("Predicted classes:")
print(predict_class)
print("Actual classes:")
print(np.array(y_test))

Test Accuracy: 0.988
Predicted classes:
[2 0 2 ... 1 1 1]
Predicted classes:
[2 0 2 ... 1 1 1]


In [None]:
# Confusion matrix of the tuned random forest on the held-out test split.
# The predictions are computed here from the refitted best estimator:
# `y_pred_rf_test` was previously referenced without ever being assigned, so
# the cell failed with a NameError.
y_pred_rf_test = best_tree_model.predict(x_test)

# Pin the row/column order to the classes the model was fitted on, so the tick
# labels stay correct even if the cluster ids are not exactly 0..k-1.
class_ids = best_tree_model.classes_
cm = confusion_matrix(y_test, y_pred_rf_test, labels=class_ids)
labels = [f"Cluster {class_id}" for class_id in class_ids]

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
    linewidths=0.5,
    linecolor='gray',
    ax=ax,
)
ax.set_title("Confusion Matrix - Random Forest (Test Data)", fontsize=14, weight='bold')
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()