<a href="https://colab.research.google.com/github/Min-Thway-Htut/Machine-Learning/blob/master/k_neighbour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from sklearn.datasets import fetch_openml

In [7]:
data  = pd.read_csv('/content/sample_data/mnist_test.csv')

In [9]:
print(data.head())

   7  0  0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  ...  0.658  0.659  0.660  \
0  2  0    0    0    0    0    0    0    0    0  ...      0      0      0   
1  1  0    0    0    0    0    0    0    0    0  ...      0      0      0   
2  0  0    0    0    0    0    0    0    0    0  ...      0      0      0   
3  4  0    0    0    0    0    0    0    0    0  ...      0      0      0   
4  1  0    0    0    0    0    0    0    0    0  ...      0      0      0   

   0.661  0.662  0.663  0.664  0.665  0.666  0.667  
0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0  

[5 rows x 785 columns]


In [10]:
# Spliting dataset into training and test sets

from sklearn.model_selection import train_test_split

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=42)


In [12]:
# Data Exploration and Preprocessing

print(data.describe())
print(data.info())
print(y.value_counts())

X_train = X_train
X_test = X_test

                 7       0     0.1     0.2     0.3     0.4     0.5     0.6  \
count  9999.000000  9999.0  9999.0  9999.0  9999.0  9999.0  9999.0  9999.0   
mean      4.443144     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
std       2.895897     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
min       0.000000     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
25%       2.000000     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
50%       4.000000     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
75%       7.000000     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
max       9.000000     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

          0.7     0.8  ...        0.658        0.659        0.660  \
count  9999.0  9999.0  ...  9999.000000  9999.000000  9999.000000   
mean      0.0     0.0  ...     0.179318     0.163616     0.052605   
std       0.0     0.0  ...     5.674433     5.736359     2.420125   
min       0.0     0.0

In [13]:
#Feature Engineering

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [14]:
# Define a Grid of Hyperparameters

from sklearn.model_selection import GridSearchCV

param_grid_knn = {
    'n_neighbors': [3,5,7],
    'weights': ['uniform', 'distance']
}

param_grid_dt = {
    'max_depth' : [10,20,30],
    'min_samples_split': [2,5,10]
}

param_grid_svm = {
    'C': [0.1,1,10],
    'kernel': ['linear', 'rbf']
}

In [17]:
# Conduct Grid Search with K-Fold Cross-Validation

from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=cv, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)

dt = DecisionTreeClassifier()
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=cv, n_jobs=-1 )
grid_search_dt.fit(X_train, y_train)

svm = SVC()
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=cv, n_jobs=-1 )

print("Best KNN:", grid_search_knn.best_params_)
print("Best Decision Tree:", grid_search_dt.best_params_)


Best KNN: {'n_neighbors': 3, 'weights': 'uniform'}
Best Decision Tree: {'max_depth': 10, 'min_samples_split': 2}


In [22]:
# Train the best model
best_knn = grid_search_knn.best_estimator_
best_dt = grid_search_dt.best_estimator_

best_knn.fit(X_train, y_train)
best_dt.fit(X_train, y_train)

In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate KNN
y_pred_knn = best_knn.predict(X_test)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNN Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))

# Evaluate Decision Tree
y_pred_dt = best_dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))
print("Decision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))

\

KNN Accuracy: 1.0
KNN Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

KNN Confusion Matrix:
 [[2000]]
Decision Tree Accuracy: 1.0
Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2000

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Decision Tree Confusion Matrix:
 [[2000]]
