In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# ---------------------------------------------------------------------------
# KNN classification of the Social Network Ads data.
#
# Pipeline: load CSV -> drop the ID column -> encode 'Gender' -> split into
# train/test -> standardize using statistics from the training fold only ->
# fit a K=5 nearest-neighbours classifier -> report metrics on the test fold.
# ---------------------------------------------------------------------------

# Load dataset
dataset = pd.read_csv('./datasets/Social_Network_Ads.csv')

# View basic details.
# DataFrame.info() writes its summary directly and returns None, so it must
# not be wrapped in print() (that would emit a stray "None" line).
dataset.info()
print(dataset.head())

# Drop the identifier column (it carries no predictive signal) and turn the
# string-valued 'Gender' column into integer codes so it can be used by a
# distance-based model.
dataset = dataset.drop(columns=['User ID'])
dataset['Gender'] = LabelEncoder().fit_transform(dataset['Gender'])

# Feature and target variables: the last column is the target, every other
# remaining column is a feature.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Split dataset into training and test sets (75% training, 25% test).
# The split happens BEFORE scaling so that no test-set statistics can leak
# into the preprocessing step.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Standardize the feature variables. The scaler learns mean/variance from the
# training fold only; the test fold is transformed with those same parameters,
# exactly as unseen data would be at prediction time.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training-fold statistics. Kept so that
# any later code referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Check the sizes of the splits
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# KNN classifier with K=5
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predict on test set
y_pred = knn_classifier.predict(X_test)

# Evaluation metrics: rows of the confusion matrix are true classes, columns
# are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Classification report (includes accuracy, precision, recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB
None
    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0

Training set size: 300
Test set size: 100

Confusion Matrix:
[[58  5]
 [ 3 34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.92      0.94    