In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Load datasets
# `dataset` supplies the model inputs and labels; `dataset1` is only inspected below.
# NOTE(review): the shape and missing-value checks run on `dataset1`, not on the
# frame the features are taken from (`dataset`) — confirm that this is intended.
dataset = pd.read_csv('Social_Network_Ads.csv')
dataset1 = pd.read_csv('Social_Network_Adsnew.csv')
X = dataset.iloc[:, :-1].values  # every column except the last one -> features
y = dataset.iloc[:, -1].values   # last column -> labels passed to the classifiers

# Display the shape of the dataset
print("Number of rows:", dataset1.shape[0])
print("Number of columns:", dataset1.shape[1])

# Check no. of missing values in every column of dataset1.
# A bare expression in the middle of a cell is evaluated and then discarded, so the
# styled table has to be handed to display() explicitly in order to be rendered.
# (display() is injected into the namespace by the IPython/Jupyter kernel.)
display(pd.DataFrame(dataset1.isnull().sum()).T.style.background_gradient(cmap='Spectral_r'))

# Split the dataset into training and test sets (25% held out, fixed seed for
# a reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

print('The train dataset contains {:,d} rows & {} columns'.format(X_train.shape[0], X_train.shape[1]))
print('The test dataset contains {:,d} rows & {} columns'.format(X_test.shape[0], X_test.shape[1]))

# Feature scaling
# The scaler is fitted on the training split only and then re-used on the test
# split, so no statistics from the test data are used when fitting.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train)
print(y_train)
print(X_test)
print(y_test)

# Function to print misclassified points
def print_misclassified(y_test, y_pred, X_test, model_name):
    """Print every test sample whose predicted label differs from its true label.

    One header line is printed, followed by one line per misclassified sample
    giving its positional index, true label, predicted label and feature row.

    Args:
        y_test: True labels (NumPy array, list or pandas Series).
        y_pred: Predicted labels with the same shape as ``y_test``.
        X_test: Feature rows aligned positionally with ``y_test``. NumPy arrays
            and pandas objects are indexed by position.
        model_name: Name of the model, shown in the header line.

    Raises:
        ValueError: If ``y_test`` and ``y_pred`` do not have the same shape.
    """
    # Coerce the labels to arrays so that the comparison is element-wise (plain
    # lists would compare as a single bool) and so that ``[i]`` is a positional
    # lookup (a pandas Series would otherwise look ``i`` up as an index label).
    true_labels = np.asarray(y_test)
    predicted_labels = np.asarray(y_pred)
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"y_test and y_pred must have the same shape, got "
            f"{true_labels.shape} and {predicted_labels.shape}"
        )

    # pandas objects index by label/column with ``[i]``; switch to positional rows.
    features = X_test.to_numpy() if hasattr(X_test, "to_numpy") else X_test

    misclassified = np.where(true_labels != predicted_labels)
    print(f"Misclassified points for {model_name}:")
    for i in misclassified[0]:
        print(f"Index: {i}, True Label: {true_labels[i]}, Predicted Label: {predicted_labels[i]}, Features: {features[i]}")

# --- K-Nearest Neighbors: 5 neighbours, Minkowski metric with p=2 ---
knn_classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
y_pred_knn = knn_classifier.fit(X_train, y_train).predict(X_test)
print_misclassified(y_test, y_pred_knn, X_test, "K-Nearest Neighbors")

# --- Decision Tree: entropy criterion, fixed seed ---
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
y_pred_dt = dt_classifier.fit(X_train, y_train).predict(X_test)
print_misclassified(y_test, y_pred_dt, X_test, "Decision Tree")

# --- Random Forest: 100 entropy trees, fixed seed ---
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
y_pred_rf = rf_classifier.fit(X_train, y_train).predict(X_test)
print_misclassified(y_test, y_pred_rf, X_test, "Random Forest")

# Evaluate the models: score every set of predictions against the held-out
# labels first, then report the confusion matrix and accuracy model by model.
cm_knn, cm_dt, cm_rf = (
    confusion_matrix(y_test, predicted)
    for predicted in (y_pred_knn, y_pred_dt, y_pred_rf)
)
accuracy_knn, accuracy_dt, accuracy_rf = (
    accuracy_score(y_test, predicted)
    for predicted in (y_pred_knn, y_pred_dt, y_pred_rf)
)

print("KNN Confusion Matrix:\n", cm_knn)
print("KNN Accuracy:", accuracy_knn)

print("Decision Tree Confusion Matrix:\n", cm_dt)
print("Decision Tree Accuracy:", accuracy_dt)

print("Random Forest Confusion Matrix:\n", cm_rf)
print("Random Forest Accuracy:", accuracy_rf)


Number of rows: 400
Number of columns: 5
The train dataset contains 300 rows & 2 columns
The test dataset contains 100 rows & 2 columns
[[ 0.58164944 -0.88670699]
 [-0.60673761  1.46173768]
 [-0.01254409 -0.5677824 ]
 [-0.60673761  1.89663484]
 [ 1.37390747 -1.40858358]
 [ 1.47293972  0.99784738]
 [ 0.08648817 -0.79972756]
 [-0.01254409 -0.24885782]
 [-0.21060859 -0.5677824 ]
 [-0.21060859 -0.19087153]
 [-0.30964085 -1.29261101]
 [-0.30964085 -0.5677824 ]
 [ 0.38358493  0.09905991]
 [ 0.8787462  -0.59677555]
 [ 2.06713324 -1.17663843]
 [ 1.07681071 -0.13288524]
 [ 0.68068169  1.78066227]
 [-0.70576986  0.56295021]
 [ 0.77971394  0.35999821]
 [ 0.8787462  -0.53878926]
 [-1.20093113 -1.58254245]
 [ 2.1661655   0.93986109]
 [-0.01254409  1.22979253]
 [ 0.18552042  1.08482681]
 [ 0.38358493 -0.48080297]
 [-0.30964085 -0.30684411]
 [ 0.97777845 -0.8287207 ]
 [ 0.97777845  1.8676417 ]
 [-0.01254409  1.25878567]
 [-0.90383437  2.27354572]
 [-1.20093113 -1.58254245]
 [ 2.1661655  -0.79972756]
