In [17]:
# Assignment (4):

# The task involves:
# Each of us will search for a dataset and apply the 3 classifiers we studied together on it:
# - Decision Tree (DT)
# - K-Nearest Neighbors (KNN)
# - Naive Bayes (NB)
# In the end, we will calculate the accuracy for each model (we need to research how to calculate accuracy).

# Important Packages 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

# Load dataset
cars_ds = pd.read_csv('cars.csv', delimiter=';')  # Adjust the delimiter as needed
print("Initial data preview:")
print(cars_ds.head())

# Assume the last column is the target and the rest are features
X = cars_ds.iloc[:, :-1]  # Features
y = cars_ds.iloc[:, -1]   # Target

# Identify categorical columns (exclude numeric columns)
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Create a column transformer with OneHotEncoder for categorical data
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features)
    ],
    remainder='passthrough'
)

# Apply transformation to feature data
X_encoded = column_transformer.fit_transform(X)

# Convert sparse matrix to dense if necessary for GaussianNB
X_dense = X_encoded.toarray()

# Count each class frequency and filter out classes with less than 2 instances
class_counts = pd.value_counts(y)
y = pd.Series(y)
X_filtered = X_dense[y.isin(class_counts[class_counts > 1].index)]
y_filtered = y[y.isin(class_counts[class_counts > 1].index)]

# Splitting the filtered data with stratification if possible
if len(y_filtered.unique()) > 1:  # Ensure there are at least two classes for stratification
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.20, random_state=0, stratify=y_filtered)
else:
    X_train, X_test, y_train, y_test = train_test_split(X_filtered, y_filtered, test_size=0.20, random_state=0)

print("----------------------------------------------------------------------------")

# Decision Tree Classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)
dt_pred = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_classification_report = classification_report(y_test, dt_pred)
print("\nDecision Tree Classifier:")
print("Accuracy:", dt_accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_pred))
print("Classification Report:\n", dt_classification_report)

print("----------------------------------------------------------------------------")

# K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
knn_pred = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_pred)
knn_classification_report = classification_report(y_test, knn_pred)
print("\nK-Nearest Neighbors Classifier:")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, knn_pred))
print("Classification Report:\n", knn_classification_report)

print("----------------------------------------------------------------------------")

# Gaussian Naive Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
nb_pred = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_classification_report = classification_report(y_test, nb_pred)
print("\nGaussian Naive Bayes Classifier:")
print("Accuracy:", nb_accuracy)
print("Confusion Matrix:\n", confusion_matrix(y_test, nb_pred))
print("Classification Report:\n", nb_classification_report)


Initial data preview:
                         Car     MPG Cylinders Displacement Horsepower  \
0                     STRING  DOUBLE       INT       DOUBLE     DOUBLE   
1  Chevrolet Chevelle Malibu    18.0         8        307.0      130.0   
2          Buick Skylark 320    15.0         8        350.0      165.0   
3         Plymouth Satellite    18.0         8        318.0      150.0   
4              AMC Rebel SST    16.0         8        304.0      150.0   

   Weight Acceleration Model Origin  
0  DOUBLE       DOUBLE   INT    CAT  
1   3504.         12.0    70     US  
2   3693.         11.5    70     US  
3   3436.         11.0    70     US  
4   3433.         12.0    70     US  
----------------------------------------------------------------------------

Decision Tree Classifier:
Accuracy: 0.8170731707317073
Confusion Matrix:
 [[10  4  1]
 [ 6  9  1]
 [ 1  2 48]]
Classification Report:
               precision    recall  f1-score   support

      Europe       0.59      0.67    