# ML1 Assignment 2 
# Notebook 1
### Musab - 29409

# Effects of Cross Validation (CV)

In [56]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

## Model Evaluation Functions

In [57]:
def evaluate_knn_regression(dataset, target_column, n_neighbors=5, cv=5,
                            test_size=0.2, random_state=3):
    """Print a KNN regressor's R2 on a single hold-out split and under CV.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature columns and the target column.
    target_column : str
        Name of the column in ``dataset`` to predict.
    n_neighbors : int, default 5
        Number of neighbours used by ``KNeighborsRegressor``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    random_state : int, default 3
        Seed for the hold-out split, so the printed score is reproducible.

    Returns
    -------
    None
        Both scores are printed rather than returned.
    """
    # Separate the predictors from the column being predicted.
    X = dataset.drop(columns=[target_column])
    y = dataset[target_column]

    # Single shuffled hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    knn = KNeighborsRegressor(n_neighbors=n_neighbors)

    # Score 1: fit once on the training part, evaluate on the held-out part.
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    r2 = r2_score(y_test, predictions)
    print(f'R2 score without CV: {r2}')

    # Score 2: cross-validation over the full dataset. cross_val_score fits
    # clones of the estimator, so the fit above does not influence it.
    # With an integer `cv` and a regressor the folds are consecutive,
    # unshuffled KFold blocks, whereas train_test_split above shuffles; if the
    # rows are stored in a meaningful order the two scores are therefore not
    # computed on comparably-constructed splits.
    scores_r2 = cross_val_score(knn, X, y, cv=cv, scoring='r2')
    avg_r2 = scores_r2.mean()
    print(f'Average R2 score with CV: {avg_r2}')

In [58]:
def evaluate_knn_classification(dataset, target_column, n_neighbors=5, cv=5,
                                test_size=0.2, random_state=3):
    """Print a KNN classifier's accuracy on a single hold-out split and under CV.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding the feature columns and the target column.
    target_column : str
        Name of the column in ``dataset`` holding the class labels.
    n_neighbors : int, default 5
        Number of neighbours used by ``KNeighborsClassifier``.
    cv : int or cross-validation splitter, default 5
        Forwarded unchanged to ``cross_val_score``.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    random_state : int, default 3
        Seed for the hold-out split, so the printed score is reproducible.

    Returns
    -------
    None
        Both scores are printed rather than returned.
    """
    # Work on plain NumPy arrays: predictors and labels.
    X = dataset.drop(columns=[target_column]).values
    y = dataset[target_column].values

    # Single shuffled hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Score 1: fit once on the training part, evaluate on the held-out part.
    knn.fit(X_train, y_train)
    predictions = knn.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy without CV: {accuracy}')

    # Score 2: cross-validation over the full dataset. cross_val_score fits
    # clones of the estimator, so the fit above does not influence it.
    # With an integer `cv` and a classifier on binary/multiclass labels the
    # folds are stratified by class (unshuffled StratifiedKFold).
    scores = cross_val_score(knn, X, y, cv=cv, scoring='accuracy')
    avg_accuracy = scores.mean()
    print(f'Average accuracy with CV: {avg_accuracy}')

## Regression Dataset: Automobile Prices

In [59]:
# Names assigned to the columns of the imports-85 data file when it is read;
# the last entry, 'price', is the regression target used later on.
column_names = [
    'symboling', 'normalized_losses',
    'make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style',
    'drive_wheels', 'engine_location',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'engine_type', 'num_of_cylinders', 'engine_size', 'fuel_system',
    'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg',
    'price',
]

# Absolute location of the raw data file on the author's machine.
# NOTE(review): adjust this path before running the notebook elsewhere.
filename = r'E:\IBA\Semester 1\Machine Learning 1\Assignment\Assignment2\automobile\imports-85.data'

In [60]:
# Human-readable label for this dataset.
dataset1_name = 'Auto Mobile Dataset'

# Column names are supplied explicitly (header=None means no row of the file
# is treated as a header); '?' entries are parsed as missing values (NaN).
df = pd.read_csv(
    filename,
    sep=',',
    header=None,
    names=column_names,
    na_values='?',
)

In [61]:
# Create a label encoder object. The same object is reused for every column;
# fit_transform refits it each time, so after the loop it only remembers the
# classes of the last column processed.
le = LabelEncoder()

# List of categorical columns to convert to integer codes
categorical_cols = ['make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style',
                    'drive_wheels', 'engine_location', 'engine_type', 'num_of_cylinders', 'fuel_system']

# Apply Label Encoder on each of the categorical columns.
# astype(str) is applied first so every entry is a string; presumably missing
# entries become the literal string 'nan' and get their own integer code.
for col in categorical_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Replace the remaining NaN values with 0.
# NOTE(review): this is applied to every column, so any missing value in the
# `price` target would also become 0 - confirm that this is intended.
df = df.fillna(0)

# Display a summary of the DataFrame (column dtypes and non-null counts).
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print(), which would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized_losses  205 non-null    float64
 2   make               205 non-null    int32  
 3   fuel_type          205 non-null    int32  
 4   aspiration         205 non-null    int32  
 5   num_of_doors       205 non-null    int32  
 6   body_style         205 non-null    int32  
 7   drive_wheels       205 non-null    int32  
 8   engine_location    205 non-null    int32  
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64  
 14  engine_type        205 non-null    int32  
 15  num_of_cylinders   205 non-null    int32  
 16  engine_size        205 non

In [62]:
# Compare the hold-out R2 with the cross-validated R2 for a KNN regressor
# predicting `price`, using the function defaults (n_neighbors=5, cv=5).
evaluate_knn_regression(df, 'price')

R2 score without CV: 0.8866477610504466
Average R2 score with CV: 0.2595634558218654


### Interpretation for Regression Dataset

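As we can see, the goodness of fit drops drastically under cross-validation: R² is about 0.89 on the single train/test split but only about 0.26 when averaged over 5 folds. This means the single-split score was optimistic and the model's performance is not stable when the data is partitioned differently, so the selected model is not fit for predictions on this dataset. Note that the single split is shuffled, whereas `cross_val_score` with an integer `cv` uses consecutive, unshuffled folds for a regressor, so part of the gap may come from the order in which the rows are stored.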

## Classification Dataset: Student Dropout

In [63]:
# Human-readable label for this dataset.
dataset2_name = 'Student Drop Out Dataset'

# Absolute location of the CSV on the author's machine.
# NOTE(review): adjust this path before running the notebook elsewhere.
filename2 = r'E:\IBA\Semester 1\Machine Learning 1\Assignment\Assignment2\student_dropout_data.csv'

# The file is semicolon-separated.
df2 = pd.read_csv(filename2, sep=';')

In [64]:
# Fresh encoder for the class labels of the student dataset.
le = LabelEncoder()

# Replace the 'Target' labels with their integer codes.
encoded_target = le.fit_transform(df2['Target'])
df2['Target'] = encoded_target

# Lookup from integer code back to the original label; a label's position in
# le.classes_ is the code assigned to it.
target_mapping = dict(enumerate(le.classes_))

In [65]:
# Print a summary of the student DataFrame: column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [66]:
# Compare the hold-out accuracy with the cross-validated accuracy for a KNN
# classifier predicting `Target`, using the function defaults (n_neighbors=5, cv=5).
evaluate_knn_classification(df2, 'Target')

Accuracy without CV: 0.6135593220338983
Average accuracy with CV: 0.5978740700974001


### Interpretation for Classification Dataset

Here we can see that the average accuracy with CV is quite close to the accuracy without CV. This means that our model is performing consistently over different folds of the dataset, so a similar level of performance is likely in live testing. However, the accuracy itself is quite low, so we should consider changing to an algorithm with better prediction capability.