# Assignment2 - Supervised Learning flow

# Part 1 - Student details:
* Please write the first name and the last 4 digits of the ID of each student. For example:
<pre>Israel 9812</pre>

In [83]:
# student 1: Almog 1460
# student 2: Michael 0027

## Part 2 - Initial Preparations 
You may add as many code cells as needed.

In [113]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import seaborn as sns

# Show the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to display several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [114]:
def load_dataset(file_name, category_col_name):
    """Read a CSV file and separate the label column from the features.

    Parameters
    ----------
    file_name : str or file-like
        Source of the CSV data; handed unchanged to ``pd.read_csv``.
    category_col_name : str
        Name of the column that holds the class label.

    Returns
    -------
    tuple
        ``(df, X, y)`` where ``df`` is the frame exactly as read, ``X`` is
        ``df`` without the label column and ``y`` is the label column.
    """
    full_frame = pd.read_csv(file_name)
    labels = full_frame[category_col_name]
    features = full_frame.drop(columns=category_col_name)
    return full_frame, features, labels

In [None]:
# Load the training dataset and split it into features (train_X) and labels (train_y)
train_file_name = 'wine_train.csv'
category_col_name = 'target'  # label column; also used when loading the test set

train_df, train_X, train_y = load_dataset(train_file_name, category_col_name)
train_X.head()

In [None]:
# Load the test dataset, split on the same label column as the training set

test_file_name = 'wine_test.csv'
test_df, test_X, test_y = load_dataset(test_file_name, category_col_name)

test_X.head()

In [None]:
# Show the dtype of every column in the training and the test frames

train_df.dtypes
test_df.dtypes

In [128]:
# Function used to check for missing values in given dataframe

def checkMissingRows(df):
    """Print how many rows of ``df`` contain no missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    # A row is complete when none of its cells is missing; counting the
    # boolean mask avoids building a filtered copy of the frame.
    num_complete_rows = int(df.notna().all(axis=1).sum())
    total_rows = len(df)
    print(f"There are {num_complete_rows} entries out of {total_rows} containing all values")

In [129]:
# See ranges of different features, with outliers
def createPlotBoxplot(df, x_col, y_col):
    """Draw and show a box plot of ``y_col`` grouped by ``x_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    x_col : str
        Column placed on the x axis (the grouping variable).
    y_col : str
        Column whose distribution is drawn on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x=x_col, y=y_col, ax=ax)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_title(f'Box Plot of {y_col} by {x_col}')
    ax.grid(True)
    plt.show()

In [130]:
# check relationship between two numerical features
def createPlotScatter(df, column1, column2, target):
    """Draw and show a scatter plot of two columns, coloured by ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding all three columns.
    column1 : str
        Column plotted on the x axis.
    column2 : str
        Column plotted on the y axis.
    target : str
        Column whose values set the point colours; also the colour-bar label.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(df[column1], df[column2], c=df[target], alpha=0.7)
    plt.colorbar(points, ax=ax, label=target)
    ax.set_xlabel(column1)
    ax.set_ylabel(column2)
    ax.set_title(f'Scatter Plot of {column1} vs {column2}')
    ax.grid(True)
    plt.show()

In [None]:
# Explore the data: spread of one feature per class, the relationship between
# two features, and the completeness of both datasets.
# The label column is always referred to through category_col_name so the
# plots stay in sync with the column used when the data was loaded.

createPlotBoxplot(train_df, category_col_name, 'color_intensity')
createPlotScatter(train_df, 'color_intensity', 'hue', category_col_name)

checkMissingRows(train_df)
checkMissingRows(test_df)

## Part 3 - Experiments
You may add as many code cells as needed.

In [136]:
# Candidate pipelines evaluated by the grid search, keyed by model name.
# Every candidate min-max scales the features before fitting its classifier.
# The random forest is seeded (same seed as the final model in Part 4) so the
# cross-validation scores, and therefore the model selection, are
# reproducible from one run to the next.

pipelines = {
    'RandomForestClassifier': Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', RandomForestClassifier(random_state=42))
    ]),
    'SVC': Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', SVC())
    ]),
    'LogisticRegression': Pipeline([
        ('scaler', MinMaxScaler()),
        ('model', LogisticRegression())
    ])
}

In [137]:
# Hyperparameter values to search for each pipeline, keyed by the same model
# names as `pipelines`. Parameter names carry the 'model__' prefix because the
# classifier is the pipeline step named 'model'.

# Regularisation strengths searched for both SVC and LogisticRegression.
regularization_strengths = [0.1, 1, 2, 5, 7.5, 10]

param_grids = {
    'RandomForestClassifier': dict(
        # NOTE(review): 125 is listed before 100; the original order is kept
        # as-is -- confirm whether it is intentional.
        model__n_estimators=[25, 50, 125, 100, 150, 200],
        model__max_depth=[None, 5, 10, 15, 20, 25, 30],
    ),
    'SVC': dict(
        model__C=list(regularization_strengths),
        model__gamma=[0.01, 0.1, 1, 2, 3, 4, 5],
    ),
    'LogisticRegression': dict(
        model__C=list(regularization_strengths),
        model__solver=['liblinear'],
    ),
}

In [None]:
# Run a 5-fold GridSearchCV for every pipeline, choosing the hyperparameters
# with the highest macro-averaged F1 score, and collect one summary row per
# model in `results`.

results = []
for model_name, pipeline in pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='f1_macro')
    grid_search.fit(train_X, train_y)

    # best_score_ is the mean cross-validated score of the winning parameter
    # set, i.e. the very number the search used to pick it. Reporting it
    # directly avoids a second cross-validation pass whose result could
    # differ from the selection score for a randomised model.
    results.append({
        'Model': model_name,
        'Best Parameters': grid_search.best_params_,
        'Average CV Score': grid_search.best_score_
    })


In [None]:
# Convert the list of per-model result dicts to a pandas DataFrame (one row per model) and display it

results_df = pd.DataFrame(results)
results_df

## Part 4 - Training 
Use the best combination of feature engineering and model (algorithm and hyperparameters) found in the experiments part (Part 3).

In [154]:
# Select the model with the highest average CV score and rebuild its pipeline
# with the hyperparameters that won the grid search.

best_index = results_df['Average CV Score'].idxmax()

best_model_name = results_df.loc[best_index, 'Model']
best_parameters = results_df.loc[best_index, 'Best Parameters']

# One factory per candidate model. The random forest is seeded so the final
# model is reproducible.
final_model_factories = {
    "RandomForestClassifier": lambda: RandomForestClassifier(random_state=42),
    "SVC": SVC,
    "LogisticRegression": LogisticRegression,
}

# Fail here with a clear message instead of leaving best_permutation
# undefined for a model name that has no factory.
if best_model_name not in final_model_factories:
    raise ValueError(f"No final pipeline is defined for model {best_model_name!r}")

# The tuned values are keyed by their 'model__<param>' grid names, so
# set_params applies every one of them; nothing has to be copied by hand when
# a grid gains a new hyperparameter. The result is assigned in the same
# statement so the cell displays no output.
best_permutation = Pipeline([
    ('scaler', MinMaxScaler()),
    ('model', final_model_factories[best_model_name]())
]).set_params(**best_parameters)


## Part 5 - Apply on test and show model performance estimation

In [None]:
# Fit the selected pipeline (scaler + tuned model) on the full training set,
# then predict the labels of the test set.

best_permutation.fit(train_X, train_y)
prediction_y = best_permutation.predict(test_X)


In [None]:
# Evaluate the test-set predictions: accuracy score, confusion matrix and
# classification report.

val_accuracy_scaled = accuracy_score(y_true=test_y, y_pred=prediction_y)
accuracy_percent = val_accuracy_scaled * 100

print(f"Accuracy is {accuracy_percent}%!\n")
print(confusion_matrix(y_true=test_y, y_pred=prediction_y))
print(classification_report(y_true=test_y, y_pred=prediction_y))
