In [283]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, average_precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import tree
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.metrics import auc
from sklearn.calibration import LabelEncoder
import re

In [284]:
# Data loading

dataset_name = "glass.csv"

root_path = "https://raw.githubusercontent.com/matzim95/ML-datasets/master/"
path_to_data = root_path + dataset_name
df = pd.read_csv(path_to_data)

In [285]:
df = df.sample(frac=1)
df.head()

Unnamed: 0,ID,refractive index,Sodium,Magnesium,Aluminum,Silicon,Potassium,Calcium,Barium,Iron,Type
119,120,1.51652,13.56,3.57,1.47,72.45,0.64,7.96,0.0,0.0,building_windows_non_float_processed
87,88,1.51645,13.4,3.49,1.52,72.65,0.67,8.08,0.0,0.1,building_windows_non_float_processed
203,204,1.51658,14.8,0.0,1.99,73.11,0.0,8.28,1.71,0.0,headlamps
77,78,1.51627,13.0,3.58,1.54,72.83,0.61,8.04,0.0,0.0,building_windows_non_float_processed
75,76,1.5159,13.02,3.58,1.51,73.12,0.69,7.96,0.0,0.0,building_windows_non_float_processed


In [286]:
# Delete all-unique ID columns

def is_column_unique(col):
    return col.nunique() == len(col)

unique_columns = df.apply(is_column_unique, axis=0)
unique_columns_list = unique_columns[unique_columns].index.tolist()

while len(unique_columns_list) > 0:
    id_present = ''
    id_present = input(f'Found columns with all unique values: {unique_columns_list}. Does the list contain ID column/s? (y/n)')
    
    if id_present == 'y':
        if len(unique_columns_list) == 1:
            df.drop(unique_columns_list[0], axis = 1, inplace = True)
            print(f"Column {unique_columns_list[0]} has been removed from analysis.")
        else:    
            id_col_name = input(str((f"Please select the id column, so that it is deleted from analysis: {unique_columns_list}.")))
            df.drop(id_col_name, axis = 1, inplace = True)
            print(f"Column {id_col_name} has been removed from analysis")
    else: break
    unique_columns = df.apply(is_column_unique, axis=0)
    unique_columns_list = unique_columns[unique_columns].index.tolist()




print("Columns with all unique values:")
print(unique_columns_list)

Column ID has been removed from analysis.
Columns with all unique values:
[]


In [287]:
print("The list of available columns:", list(df.columns))

label_col = str(input(f'The dataframe has following columns: {list(df.columns)}.\nSelect the labels (Y) column: '))

The list of available columns: ['refractive index', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon', 'Potassium', 'Calcium', 'Barium', 'Iron', 'Type']


In [288]:
# Dropping NAs

# List of common missing value expressions
missing_values = ['?', 'n/a', 'NA', "nan", 'null', '-', '']

# Replace missing value expressions with NaN
df.replace(missing_values, np.nan, inplace=True)

# Count the number of rows before dropping
total_rows_before = df.shape[0]

# Drop rows with any NaN values
df_dropped = df.dropna()

# Count the number of rows after dropping
total_rows_after = df_dropped.shape[0]

# Calculate the number of rows deleted and its percentage
rows_deleted = total_rows_before - total_rows_after
percentage_deleted = (rows_deleted / total_rows_before) * 100

dropping = ''
if rows_deleted > 0:
    dropping = input(F"Warning! The database contains {rows_deleted} rows with MISSING VALUES. It makes {percentage_deleted:.2f}% of the database.\nWould you like to delete these rows? (y/n)")
    if dropping == 'y':
        df = df_dropped.copy()
        print(f"Number of rows deleted: {rows_deleted}")
        print(f"Percentage of database deleted: {percentage_deleted:.2f}%")
    else:
        print('Proceeding without dropping the missing values. The missing values are replaced with zeros.')
        df.replace(np.nan, 0, inplace=True)
else: print("No missing values found in the database.")

No missing values found in the database.


In [289]:
# Dropping duplicates

# Count the number of duplicate rows before dropping
total_duplicates_before = df.duplicated().sum()

# Drop duplicate rows
df_deduplicated = df.drop_duplicates()

# Count the number of rows after dropping duplicates
total_duplicates_after = df.shape[0] - df_deduplicated.shape[0]

# Calculate the number of duplicate rows deleted and its percentage
if total_duplicates_before > 0:
    duplicates_deleted = total_duplicates_before - total_duplicates_after
    percentage_duplicates_deleted = (duplicates_deleted / total_duplicates_before) * 100
else:
    duplicates_deleted = 0
    percentage_duplicates_deleted = 0

dropping = ''
if duplicates_deleted > 0:
    dropping = input(f"Warning! The database contains {duplicates_deleted} DUPLICATE ROWS. It makes {percentage_duplicates_deleted:.2f}% of the database.\nWould you like to delete these rows? (y/n)")
    if dropping == 'y':
        df = df_deduplicated.copy()
        print(f"Number of duplicate rows deleted: {duplicates_deleted}")
        print(f"Percentage of duplicates deleted: {percentage_duplicates_deleted:.2f}%")
    else:
        print('Proceeding without dropping duplicate rows.')
else:
    print('No duplicate rows found in the database.')



No duplicate rows found in the database.


In [290]:
# Identify columns containing only integers
integer_columns = []
for col in df.columns:
    if df[col].apply(lambda x: isinstance(x, (int, bool, str)) and re.match(r'^-?\d+$', str(x)) is not None).all():
        integer_columns.append(col)

# Convert identified columns to numeric dtype
for col in integer_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Print updated dtypes
print(df.dtypes)

print(integer_columns)

refractive index    float64
Sodium              float64
Magnesium           float64
Aluminum            float64
Silicon             float64
Potassium           float64
Calcium             float64
Barium              float64
Iron                float64
Type                 object
dtype: object
[]


In [291]:
X = df.drop(label_col, axis=1).copy()
y = df[label_col]
label_list = list(y.unique())

In [292]:
# X encoding

threshold_percentage = 0.9  # 90%
unique_values_threshold = 6

# Step 1: Attempt to convert columns with mixed types to numeric where possible
def try_convert_to_numeric(column):
    return pd.to_numeric(column, errors='coerce')

# Step 2: Determine whether a column is numeric based on the threshold
def is_numeric_column(column, threshold_percentage):
    column_numeric = try_convert_to_numeric(column)
    num_numeric = column_numeric.notna().sum()
    return (num_numeric / len(column)) >= threshold_percentage

# Step 3: Classify columns and drop non-numeric values from numeric columns
numeric_columns = []
categorical_columns = []

for col in X.columns:
    if is_numeric_column(X[col], threshold_percentage):
        column_numeric = try_convert_to_numeric(X[col])
        if column_numeric.nunique() < unique_values_threshold:
            categorical_columns.append(col)
        else:
            numeric_columns.append(col)
            # Drop non-numeric values from numeric columns
            X[col] = column_numeric
    else:
        categorical_columns.append(col)

# Print the columns
print("Numeric Columns:")
print(numeric_columns)

print("\nCategorical Columns:")
print(categorical_columns)


Numeric Columns:
['refractive index', 'Sodium', 'Magnesium', 'Aluminum', 'Silicon', 'Potassium', 'Calcium', 'Barium', 'Iron']

Categorical Columns:
[]


In [293]:
categorical_columns_toformat = []
threshold_categories = 15
for col in categorical_columns:
    if len(list(X[col].unique())) > threshold_categories:
        categorical_columns_toformat.append(col)

if categorical_columns_toformat:
    dropping = ''
    dropping = input(f"Warning! Found {len(categorical_columns_toformat)} categorical columns where the number of categories is higher than {threshold_categories}: {categorical_columns_toformat}. You may need to check the formatting of the columns. Drop the columns for the analysis? (y/n)")
    if dropping == 'y':
        X.drop(categorical_columns_toformat, axis=1, inplace=True)

        numeric_columns = []
        categorical_columns = []

        for col in X.columns:
            if is_numeric_column(X[col], threshold_percentage):
                column_numeric = try_convert_to_numeric(X[col])
                if column_numeric.nunique() < unique_values_threshold:
                    categorical_columns.append(col)
                else:
                    numeric_columns.append(col)
                    # Drop non-numeric values from numeric columns
                    X[col] = column_numeric
            else:
                categorical_columns.append(col)

        # Print the columns
        print("Numeric Column after dropping:")
        print(numeric_columns)

        print("\nCategorical Columns after dropping:")
        print(categorical_columns)


In [294]:
# Encoding categorical variables

if categorical_columns:
    print(X[categorical_columns].dtypes)
    print("\n")
    integer_dtypes = [int, np.int8, np.int16, np.int32, np.int64, 
                    np.uint, np.uint8, np.uint16, np.uint32, np.uint64,
                    float, np.float16, np.float32, np.float64]

    for col in categorical_columns:
        if X[col].dtype not in integer_dtypes:
            # Step 1: Initialize and fit the LabelEncoder
            original_labels = X[col].unique()
            encoder = LabelEncoder()
            X[col] = encoder.fit_transform(X[col])

            # Step 2: Extract the original labels and their corresponding encoded values
            
            encoded_values = list(range(len(original_labels)))

            # Step 3: Create and display the reference table
            reference_table = dict(zip(original_labels, encoded_values))

            print(f"The values of column {col} are not numerical and have been encoded.\nReference Table of {col}:")
            for label, encoded in reference_table.items():
                print(f"{label} -> {encoded}")
            print('\n')
        else:
            print(f'The labels of column {col} are numerical.')



In [295]:
scaling = ''
scaling = input("Scale the numerical data using the StandardScaler? (y/n)")
if scaling == "y":
    scaler = StandardScaler()
    scaler.fit(X[numeric_columns])
    X[numeric_columns] = scaler.transform(X[numeric_columns])
    print("Numerical data has been normalized.")


Numerical data has been normalized.


In [296]:
# Checking for label balance

try:
    label_counts = pd.concat([y.value_counts(), 
                y.value_counts(normalize=True).mul(100).round(2)],axis=1, keys=('counts','percentage'))
except:
    print("Error, check the labels column.")

unbalanced = False
label_low = {}
for i in range(len(label_list)):
    if label_counts['percentage'].iloc[i] <= 10:
        label_low[label_list[i]] = label_counts['percentage'].iloc[i]
        unbalanced = True

if unbalanced:
    print('Warning! The dataset is unbalanced in terms of labels!\n')
    for i in label_low:
        print(i,"makes",label_low[i],'% of the dataset.')
else:
    print('The dataset is balanced.')

print("\n",label_counts)


containers makes 7.94 % of the dataset.
building_windows_float_processed makes 6.07 % of the dataset.
tableware makes 4.21 % of the dataset.

                                       counts  percentage
Type                                                    
building_windows_non_float_processed      76       35.51
building_windows_float_processed          70       32.71
headlamps                                 29       13.55
vehicle_windows_float_processed           17        7.94
containers                                13        6.07
tableware                                  9        4.21


In [297]:
# Create a DataFrame from the array
y_types = pd.DataFrame({'values': y})

# Add a new column that contains the type of each entry
y_types['types'] = y_types['values'].apply(lambda x: type(x).__name__)

# Print the unique types
unique_types = y_types['types'].unique()
print("Unique types in the y array:")
print(unique_types)

Unique types in the y array:
['str']


In [298]:
# Labe; encoding
# Define the integer types
integer_dtypes = [int, np.int8, np.int16, np.int32, np.int64, 
                  np.uint, np.uint8, np.uint16, np.uint32, np.uint64]

# Check if the elements in y are integers
if not np.issubdtype(y.dtype, np.integer):
    # Step 1: Initialize and fit the LabelEncoder
    encoder = LabelEncoder()
    y_encoded = encoder.fit_transform(y)

    # Step 2: Extract the original labels and their corresponding encoded values
    original_labels = encoder.classes_
    encoded_values = list(range(len(original_labels)))

    # Step 3: Create and display the reference table
    reference_table = dict(zip(original_labels, encoded_values))

    print("The labels are not numerical and have been encoded.\n\nReference Table:")
    for label, encoded in reference_table.items():
        print(f"{label} -> {encoded}")

    # Assign encoded labels back to y
    y = y_encoded
else:
    print('The labels are numerical.')

The labels are not numerical and have been encoded.

Reference Table:
building_windows_float_processed -> 0
building_windows_non_float_processed -> 1
containers -> 2
headlamps -> 3
tableware -> 4
vehicle_windows_float_processed -> 5


In [299]:
unique_classes = np.unique(y)
num_unique_classes = len(unique_classes)

multiclass = False

if num_unique_classes == 2:
    print("The classification task is binary.")
elif num_unique_classes > 2:
    print("The classification task is multiclass.")
    multiclass = True
else:
    print("The classification task is not clearly defined (less than 2 unique classes).")

The classification task is multiclass.


In [300]:
X[label_col+" (Label)"] = y
df = X.copy()

In [301]:
categorical_columns.append(label_col + " (Label)")

for col in categorical_columns:
    df[col] = df[col].astype('category')

In [302]:
# Save the summary DataFrame to a CSV file
filename = input("Save the processed data? If yes, provide the name for csv file: ")

if filename:
    df.to_csv(f'{filename}.csv', index=False)
    print(f"Data saved as file: {filename}.csv")

Data saved as file: glass.csv


In [303]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [304]:
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)
metric = "f1"
multiclass_averaging = "macro"
if multiclass:
    metric = metric + "_" + multiclass_averaging

model_LR = LogisticRegression()

model_NB = GaussianNB()

model_LDA = LinearDiscriminantAnalysis()

model_QDA = QuadraticDiscriminantAnalysis()

model_KN = KNeighborsClassifier()
KN_grid = {
    "metric": ['euclidean', 'manhattan', 'cosine'],
    "n_neighbors": range(1, 31, 2)
    }

model_DT = DecisionTreeClassifier()
DT_grid={
        "max_depth": range(1,21)
        }

model_RF = RandomForestClassifier(random_state=42)
RF_grid={
    'n_estimators': [10, 50, 100, 200],
    "max_depth": range(2, 16, 2)
}

model_AB = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=42))
AB_grid={
    'n_estimators': [10, 50, 100, 200],
    "estimator__max_depth": range(1, 6)
}

model_GB = GradientBoostingClassifier(random_state=42)
GB_grid={
        'n_estimators': [10, 50, 100, 200],
        "max_depth": range(2, 6)
    }

model_SVC = SVC()
SVC_grid={
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'sigmoid', 'rbf', 'poly'],
        'gamma': [0.001, 0.01, 0.1, 1]
    }

model_grid_dict = {
    model_LR: {},  # No hyperparameters for grid search
    model_NB: {},  # No hyperparameters for grid search
    model_LDA: {},  # No hyperparameters for grid search
    model_QDA: {},  # No hyperparameters for grid search
    model_KN: KN_grid,
    model_DT: DT_grid,
    model_RF: RF_grid,
    model_AB: AB_grid,
    model_GB: GB_grid,
    model_SVC: SVC_grid
}

In [305]:
metric_mean = []
metric_error = []
model_names = []
model_params = []

In [306]:
def run_classificator(model, metric, param_grid = False):
    if param_grid:
        clf = GridSearchCV(
            estimator = model,
            param_grid = param_grid,
            cv=cv,
            scoring = metric
        )
        clf.fit(X, y)
        print(f"\n\nGrid search results for {model.__class__.__name__}:")
        print("\tScore of the best parameters:",round(np.max(clf.cv_results_["mean_test_score"]), 2))
        print("\tBest parameters:",clf.cv_results_["params"][np.argmax(clf.cv_results_["mean_test_score"])])

        best_params = clf.best_params_
        model.set_params(**best_params)
        model_params.append(str(best_params))
    else:
        model_params.append(np.nan)

    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = metric)

    print(f"{model.__class__.__name__} cross-validation mean score: {scores.mean().round(3)}+-{(1.96 * scores.std() / np.sqrt(len(scores))).round(3)}\n\n")

    metric_mean.append(scores.mean().round(3))
    metric_error.append((1.96 * scores.std() / np.sqrt(20)).round(3))
    model_names.append(f'{model.__class__.__name__}')


In [307]:
warnings.filterwarnings("ignore")

for model, grid in tqdm(model_grid_dict.items()):
    run_classificator(model=model, metric=metric, param_grid=grid)

warnings.resetwarnings()


 20%|██        | 2/10 [00:00<00:01,  4.30it/s]

LogisticRegression cross-validation mean score: 0.868+-0.056


GaussianNB cross-validation mean score: 1.0+-0.0




 40%|████      | 4/10 [00:00<00:01,  5.98it/s]

LinearDiscriminantAnalysis cross-validation mean score: 1.0+-0.0


QuadraticDiscriminantAnalysis cross-validation mean score: 0.271+-0.185




 50%|█████     | 5/10 [00:07<00:11,  2.38s/it]



Grid search results for KNeighborsClassifier:
	Score of the best parameters: 0.93
	Best parameters: {'metric': 'manhattan', 'n_neighbors': 5}
KNeighborsClassifier cross-validation mean score: 0.895+-0.046




 60%|██████    | 6/10 [00:09<00:08,  2.25s/it]



Grid search results for DecisionTreeClassifier:
	Score of the best parameters: 0.99
	Best parameters: {'max_depth': 7}
DecisionTreeClassifier cross-validation mean score: 0.969+-0.043




Grid search results for RandomForestClassifier:
	Score of the best parameters: 0.97
	Best parameters: {'max_depth': 6, 'n_estimators': 200}


 70%|███████   | 7/10 [01:35<01:29, 29.78s/it]

RandomForestClassifier cross-validation mean score: 0.962+-0.04




Grid search results for AdaBoostClassifier:
	Score of the best parameters: 1.0
	Best parameters: {'estimator__max_depth': 3, 'n_estimators': 10}


 80%|████████  | 8/10 [02:28<01:14, 37.29s/it]

AdaBoostClassifier cross-validation mean score: 1.0+-0.0




Grid search results for GradientBoostingClassifier:
	Score of the best parameters: 1.0
	Best parameters: {'max_depth': 2, 'n_estimators': 100}


 90%|█████████ | 9/10 [06:55<01:49, 109.03s/it]

GradientBoostingClassifier cross-validation mean score: 1.0+-0.0




100%|██████████| 10/10 [07:06<00:00, 78.65s/it]



Grid search results for SVC:
	Score of the best parameters: 0.97
	Best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
SVC cross-validation mean score: 0.943+-0.051




100%|██████████| 10/10 [07:06<00:00, 42.63s/it]


In [310]:
data = {
    'Model': model_names,
    'Parameters': model_params,
    'Mean Score': metric_mean,
    'Error (+-)': metric_error
}

# Create a DataFrame from the dictionary
df_results = pd.DataFrame(data)

In [311]:
df_results

Unnamed: 0,Model,Parameters,Mean Score,Error (+-)
0,LogisticRegression,,0.868,0.056
1,GaussianNB,,1.0,0.0
2,LinearDiscriminantAnalysis,,1.0,0.0
3,QuadraticDiscriminantAnalysis,,0.271,0.185
4,KNeighborsClassifier,"{'metric': 'manhattan', 'n_neighbors': 5}",0.895,0.046
5,DecisionTreeClassifier,{'max_depth': 7},0.969,0.043
6,RandomForestClassifier,"{'max_depth': 6, 'n_estimators': 200}",0.962,0.04
7,AdaBoostClassifier,"{'estimator__max_depth': 3, 'n_estimators': 10}",1.0,0.0
8,GradientBoostingClassifier,"{'max_depth': 2, 'n_estimators': 100}",1.0,0.0
9,SVC,"{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}",0.943,0.051
