# MACHINE LEARNING PROJECT


## Importing packages

In [1]:
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
import time

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

## Loading dataset


In [4]:
def load_data(file_pattern, delimiter=';'):
    """Read every CSV matching a glob pattern and stack the rows into one DataFrame.

    Parameters
    ----------
    file_pattern : str
        Glob pattern handed to ``glob.glob`` (e.g. ``'EfficientNet/TRAIN_*.csv'``).
    delimiter : str, default ';'
        Field separator forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of all matched files, concatenated in sorted-path order with a
        fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If no file matches ``file_pattern``.
    """
    # glob.glob yields paths in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order stable, which any later seeded split relies on.
    files = sorted(glob.glob(file_pattern))
    if not files:
        # Without this guard pd.concat fails with the much less helpful
        # "No objects to concatenate"; keep ValueError so callers see the same type.
        raise ValueError(f'No files match pattern {file_pattern!r}')
    dfs = [pd.read_csv(file, delimiter=delimiter) for file in files]
    return pd.concat(dfs, ignore_index=True)

In [5]:
# Read the TRAIN_*.csv files of each neural network; the unpacking order
# follows the order of the patterns below.
train_csv_patterns = (
    'EfficientNet/TRAIN_*.csv',
    'MobileNet/TRAIN_*.csv',
    'ResNet/TRAIN_*.csv',
)
efficientnet_train_data, mobilenet_train_data, resnet_train_data = map(load_data, train_csv_patterns)

In [6]:
# One semicolon-delimited TEST.csv per network, each in its own directory.
test_csv_paths = (
    'efficientNet_test/TEST.csv',
    'mobileNet_test/TEST.csv',
    'resNet_test/TEST.csv',
)
efficientnet_test_data, mobilenet_test_data, resnet_test_data = (
    pd.read_csv(csv_path, delimiter=';') for csv_path in test_csv_paths
)

### Combining dataset

In [7]:
# Stack the per-network frames row-wise and renumber the index from 0.
# NOTE(review): pd.concat aligns on column names with an outer join, so any
# column missing from one network's frame is filled with NaN - confirm the
# three networks share the same feature columns.
train_frames = [efficientnet_train_data, mobilenet_train_data, resnet_train_data]
test_frames = [efficientnet_test_data, mobilenet_test_data, resnet_test_data]

train_data = pd.concat(train_frames, ignore_index=True)
test_data = pd.concat(test_frames, ignore_index=True)

In [8]:
#train_data.describe()

In [9]:
#train_data.head()

In [10]:
# Save the data to CSV to confirm that the concatenation worked as expected
#train_data.to_csv('combined_train_data.csv', index=False)
#test_data.to_csv('combined_test_data.csv', index=False)

## Pre-processing & Normalization

In [11]:
# Dropping the first column 
# train_data.drop('Unnamed', axis=1)

**_X_train_** is the training dataset without the **_images_**, **_cone_name_** and **_label_** columns

**_y_train_** contains only the **_label_** column

**_X_test_img_** is the test dataset without **_image_** column

**_y_test_img_** contains only **_image_** column

In [12]:
# Training set: 'label' is the target; everything except the 'images',
# 'cone_name' and 'label' columns is used as a feature.
y_train = train_data['label']
X_train = train_data.drop(columns=['images', 'cone_name', 'label'])

# Test set: the 'image' column is set aside and the rest are features.
# Despite the y_ prefix, y_test_img holds the 'image' column, not class labels.
y_test_img = test_data['image']
X_test_img = test_data.drop(columns='image')

In [13]:
#from sklearn.impute import SimpleImputer

#imputer = SimpleImputer(strategy='mean')
#X_train_imputed = imputer.fit_transform(X_train)
#
# Ensure imputation has been done correctly
#X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)

A large portion of the dataset has missing values, so imputation might not be reliable.

Consider models that can handle missing values inherently, such as tree-based models

Some models inherently handle missing values better than others. For example, tree-based models such as Decision Trees and Random Forests can (depending on the implementation) handle missing values by making decisions based on the available data, while linear models and Naive Bayes typically require complete data.

In [14]:
#X_train = X_train_imputed

In [15]:
#X_train.head()

In [16]:
#X_test_img.head()

In [17]:
#y_train.head()

In [18]:
#y_test_img.head()

In [19]:
#X_train.describe()

In [20]:
# Don't drop any rows: a large portion of the dataset has missing values, so dropna() would discard most of the data

#X_train.dropna(inplace=True)
#X_train.describe()

In [21]:
# Check if normalization is needed
#scaler = StandardScaler()
#scaler.fit(X)
#X = scaler.transform(X)

## PCA Dimensionality Reduction

In [22]:
# Dimensionality reduction using PCA
#pca = PCA(n_components=512)
#X_train_pca = pca.fit_transform(X_train)
#X_test_pca = pca.transform(X_test_img)

## Train-Test splitting

Splitting the **_train_** dataset in a **70/30** ratio

**_X_train_** is the training dataset without the **_images_**, **_cone_name_** and **_label_** columns

**_y_train_** contains only the **_label_** column

**_X_test_img_** is the test dataset without **_image_** column

**_y_test_img_** contains only **_image_** column

In [23]:
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split repeatable for a given row order.
# NOTE: X_train / y_train are rebound to the 70% portion here, so re-running
# this cell on its own splits the already-reduced data again.
(X_train, X_test, y_train, y_test) = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [24]:
#X_train.describe()

# Gradient Boosting



In [25]:
# XGBoost expects class ids that start at 0. Shift both splits by the smallest
# label present in either of them rather than by a hard-coded 1:
#   * first run: the original `- 1` implies labels start at 1, in which case
#     the result is identical - TODO confirm against the raw data;
#   * re-run: the minimum is already 0, so the cell is a no-op instead of
#     pushing labels to -1 (which XGBoost rejects).
label_offset = min(y_train.min(), y_test.min())
y_train = y_train - label_offset
y_test = y_test - label_offset

In [None]:
# XGBoost classifier with 400 estimators; the fixed seed keeps training repeatable.
xgb_model = XGBClassifier(n_estimators=400, random_state=42)

# Train the model and time the fit.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if the system
# time is adjusted while training runs.
start_time = time.perf_counter()
xgb_model.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed training time in seconds.
train_time = end_time - start_time
#print(f'Train time: {train_time}')

In [None]:
# Make predictions
#y_pred_xgb = xgb_model.predict(X_test)

# Evaluate accuracy
#accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
#print(f'XGBoost Accuracy: {accuracy_xgb:.2f}')

In [None]:
# Disabled experiment: the grid-search code below is wrapped in a triple-quoted
# string literal, so none of it executes when this cell runs.
# NOTE(review): the grid holds a single candidate (n_estimators=400), so if
# re-enabled this only cross-validates that one setting with cv=3.
"""# Define the parameter grid
param_grid = {
    'n_estimators': [400]
}

# Initialize the XGBClassifier
xgb = XGBClassifier(random_state=42)

# Initialize GridSearchCV
start_time = time.time()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

end_time = time.time()

train_time = end_time-start_time
print(f'Train time: {train_time}')
"""

In [None]:
# Disabled experiment: this evaluation code is wrapped in a triple-quoted
# string literal, so none of it executes when this cell runs.
# It reads `grid_search`, which is only created inside the disabled grid-search
# cell, so that cell must be re-enabled first.
# NOTE(review): GridSearchCV refits best_estimator_ by default (refit=True), so
# the explicit best_xgb.fit(...) looks redundant - confirm before re-enabling.
"""# Best parameters and best score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.2f}')

# Use the best model
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train, y_train)
y_pred_best_xgb = best_xgb.predict(X_test)
accuracy_best_xgb = accuracy_score(y_test, y_pred_best_xgb)
print(f'XGBoost Best Model Accuracy: {accuracy_best_xgb:.2f}')
"""