In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import glob
from xgboost import XGBClassifier

In [2]:
def load_data(file_pattern, delimiter=';'):
    """
    Load every CSV file matching a glob pattern into one DataFrame.

    Parameters
    ----------
    file_pattern : str
        Glob pattern selecting the CSV files to read (e.g. 'ResNet/TRAIN_*.csv').
    delimiter : str, default ';'
        Field separator passed to ``pd.read_csv`` for every file.

    Returns
    -------
    pandas.DataFrame
        The rows of all matching files stacked in sorted-path order, with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches ``file_pattern``.
    """
    # glob.glob returns paths in a filesystem-dependent order; sorting keeps
    # the row order of the result (and therefore any seeded split performed
    # on it later) reproducible across machines and runs.
    files = sorted(glob.glob(file_pattern))
    if not files:
        # Fail with the offending pattern instead of pd.concat's opaque
        # "No objects to concatenate" error.
        raise ValueError(f"No files match pattern {file_pattern!r}")
    dfs = [pd.read_csv(file, delimiter=delimiter) for file in files]
    return pd.concat(dfs, ignore_index=True)

In [3]:
# Training features: one DataFrame per backbone, each assembled by load_data
# from every TRAIN_*.csv file found in that backbone's directory.
efficientnet_train_data, mobilenet_train_data, resnet_train_data = (
    load_data(pattern)
    for pattern in (
        'EfficientNet/TRAIN_*.csv',
        'MobileNet/TRAIN_*.csv',
        'ResNet/TRAIN_*.csv',
    )
)

# Test features: a single ';'-separated TEST.csv per backbone.
efficientnet_test_data, mobilenet_test_data, resnet_test_data = (
    pd.read_csv(path, sep=';')
    for path in (
        'EfficientNet_test/TEST.csv',
        'MobileNet_test/TEST.csv',
        'ResNet_test/TEST.csv',
    )
)

In [4]:
# Put the three backbones' features side by side, one row per training sample.
# Both merges use pandas' default inner join, so only samples present in all
# three tables survive. Overlapping non-key columns from EfficientNet and
# MobileNet are tagged '_eff' / '_mob'; ResNet columns keep their own names
# unless they collide with a column already in the merged frame.
df_merged = (
    efficientnet_train_data
    .merge(mobilenet_train_data, on=['images', 'cone_name', 'label'], suffixes=('_eff', '_mob'))
    .merge(resnet_train_data, on=['images', 'cone_name', 'label'])
)

# Same combination for the test tables, which are keyed by 'image' only.
df_merged_test = (
    efficientnet_test_data
    .merge(mobilenet_test_data, on='image', suffixes=('_eff', '_mob'))
    .merge(resnet_test_data, on='image')
)

In [5]:
# Training matrix: everything except the merge keys and the three
# 'Unnamed: 0*' columns (presumably CSV index columns - TODO confirm).
X = df_merged.drop(
    columns=['Unnamed: 0_eff', 'Unnamed: 0_mob', 'Unnamed: 0', 'images', 'cone_name', 'label']
)
# Shift labels down by one so classes start at 0; predict_and_save adds the 1 back.
y = df_merged['label'].sub(1)

# Test side: strip the same 'Unnamed: 0*' columns, then separate the features
# from the image identifiers.
X_test_img = df_merged_test.drop(columns=['Unnamed: 0_eff', 'Unnamed: 0_mob', 'Unnamed: 0'])
X_img = X_test_img.drop(columns='image')
# NOTE: despite the 'y_' prefix this holds the image identifiers, not labels.
y_img = X_test_img['image']

In [6]:
# Hold out 10% of the labelled rows for validation; the fixed seed keeps the
# split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Disabled RandomForest alternative (uncomment to compare against XGBoost):
# rf_clf = RandomForestClassifier(n_estimators=300, random_state=42)
# rf_clf.fit(X_train, y_train)
# y_pred = rf_clf.predict(X_test)

# Fit XGBoost on the training split and predict the held-out rows.
xgb_clf = XGBClassifier(n_estimators=100).fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)

# Report overall accuracy, per-class precision/recall/F1, and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:', classification_report(y_test, y_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

Accuracy: 0.9051029543419875
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.77      0.82       164
           1       0.92      0.95      0.94       383
           2       0.79      0.75      0.77       144
           3       0.94      0.96      0.95       426

    accuracy                           0.91      1117
   macro avg       0.88      0.86      0.87      1117
weighted avg       0.90      0.91      0.90      1117

Confusion Matrix:
[[127  12  19   6]
 [  3 365   5  10]
 [ 11  13 108  12]
 [  3   7   5 411]]


In [7]:
def predict_and_save(model, X_test, y_img, output_file='submission.csv'):
    """
    Run ``model`` on ``X_test`` and write the predictions to a CSV file.

    The predicted class indices are shifted up by one before being written.
    The file has two columns, 'ID' (taken from ``y_img``) and 'Class' (the
    shifted predictions), and no index column. A confirmation line is printed
    once the file has been written.
    """
    # Shift predictions up by one (0-based -> 1-based class numbering).
    class_labels = model.predict(X_test) + 1
    pd.DataFrame({'ID': y_img, 'Class': class_labels}).to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")
    
# Score the merged test features with the trained XGBoost model and write the
# result to the function's default output path ('submission.csv').
predict_and_save(model=xgb_clf, X_test=X_img, y_img=y_img)
# RandomForest variant, disabled because rf_clf is not trained in this notebook as written:
#predict_and_save(rf_clf, X_img, y_img, output_file='submission.csv')

Predictions saved to submission.csv
