In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Loading the data
X_train = pd.read_csv(
    '/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet/x_train_all.csv')

X_test = pd.read_csv(
    '/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet/x_test_all.csv')

y_train = pd.read_csv(
    "/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet/y_train_all.csv")

y_test = pd.read_csv(
    "/Users/ilya/Desktop/Course_work_Data_mining/CompleteDataSet/y_test_all.csv")

In [3]:
y_train['0'].value_counts()

0
2    2250
1    2220
4    1980
3    1410
8     540
6     360
9     270
7     240
0     210
5     210
Name: count, dtype: int64

### Now we can see that our classes are imbalanced.

In [8]:
# Normalize the data to range [0, 1]
X_train_norm = X_train / 255
X_test_norm = X_test / 255

In [5]:
X_train_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2294,2295,2296,2297,2298,2299,2300,2301,2302,2303
0,0.305882,0.301961,0.298039,0.321569,0.341176,0.360784,0.407843,0.466667,0.458824,0.470588,...,0.341176,0.309804,0.282353,0.298039,0.32549,0.372549,0.388235,0.384314,0.372549,0.368627
1,0.286275,0.294118,0.309804,0.305882,0.298039,0.294118,0.34902,0.419608,0.521569,0.490196,...,0.376471,0.364706,0.333333,0.301961,0.270588,0.286275,0.32549,0.392157,0.396078,0.396078
2,0.282353,0.294118,0.309804,0.301961,0.317647,0.34902,0.411765,0.427451,0.337255,0.352941,...,0.384314,0.372549,0.345098,0.313725,0.286275,0.278431,0.290196,0.313725,0.34902,0.372549
3,0.262745,0.27451,0.290196,0.313725,0.364706,0.419608,0.431373,0.376471,0.270588,0.392157,...,0.439216,0.360784,0.341176,0.321569,0.301961,0.282353,0.27451,0.282353,0.317647,0.345098
4,0.290196,0.290196,0.286275,0.282353,0.301961,0.341176,0.407843,0.427451,0.329412,0.32549,...,0.392157,0.384314,0.388235,0.392157,0.388235,0.34902,0.305882,0.258824,0.266667,0.282353


## Let's use RandomForestClassifier to understand how it will performe on our data

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train_norm, y_train.values.ravel())

y_pred_train_rf = rf_model.predict(X_train_norm)
y_pred_rf = rf_model.predict(X_test_norm)

# Calculating the metrics
accuracy_train = accuracy_score(y_train, y_pred_train_rf)
accuracy_test = accuracy_score(y_test, y_pred_rf)
class_report = classification_report(y_test, y_pred_rf)

print("Train Accuracy:", accuracy_train.round(3))
print("Test Accuracy:", accuracy_test.round(3))
print(class_report)

Train Accuracy: 1.0
Test Accuracy: 0.772
              precision    recall  f1-score   support

           0       0.85      0.18      0.30        60
           1       0.73      0.87      0.80       720
           2       0.82      0.83      0.83       750
           3       0.75      0.80      0.77       450
           4       0.82      0.79      0.81       660
           5       0.88      0.50      0.64        60
           6       0.71      0.53      0.61        90
           7       0.89      0.28      0.43        60
           8       0.64      0.69      0.66       150
           9       0.73      0.49      0.59        90

    accuracy                           0.77      3090
   macro avg       0.78      0.60      0.64      3090
weighted avg       0.78      0.77      0.76      3090



### As we can see from the classification report the performance of the model on the test data isn't good enough. Even though the model gets 100% accuracy on the train data, it means that model is overfitted.

In [11]:
from catboost import CatBoostClassifier

cb_model = CatBoostClassifier(random_seed=42)

cb_model.fit(X_train_norm, y_train)

y_pred_cb = cb_model.predict(X_test_norm)

# Calculating the metrics
accuracy = accuracy_score(y_test, y_pred_cb)
class_report = classification_report(y_test, y_pred_cb)

print(accuracy.round(3))
print(class_report)

Learning rate set to 0.088844
0:	learn: 2.1266410	total: 578ms	remaining: 9m 37s
1:	learn: 1.9814639	total: 1.11s	remaining: 9m 16s
2:	learn: 1.8897637	total: 1.58s	remaining: 8m 45s
3:	learn: 1.7933216	total: 2.06s	remaining: 8m 32s
4:	learn: 1.7293327	total: 2.54s	remaining: 8m 25s
5:	learn: 1.6687654	total: 3.02s	remaining: 8m 19s
6:	learn: 1.6133945	total: 3.53s	remaining: 8m 21s
7:	learn: 1.5539859	total: 4.01s	remaining: 8m 17s
8:	learn: 1.5123005	total: 4.49s	remaining: 8m 13s
9:	learn: 1.4621117	total: 4.95s	remaining: 8m 10s
10:	learn: 1.4188325	total: 5.42s	remaining: 8m 7s
11:	learn: 1.3843246	total: 5.88s	remaining: 8m 4s
12:	learn: 1.3535123	total: 6.36s	remaining: 8m 2s
13:	learn: 1.3183720	total: 6.83s	remaining: 8m
14:	learn: 1.2857124	total: 7.29s	remaining: 7m 58s
15:	learn: 1.2619086	total: 7.75s	remaining: 7m 56s
16:	learn: 1.2351226	total: 8.22s	remaining: 7m 55s
17:	learn: 1.2148109	total: 8.7s	remaining: 7m 54s
18:	learn: 1.1939833	total: 9.17s	remaining: 7m 53s


### Gradient Boosting Classifier - CatBoost gets better results than Random Forest, yet it's still not in comparison with neural networks.

## Now we will use SMOTE technique to oversample the minority class

In [12]:
# Apply SMOTE for class imbalance
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_norm, y_train['0'])

print(y_train_resampled.value_counts())

0
0    2250
1    2250
2    2250
3    2250
4    2250
5    2250
6    2250
7    2250
8    2250
9    2250
Name: count, dtype: int64


### Now our classes are balanced and let's see if it will improve the model's performance

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)

rf_model.fit(X_train_resampled, y_train_resampled.ravel())

y_pred_rf = rf_model.predict(X_test_norm)

# Calculating the metrics
accuracy = accuracy_score(y_test, y_pred_rf)
class_report = classification_report(y_test, y_pred_rf)

print(accuracy.round(3))
print(class_report)

0.772
              precision    recall  f1-score   support

           0       0.67      0.17      0.27        60
           1       0.76      0.83      0.80       720
           2       0.81      0.81      0.81       750
           3       0.76      0.81      0.79       450
           4       0.78      0.81      0.80       660
           5       0.87      0.55      0.67        60
           6       0.74      0.66      0.69        90
           7       0.83      0.25      0.38        60
           8       0.68      0.75      0.72       150
           9       0.69      0.52      0.59        90

    accuracy                           0.77      3090
   macro avg       0.76      0.62      0.65      3090
weighted avg       0.77      0.77      0.76      3090



In [17]:
from catboost import CatBoostClassifier

cb_model = CatBoostClassifier(random_seed=42)

cb_model.fit(X_train_resampled, y_train_resampled)

y_pred_cb = cb_model.predict(X_test_norm)

# Calculating the metrics
accuracy = accuracy_score(y_test, y_pred_cb)
class_report = classification_report(y_test, y_pred_cb)

print(accuracy.round(3))
print(class_report)

Learning rate set to 0.092744
0:	learn: 2.0845246	total: 689ms	remaining: 11m 28s
1:	learn: 1.9233736	total: 1.38s	remaining: 11m 27s
2:	learn: 1.8071139	total: 2.02s	remaining: 11m 9s
3:	learn: 1.7059848	total: 2.67s	remaining: 11m 4s
4:	learn: 1.6178162	total: 3.35s	remaining: 11m 7s
5:	learn: 1.5484650	total: 3.98s	remaining: 10m 59s
6:	learn: 1.4893826	total: 4.61s	remaining: 10m 54s
7:	learn: 1.4280066	total: 5.27s	remaining: 10m 53s
8:	learn: 1.3686749	total: 5.93s	remaining: 10m 52s
9:	learn: 1.3210604	total: 6.58s	remaining: 10m 51s
10:	learn: 1.2822973	total: 7.22s	remaining: 10m 49s
11:	learn: 1.2377536	total: 7.88s	remaining: 10m 48s
12:	learn: 1.2053251	total: 8.52s	remaining: 10m 46s
13:	learn: 1.1684772	total: 9.18s	remaining: 10m 46s
14:	learn: 1.1402469	total: 9.82s	remaining: 10m 44s
15:	learn: 1.1136762	total: 10.4s	remaining: 10m 42s
16:	learn: 1.0831657	total: 11.1s	remaining: 10m 41s
17:	learn: 1.0621857	total: 11.8s	remaining: 10m 43s
18:	learn: 1.0371728	total: 1

### Unfotunately this approach did not improve the accuracy of both models