In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import dataframe_image as dfi

In [3]:
# Load the game_clean_df.csv dataset.
game_df = pd.read_csv("../Resources/game_clean_df.csv")
game_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Global_Sales,Critic_Score,Developer,Rating
0,42,Grand Theft Auto V,PS4,2014,Action,Take-Two Interactive,12610000,97,Rockstar North,M
1,62,New Super Mario Bros. 2,3DS,2012,Platform,Nintendo,9900000,78,Nintendo,E
2,73,Animal Crossing: New Leaf,3DS,2012,Simulation,Nintendo,9160000,88,Nintendo,E
3,77,FIFA 16,PS4,2015,Sports,Electronic Arts,8570000,82,EA Sports,E
4,92,Call of Duty: Advanced Warfare,PS4,2014,Shooter,Activision,7660000,83,Sledgehammer Games,M


In [4]:
# Drop the Unnamed: 0 column.
game_df = game_df.drop(columns=["Unnamed: 0"])

print(game_df.shape)
game_df.head(10)

(867, 9)


Unnamed: 0,Name,Platform,Year_of_Release,Genre,Publisher,Global_Sales,Critic_Score,Developer,Rating
0,Grand Theft Auto V,PS4,2014,Action,Take-Two Interactive,12610000,97,Rockstar North,M
1,New Super Mario Bros. 2,3DS,2012,Platform,Nintendo,9900000,78,Nintendo,E
2,Animal Crossing: New Leaf,3DS,2012,Simulation,Nintendo,9160000,88,Nintendo,E
3,FIFA 16,PS4,2015,Sports,Electronic Arts,8570000,82,EA Sports,E
4,Call of Duty: Advanced Warfare,PS4,2014,Shooter,Activision,7660000,83,Sledgehammer Games,M
5,FIFA 17,PS4,2016,Sports,Electronic Arts,7590000,85,"EA Sports, EA Vancouver",E
6,Fallout 4,PS4,2015,Role-Playing,Bethesda Softworks,7160000,87,Bethesda Game Studios,M
7,Mario Kart 8,WiiU,2014,Racing,Nintendo,7090000,88,Nintendo,E
8,FIFA 15,PS4,2014,Sports,Electronic Arts,6080000,82,EA Sports,E
9,Destiny,PS4,2014,Shooter,Activision,5640000,76,"Bungie Software, Bungie",T


# Naive Random Oversampling

### Naive Random Oversampling by Platform

In [5]:
target = ["Platform"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Platform').copy())

# Create our target
y = game_df[target].copy()

In [6]:
X.describe()

Unnamed: 0,Year_of_Release,Global_Sales,Critic_Score,Name_7 Days to Die,Name_7th Dragon III Code: VFD,Name_Adventure Time: The Secret of the Nameless Kingdom,Name_Aegis of Earth: Protonovus Assault,Name_Agatha Christie's The ABC Murders,Name_Alan Wake,Name_Alien: Isolation,...,Developer_Zipper Interactive,Developer_Zoe Mode,Developer_h.a.n.d. Inc.,Developer_id Software,Developer_n-Space,Developer_syn Sophia,Rating_E,Rating_E10+,Rating_M,Rating_T
count,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,...,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0
mean,2014.296424,691672.4,72.959631,0.002307,0.001153,0.001153,0.001153,0.001153,0.001153,0.00346,...,0.001153,0.002307,0.001153,0.001153,0.00346,0.001153,0.222607,0.186851,0.320646,0.269896
std,1.362149,1237520.0,12.209563,0.048001,0.033962,0.033962,0.033962,0.033962,0.033962,0.058756,...,0.033962,0.048001,0.033962,0.033962,0.058756,0.033962,0.416236,0.390017,0.466994,0.444162
min,2012.0,10000.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013.0,90000.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,240000.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,685000.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2016.0,12610000.0,97.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the balance of our target values
y['Platform'].value_counts()

PS4     239
PC      174
XOne    159
PSV     106
3DS     100
WiiU     89
Name: Platform, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Platform': 1})

In [10]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [11]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.22420774499441912

In [12]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[ 0,  5,  5,  7,  1,  1],
       [ 0,  3,  4, 35,  4,  1],
       [ 0, 12, 13, 29,  7,  2],
       [ 0,  4,  0, 19,  3,  0],
       [ 0,  3,  5,  8,  7,  1],
       [ 1,  6, 13, 10,  6,  2]], dtype=int64)

In [13]:
# Create a DataFrame from the confusion matrix.
NRO_cm = confusion_matrix(y_test, y_pred)
NRO_cm_df = pd.DataFrame(
    NRO_cm, 
    index=["Actual PS4", "Actual PC", "Actual XOne", "Actual WiiU", "Actual 3DS", "Actual PSV"], 
    columns=["Predicted PS4", "Predicted PC", "Predicted XOne", "Predicted WiiU", "Predicted 3DS", "Predicted PSV"])

NRO_cm_df

Unnamed: 0,Predicted PS4,Predicted PC,Predicted XOne,Predicted WiiU,Predicted 3DS,Predicted PSV
Actual PS4,0,5,5,7,1,1
Actual PC,0,3,4,35,4,1
Actual XOne,0,12,13,29,7,2
Actual WiiU,0,4,0,19,3,0
Actual 3DS,0,3,5,8,7,1
Actual PSV,1,6,13,10,6,2


In [14]:
# Save the NRO dataframe
dfi.export(NRO_cm_df, "analysis/NRO By Platform.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [15]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print("Naive Random Oversampling By Platform")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling By Platform


Accuracy: 22.42% 


                   pre       rec       spe        f1       geo       iba       sup

        3DS       0.00      0.00      0.99      0.00      0.00      0.00        19
         PC       0.09      0.06      0.82      0.07      0.23      0.05        47
        PS4       0.33      0.21      0.82      0.25      0.41      0.16        63
        PSV       0.18      0.73      0.53      0.28      0.62      0.40        26
       WiiU       0.25      0.29      0.89      0.27      0.51      0.24        24
       XOne       0.29      0.05      0.97      0.09      0.23      0.05        38

avg / total       0.21      0.20      0.84      0.17      0.34      0.14       217



### Naive Random Oversampling by Rating

In [16]:
target = ["Rating"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Rating').copy())

# Create our target
y = game_df[target].copy()

In [17]:
X.describe()

Unnamed: 0,Year_of_Release,Global_Sales,Critic_Score,Name_7 Days to Die,Name_7th Dragon III Code: VFD,Name_Adventure Time: The Secret of the Nameless Kingdom,Name_Aegis of Earth: Protonovus Assault,Name_Agatha Christie's The ABC Murders,Name_Alan Wake,Name_Alien: Isolation,...,Developer_Yager,Developer_Yuke's,Developer_ZeniMax Media,Developer_Zerodiv,Developer_Zipper Interactive,Developer_Zoe Mode,Developer_h.a.n.d. Inc.,Developer_id Software,Developer_n-Space,Developer_syn Sophia
count,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,...,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0
mean,2014.296424,691672.4,72.959631,0.002307,0.001153,0.001153,0.001153,0.001153,0.001153,0.00346,...,0.001153,0.00692,0.002307,0.002307,0.001153,0.002307,0.001153,0.001153,0.00346,0.001153
std,1.362149,1237520.0,12.209563,0.048001,0.033962,0.033962,0.033962,0.033962,0.033962,0.058756,...,0.033962,0.082949,0.048001,0.048001,0.033962,0.048001,0.033962,0.033962,0.058756,0.033962
min,2012.0,10000.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013.0,90000.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,240000.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,685000.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2016.0,12610000.0,97.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
# Check the balance of our target values
y['Rating'].value_counts()

M       278
T       234
E       193
E10+    162
Name: Rating, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [20]:
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Rating': 1})

In [21]:
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [22]:
y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.32419432976661894

In [23]:
confusion_matrix(y_test, y_pred)

array([[ 4,  1, 16, 34],
       [ 4, 10,  4, 22],
       [ 7,  3, 17, 56],
       [ 1,  4,  4, 30]], dtype=int64)

In [24]:
# Create a DataFrame from the confusion matrix.
NRO_cm = confusion_matrix(y_test, y_pred)
NRO_cm_df = pd.DataFrame(
    NRO_cm, 
    index=["Actual M", "Actual T", "Actual E", "Actual E10+"], 
    columns=["Predicted M", "Predicted T", "Predicted E", "Predicted E10+"])

NRO_cm_df

Unnamed: 0,Predicted M,Predicted T,Predicted E,Predicted E10+
Actual M,4,1,16,34
Actual T,4,10,4,22
Actual E,7,3,17,56
Actual E10+,1,4,4,30


In [25]:
dfi.export(NRO_cm_df, "analysis/NRO By Rating.png")

C:\Program Files\Google\Chrome\Application\chrome.exe


In [26]:
print("Naive Random Oversampling By Rating")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling By Rating


Accuracy: 32.42% 


                   pre       rec       spe        f1       geo       iba       sup

          E       0.25      0.07      0.93      0.11      0.26      0.06        55
       E10+       0.56      0.25      0.95      0.34      0.49      0.22        40
          M       0.41      0.20      0.82      0.27      0.41      0.16        83
          T       0.21      0.77      0.37      0.33      0.53      0.30        39

avg / total       0.36      0.28      0.79      0.26      0.41      0.17       217



### Naive Random Oversampling by Genre

In [27]:
target = ["Genre"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Genre').copy())

# Create our target
y = game_df[target].copy()

In [28]:
X.describe()

Unnamed: 0,Year_of_Release,Global_Sales,Critic_Score,Name_7 Days to Die,Name_7th Dragon III Code: VFD,Name_Adventure Time: The Secret of the Nameless Kingdom,Name_Aegis of Earth: Protonovus Assault,Name_Agatha Christie's The ABC Murders,Name_Alan Wake,Name_Alien: Isolation,...,Developer_Zipper Interactive,Developer_Zoe Mode,Developer_h.a.n.d. Inc.,Developer_id Software,Developer_n-Space,Developer_syn Sophia,Rating_E,Rating_E10+,Rating_M,Rating_T
count,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,...,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0,867.0
mean,2014.296424,691672.4,72.959631,0.002307,0.001153,0.001153,0.001153,0.001153,0.001153,0.00346,...,0.001153,0.002307,0.001153,0.001153,0.00346,0.001153,0.222607,0.186851,0.320646,0.269896
std,1.362149,1237520.0,12.209563,0.048001,0.033962,0.033962,0.033962,0.033962,0.033962,0.058756,...,0.033962,0.048001,0.033962,0.033962,0.058756,0.033962,0.416236,0.390017,0.466994,0.444162
min,2012.0,10000.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2013.0,90000.0,67.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2014.0,240000.0,75.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2015.0,685000.0,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
max,2016.0,12610000.0,97.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [29]:
# Check the balance of our target values
y['Genre'].value_counts()

Action          290
Role-Playing    111
Shooter         109
Sports           94
Platform         50
Racing           49
Misc             44
Adventure        37
Fighting         31
Simulation       22
Strategy         21
Puzzle            9
Name: Genre, dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Genre': 1})

In [32]:
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [33]:
y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.11846450733437035

In [34]:
confusion_matrix(y_test, y_pred)

array([[ 1, 38,  1,  1,  1,  3,  3,  1, 11,  2,  1, 10],
       [ 0,  5,  0,  0,  0,  1,  0,  0,  1,  0,  0,  0],
       [ 0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  3,  1,  1,  1,  1,  1,  0,  1,  0,  0,  3],
       [ 0,  4,  0,  0,  0,  1,  0,  0,  4,  0,  0,  2],
       [ 0,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  8,  0,  0,  0,  0,  0,  1,  0,  1,  0,  3],
       [ 1, 17,  0,  1,  0,  1,  0,  0,  1,  0,  1,  3],
       [ 0, 12,  0,  0,  1,  1,  2,  3, 11,  0,  0,  3],
       [ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1],
       [ 0, 10,  2,  0,  2,  2,  2,  0,  8,  1,  1,  2],
       [ 0,  1,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1]], dtype=int64)

In [35]:
# Create a DataFrame from the confusion matrix.
NRO_cm = confusion_matrix(y_test, y_pred)

NRO_cm_df = pd.DataFrame(
    NRO_cm, 
    index=["Actual Action", "Actual Shooter", "Actual Sports", "Actual Role-Playing", "Actual Racing", "Actual Platform", "Actual Misc", "Actual Adventure", "Actual Fighting", "Actual Strategy", "Actual Simulation", "Actual Puzzle"], 
    columns=["Predicted Action", "Predicted Shooter", "Predicted Sports", "Predicted Role-Playing", "Predicted Racing", "Predicted Platform", "Predicted Misc", "Predicted Adventure", "Predicted Fighting", "Predicted Strategy", "Predicted Simulation", "Predicted Puzzle"])

NRO_cm_df

Unnamed: 0,Predicted Action,Predicted Shooter,Predicted Sports,Predicted Role-Playing,Predicted Racing,Predicted Platform,Predicted Misc,Predicted Adventure,Predicted Fighting,Predicted Strategy,Predicted Simulation,Predicted Puzzle
Actual Action,1,38,1,1,1,3,3,1,11,2,1,10
Actual Shooter,0,5,0,0,0,1,0,0,1,0,0,0
Actual Sports,0,3,0,0,0,0,0,0,0,0,0,0
Actual Role-Playing,1,3,1,1,1,1,1,0,1,0,0,3
Actual Racing,0,4,0,0,0,1,0,0,4,0,0,2
Actual Platform,0,3,0,0,0,0,0,0,0,0,0,0
Actual Misc,0,8,0,0,0,0,0,1,0,1,0,3
Actual Adventure,1,17,0,1,0,1,0,0,1,0,1,3
Actual Fighting,0,12,0,0,1,1,2,3,11,0,0,3
Actual Strategy,0,1,0,0,0,0,0,0,0,0,0,1


In [36]:
dfi.export(NRO_cm_df, "analysis/NRO By Genre.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [37]:
print("Naive Random Oversampling By Genre")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Naive Random Oversampling By Genre


Accuracy: 11.85% 


                    pre       rec       spe        f1       geo       iba       sup

      Action       0.33      0.01      0.99      0.03      0.12      0.01        73
   Adventure       0.05      0.71      0.52      0.09      0.61      0.38         7
    Fighting       0.00      0.00      0.98      0.00      0.00      0.00         3
        Misc       0.25      0.08      0.99      0.12      0.28      0.07        13
    Platform       0.00      0.00      0.98      0.00      0.00      0.00        11
      Puzzle       0.00      0.00      0.95      0.00      0.00      0.00         3
      Racing       0.00      0.00      0.96      0.00      0.00      0.00        13
Role-Playing       0.00      0.00      0.97      0.00      0.00      0.00        25
     Shooter       0.30      0.33      0.86      0.31      0.54      0.27        33
  Simulation       0.00      0.00      0.98      0.00      0.00      0.00         2
      Sports      

# SMOTE Oversampling

### SMOTE Oversampling By Platform

In [38]:
target = ["Platform"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Platform').copy())

# Create our target
y = game_df[target].copy()

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [40]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Platform': 1})

In [41]:
# Train the Logistic Regression model using the resampled data
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [42]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.18895133473800887

In [43]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 1,  3,  5,  9,  0,  1],
       [ 4,  3,  4, 35,  0,  1],
       [ 7, 10, 13, 31,  0,  2],
       [ 3,  3,  0, 20,  0,  0],
       [ 6,  4,  5,  8,  1,  0],
       [ 5,  4, 14, 13,  2,  0]], dtype=int64)

In [44]:
# Create a DataFrame from the confusion matrix.
SMOTE_cm = confusion_matrix(y_test, y_pred)

SMOTE_cm_df = pd.DataFrame(
    SMOTE_cm, 
    index=["Actual PS4", "Actual PC", "Actual XOne", "Actual WiiU", "Actual 3DS", "Actual PSV"], 
    columns=["Predicted PS4", "Predicted PC", "Predicted XOne", "Predicted WiiU", "Predicted 3DS", "Predicted PSV"])

SMOTE_cm_df

Unnamed: 0,Predicted PS4,Predicted PC,Predicted XOne,Predicted WiiU,Predicted 3DS,Predicted PSV
Actual PS4,1,3,5,9,0,1
Actual PC,4,3,4,35,0,1
Actual XOne,7,10,13,31,0,2
Actual WiiU,3,3,0,20,0,0
Actual 3DS,6,4,5,8,1,0
Actual PSV,5,4,14,13,2,0


In [45]:
# Save the SMOTE dataframe
dfi.export(SMOTE_cm_df, "analysis/SMOTE By Platform.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [46]:
# Print the imbalanced classification report
print("SMOTE By Platform")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE By Platform


Accuracy: 18.90% 


                   pre       rec       spe        f1       geo       iba       sup

        3DS       0.04      0.05      0.87      0.04      0.21      0.04        19
         PC       0.11      0.06      0.86      0.08      0.23      0.05        47
        PS4       0.32      0.21      0.82      0.25      0.41      0.16        63
        PSV       0.17      0.77      0.50      0.28      0.62      0.39        26
       WiiU       0.33      0.04      0.99      0.07      0.20      0.04        24
       XOne       0.00      0.00      0.98      0.00      0.00      0.00        38

avg / total       0.18      0.18      0.84      0.14      0.29      0.11       217



### SMOTE Oversampling By Rating

In [47]:
target = ["Rating"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Rating').copy())

# Create our target
y = game_df[target].copy()

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [49]:
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Rating': 1})

In [50]:
# Train the Logistic Regression model using the resampled data
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [51]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.28839887522116436

In [52]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 3,  2, 17, 33],
       [ 9,  5,  4, 22],
       [ 8,  2, 17, 56],
       [ 1,  4,  4, 30]], dtype=int64)

In [53]:
# Create a DataFrame from the confusion matrix.
SMOTE_cm = confusion_matrix(y_test, y_pred)
SMOTE_cm_df = pd.DataFrame(
    SMOTE_cm, 
    index=["Actual M", "Actual T", "Actual E", "Actual E10+"], 
    columns=["Predicted M", "Predicted T", "Predicted E", "Predicted E10+"])

SMOTE_cm_df

Unnamed: 0,Predicted M,Predicted T,Predicted E,Predicted E10+
Actual M,3,2,17,33
Actual T,9,5,4,22
Actual E,8,2,17,56
Actual E10+,1,4,4,30


In [54]:
dfi.export(SMOTE_cm_df, "analysis/SMOTE By Rating.png")

C:\Program Files\Google\Chrome\Application\chrome.exe


In [55]:
# Print the imbalanced classification report
print("SMOTE By Rating")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

SMOTE By Rating


Accuracy: 28.84% 


                   pre       rec       spe        f1       geo       iba       sup

          E       0.14      0.05      0.89      0.08      0.22      0.04        55
       E10+       0.38      0.12      0.95      0.19      0.35      0.11        40
          M       0.40      0.20      0.81      0.27      0.41      0.16        83
          T       0.21      0.77      0.38      0.33      0.54      0.30        39

avg / total       0.30      0.25      0.78      0.22      0.37      0.15       217



### SMOTE Oversampling By Genre

In [56]:
# target = ["Genre"]

# # Create our features
# X = pd.get_dummies(game_df.drop(columns='Genre').copy())

# # Create our target
# y = game_df[target].copy()

In [57]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [58]:
# Train the Logistic Regression model using the resampled data
# X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(X_train, y_train)

# Counter(y_resampled)

In [59]:
# model.fit(X_resampled, y_resampled)

In [60]:
# Calculated the balanced accuracy score
# y_pred = model.predict(X_test)

# balanced_accuracy_score(y_test, y_pred)

In [61]:
# Display the confusion matrix
# confusion_matrix(y_test, y_pred)

In [62]:
# Create a DataFrame from the confusion matrix.
# SMOTE_cm = confusion_matrix(y_test, y_pred)
# SMOTE_cm_df = pd.DataFrame(
#     SMOTE_cm, 
#     index=["Actual Action", "Actual Shooter", "Actual Sports", "Actual Role-Playing", "Actual Racing", "Actual Platform", "Actual Misc", "Actual Adventure", "Actual Fighting", "Actual Strategy", "Actual Simulation", "Actual Puzzle"], 
#     columns=["Predicted Action", "Predicted Shooter", "Predicted Sports", "Predicted Role-Playing", "Predicted Racing", "Predicted Platform", "Predicted Misc", "Predicted Adventure", "Predicted Fighting", "Predicted Strategy", "Predicted Simulation", "Predicted Puzzle"])

# SMOTE_cm_df

In [63]:
# dfi.export(SMOTE_cm_df, "analysis/SMOTE By Genre.png")

In [64]:
# Print the imbalanced classification report
# print("SMOTE By Genre")
# print("\n")
# print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
# print(classification_report_imbalanced(y_test, y_pred))

# Undersampling

### Undersampling by Platform

In [65]:
target = ["Platform"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Platform').copy())

# Create our target
y = game_df[target].copy()

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [67]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Platform': 1})

In [68]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=78)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [69]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [70]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 1,  2,  4, 11,  0,  1],
       [ 1,  1,  3, 38,  3,  1],
       [ 2,  6, 12, 38,  4,  1],
       [ 0,  2,  0, 23,  1,  0],
       [ 1,  3,  4, 11,  4,  1],
       [ 0,  3,  9, 16,  5,  5]], dtype=int64)

In [71]:
# Create a DataFrame from the confusion matrix.
Undersampling_cm = confusion_matrix(y_test, y_pred)

Undersampling_cm_df = pd.DataFrame(
    Undersampling_cm, 
    index=["Actual PS4", "Actual PC", "Actual XOne", "Actual WiiU", "Actual 3DS", "Actual PSV"], 
    columns=["Predicted PS4", "Predicted PC", "Predicted XOne", "Predicted WiiU", "Predicted 3DS", "Predicted PSV"])

Undersampling_cm_df

Unnamed: 0,Predicted PS4,Predicted PC,Predicted XOne,Predicted WiiU,Predicted 3DS,Predicted PSV
Actual PS4,1,2,4,11,0,1
Actual PC,1,1,3,38,3,1
Actual XOne,2,6,12,38,4,1
Actual WiiU,0,2,0,23,1,0
Actual 3DS,1,3,4,11,4,1
Actual PSV,0,3,9,16,5,5


In [72]:
dfi.export(Undersampling_cm_df, "analysis/Undersampling By Platform.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [73]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print("Undersampling By Platform")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Undersampling By Platform


Accuracy: 24.12% 


                   pre       rec       spe        f1       geo       iba       sup

        3DS       0.20      0.05      0.98      0.08      0.23      0.05        19
         PC       0.06      0.02      0.91      0.03      0.14      0.02        47
        PS4       0.38      0.19      0.87      0.25      0.41      0.15        63
        PSV       0.17      0.88      0.40      0.28      0.60      0.37        26
       WiiU       0.24      0.17      0.93      0.20      0.39      0.14        24
       XOne       0.56      0.13      0.98      0.21      0.36      0.12        38

avg / total       0.28      0.21      0.86      0.18      0.35      0.13       217



### Undersampling by Rating

In [74]:
target = ["Rating"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Rating').copy())

# Create our target
y = game_df[target].copy()

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [76]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Rating': 1})

In [77]:
# Train the Logistic Regression model using the resampled data

model = LogisticRegression(solver='lbfgs', random_state=78)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [78]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.0

In [79]:
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 6, 35, 14,  0],
       [ 0, 24,  4, 12],
       [ 3, 56, 17,  7],
       [ 0, 33,  4,  2]], dtype=int64)

In [80]:
# Create a DataFrame from the confusion matrix.
Undersampling_cm = confusion_matrix(y_test, y_pred)

Undersampling_cm_df = pd.DataFrame(
    Undersampling_cm, 
    index=["Actual M", "Actual T", "Actual E", "Actual E10+"], 
    columns=["Predicted M", "Predicted T", "Predicted E", "Predicted E10+"])

Undersampling_cm_df

Unnamed: 0,Predicted M,Predicted T,Predicted E,Predicted E10+
Actual M,6,35,14,0
Actual T,0,24,4,12
Actual E,3,56,17,7
Actual E10+,0,33,4,2


In [81]:
dfi.export(Undersampling_cm_df, "analysis/Undersampling By Rating.png")

C:\Program Files\Google\Chrome\Application\chrome.exe


In [82]:
# Print the imbalanced classification report
print("Undersampling by Rating")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Undersampling by Rating


Accuracy: 24.13% 


                   pre       rec       spe        f1       geo       iba       sup

          E       0.67      0.11      0.98      0.19      0.33      0.10        55
       E10+       0.16      0.60      0.30      0.26      0.42      0.19        40
          M       0.44      0.20      0.84      0.28      0.41      0.16        83
          T       0.10      0.05      0.89      0.07      0.21      0.04        39

avg / total       0.38      0.23      0.78      0.21      0.36      0.13       217



### Undersampling by Genre

In [83]:
target = ["Genre"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Genre').copy())

# Create our target
y = game_df[target].copy()

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [85]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete

cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({'Genre': 1})

In [86]:
# Train the Logistic Regression model using the resampled data

model = LogisticRegression(solver='lbfgs', random_state=78)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [87]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.0

In [88]:
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 2,  2,  4,  1,  2, 51,  3,  2,  2,  0,  0,  4],
       [ 0,  0,  0,  1,  0,  6,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0],
       [ 1,  2,  1,  0,  0,  7,  0,  0,  0,  0,  0,  2],
       [ 1,  0,  0,  1,  0,  7,  2,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0, 11,  0,  0,  0,  0,  0,  1],
       [ 1,  1,  1,  0,  0, 21,  0,  0,  0,  0,  0,  1],
       [ 2,  0,  1,  3,  3, 16,  0,  1,  1,  0,  1,  5],
       [ 0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0],
       [ 1,  2,  3,  2,  2, 14,  2,  1,  0,  0,  1,  2],
       [ 0,  0,  0,  0,  0,  3,  0,  0,  0,  0,  0,  1]], dtype=int64)

In [89]:
# Create a DataFrame from the confusion matrix.
Undersampling_cm = confusion_matrix(y_test, y_pred)

Undersampling_cm_df = pd.DataFrame(
    Undersampling_cm, 
    index=["Actual Action", "Actual Shooter", "Actual Sports", "Actual Role-Playing", "Actual Racing", "Actual Platform", "Actual Misc", "Actual Adventure", "Actual Fighting", "Actual Strategy", "Actual Simulation", "Actual Puzzle"], 
    columns=["Predicted Action", "Predicted Shooter", "Predicted Sports", "Predicted Role-Playing", "Predicted Racing", "Predicted Platform", "Predicted Misc", "Predicted Adventure", "Predicted Fighting", "Predicted Strategy", "Predicted Simulation", "Predicted Puzzle"])

Undersampling_cm_df

Unnamed: 0,Predicted Action,Predicted Shooter,Predicted Sports,Predicted Role-Playing,Predicted Racing,Predicted Platform,Predicted Misc,Predicted Adventure,Predicted Fighting,Predicted Strategy,Predicted Simulation,Predicted Puzzle
Actual Action,2,2,4,1,2,51,3,2,2,0,0,4
Actual Shooter,0,0,0,1,0,6,0,0,0,0,0,0
Actual Sports,0,0,0,0,0,3,0,0,0,0,0,0
Actual Role-Playing,1,2,1,0,0,7,0,0,0,0,0,2
Actual Racing,1,0,0,1,0,7,2,0,0,0,0,0
Actual Platform,0,0,0,0,0,3,0,0,0,0,0,0
Actual Misc,0,0,1,0,0,11,0,0,0,0,0,1
Actual Adventure,1,1,1,0,0,21,0,0,0,0,0,1
Actual Fighting,2,0,1,3,3,16,0,1,1,0,1,5
Actual Strategy,0,0,0,0,0,2,0,0,0,0,0,0


In [90]:
dfi.export(Undersampling_cm_df, "analysis/Undersampling By Genre.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [91]:
# Print the imbalanced classification report
print("Undersampling By Genre")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Undersampling By Genre


Accuracy: 11.18% 


                    pre       rec       spe        f1       geo       iba       sup

      Action       0.25      0.03      0.96      0.05      0.16      0.02        73
   Adventure       0.00      0.00      0.97      0.00      0.00      0.00         7
    Fighting       0.00      0.00      0.95      0.00      0.00      0.00         3
        Misc       0.00      0.00      0.96      0.00      0.00      0.00        13
    Platform       0.00      0.00      0.97      0.00      0.00      0.00        11
      Puzzle       0.02      1.00      0.34      0.04      0.58      0.36         3
      Racing       0.00      0.00      0.97      0.00      0.00      0.00        13
Role-Playing       0.00      0.00      0.98      0.00      0.00      0.00        25
     Shooter       0.33      0.03      0.99      0.06      0.17      0.03        33
  Simulation       0.00      0.00      1.00      0.00      0.00      0.00         2
      Sports       0.50      0

# Combination (Over and Under) Sampling

### Combination (Over and Under) Sampling by Platform

In [92]:
target = ["Platform"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Platform').copy())

# Create our target
y = game_df[target].copy()

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [94]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

Counter({'Platform': 1})

In [95]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [96]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred)

0.0

In [97]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix

y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[ 0,  0,  1, 12,  1,  5],
       [ 0,  6,  0, 32,  2,  7],
       [ 0,  8,  8, 36,  4,  7],
       [ 0,  1,  0, 22,  3,  0],
       [ 0,  3,  1,  9,  3,  8],
       [ 0,  2,  4, 17,  2, 13]], dtype=int64)

In [98]:
# Create a DataFrame from the confusion matrix.
COU_cm = confusion_matrix(y_test, y_pred)

COU_cm_df = pd.DataFrame(
    COU_cm, 
    index=["Actual PS4", "Actual PC", "Actual XOne", "Actual WiiU", "Actual 3DS", "Actual PSV"], 
    columns=["Predicted PS4", "Predicted PC", "Predicted XOne", "Predicted WiiU", "Predicted 3DS", "Predicted PSV"])

COU_cm_df

Unnamed: 0,Predicted PS4,Predicted PC,Predicted XOne,Predicted WiiU,Predicted 3DS,Predicted PSV
Actual PS4,0,0,1,12,1,5
Actual PC,0,6,0,32,2,7
Actual XOne,0,8,8,36,4,7
Actual WiiU,0,1,0,22,3,0
Actual 3DS,0,3,1,9,3,8
Actual PSV,0,2,4,17,2,13


In [99]:
dfi.export(COU_cm_df, "analysis/Combination (Over and Under) Sampling By Platform.png")

C:\Program Files\Google\Chrome\Application\chrome.exe
C:\Program Files\Google\Chrome\Application\chrome.exe


In [100]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced

print("Combination (Over and Under) Sampling By Platform")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Combination (Over and Under) Sampling By Platform


Accuracy: 26.13% 


                   pre       rec       spe        f1       geo       iba       sup

        3DS       0.00      0.00      1.00      0.00      0.00      0.00        19
         PC       0.30      0.13      0.92      0.18      0.34      0.11        47
        PS4       0.57      0.13      0.96      0.21      0.35      0.11        63
        PSV       0.17      0.85      0.45      0.29      0.61      0.39        26
       WiiU       0.20      0.12      0.94      0.15      0.34      0.11        24
       XOne       0.33      0.34      0.85      0.33      0.54      0.28        38

avg / total       0.33      0.24      0.87      0.21      0.38      0.16       217



### Combination (Over and Under) Sampling by Rating

In [101]:
target = ["Rating"]

# Create our features
X = pd.get_dummies(game_df.drop(columns='Rating').copy())

# Create our target
y = game_df[target].copy()

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [103]:
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

Counter(y_resampled)

Counter({'Rating': 1})

In [104]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)

model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [105]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.0

In [106]:
# Display the confusion matrix
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[10, 17,  6, 22],
       [ 1, 23,  3, 13],
       [ 8, 29,  9, 37],
       [ 2, 14,  2, 21]], dtype=int64)

In [107]:
# Create a DataFrame from the confusion matrix.
COU_cm = confusion_matrix(y_test, y_pred)

COU_cm_df = pd.DataFrame(
    COU_cm, 
    index=["Actual M", "Actual T", "Actual E", "Actual E10+"], 
    columns=["Predicted M", "Predicted T", "Predicted E", "Predicted E10+"])

COU_cm_df

Unnamed: 0,Predicted M,Predicted T,Predicted E,Predicted E10+
Actual M,10,17,6,22
Actual T,1,23,3,13
Actual E,8,29,9,37
Actual E10+,2,14,2,21


In [108]:
dfi.export(COU_cm_df, "analysis/Combination (Over and Under) Sampling By Rating.png")

C:\Program Files\Google\Chrome\Application\chrome.exe


In [109]:
# Print the imbalanced classification report
print("Combination (Over and Under) Sampling By Rating")
print("\n")
print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
print(classification_report_imbalanced(y_test, y_pred))

Combination (Over and Under) Sampling By Rating


Accuracy: 35.09% 


                   pre       rec       spe        f1       geo       iba       sup

          E       0.48      0.18      0.93      0.26      0.41      0.16        55
       E10+       0.28      0.57      0.66      0.37      0.62      0.38        40
          M       0.45      0.11      0.92      0.17      0.32      0.09        83
          T       0.23      0.54      0.60      0.32      0.57      0.32        39

avg / total       0.38      0.29      0.82      0.26      0.44      0.20       217



### Combination (Over and Under) Sampling by Genre

In [110]:
# target = ["Genre"]

# # Create our features
# X = pd.get_dummies(game_df.drop(columns='Genre').copy())

# # Create our target
# y = game_df[target].copy()

In [111]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [112]:
# smote_enn = SMOTEENN(random_state=0)
# X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# Counter(y_resampled)

In [113]:
# Train the Logistic Regression model using the resampled data
# model = LogisticRegression(solver='lbfgs', random_state=1)

# model.fit(X_resampled, y_resampled)

In [114]:
# Calculated the balanced accuracy score
# balanced_accuracy_score(y_test, y_pred)

In [115]:
# Display the confusion matrix
# y_pred = model.predict(X_test)

# confusion_matrix(y_test, y_pred)

In [116]:
# Create a DataFrame from the confusion matrix.
# COU_cm = confusion_matrix(y_test, y_pred)

# COU_cm_df = pd.DataFrame(
#     COU_cm, 
#     index=["Actual Action", "Actual Shooter", "Actual Sports", "Actual Role-Playing", "Actual Racing", "Actual Platform", "Actual Misc", "Actual Adventure", "Actual Fighting", "Actual Strategy", "Actual Simulation", "Actual Puzzle"], 
#     columns=["Predicted Action", "Predicted Shooter", "Predicted Sports", "Predicted Role-Playing", "Predicted Racing", "Predicted Platform", "Predicted Misc", "Predicted Adventure", "Predicted Fighting", "Predicted Strategy", "Predicted Simulation", "Predicted Puzzle"])

# COU_cm_df

In [117]:
# dfi.export(Undersampling_cm_df, "analysis/Combination (Over and Under) Sampling by Genre.png")

In [118]:
# Print the imbalanced classification report
# print("Combination (Over and Under) Sampling by Genre")
# print("\n")
# print(f"Accuracy: { balanced_accuracy_score(y_test, y_pred) * 100:.2f}% \n\n")
# print(classification_report_imbalanced(y_test, y_pred))