In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.ensemble import BalancedRandomForestClassifier

In [2]:
# Load the cleaned anime CSV (RTR).
# The path is relative to the notebook's working directory.
file_path = '../Data/anime_df.csv'

# Read the file into a DataFrame and preview its first five rows.
clean_anime_df = pd.read_csv(filepath_or_buffer=file_path)
clean_anime_df.head(n=5)


Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,source,members,favorites,rating,sfw,start_year,start_season,genres,demographics,studios
0,5114,Fullmetal Alchemist: Brotherhood,tv,High,1871705,finished_airing,64,Non-Original,2932347,204645,r,True,2009,spring,Action,Shounen,Bones
1,11061,Hunter x Hunter (2011),tv,High,1509622,finished_airing,148,Non-Original,2418883,185178,pg_13,True,2011,fall,Action,Shounen,Madhouse
2,38524,Shingeki no Kyojin Season 3 Part 2,tv,High,1329500,finished_airing,10,Non-Original,1881734,51931,r,True,2019,spring,Action,Shounen,Wit Studio
3,9253,Steins;Gate,tv,High,1252286,finished_airing,24,Non-Original,2269121,173088,pg_13,True,2011,spring,Drama,,White Fox
4,28851,Koe no Katachi,movie,High,1398608,finished_airing,1,Non-Original,2001335,77431,pg_13,True,2016,summer,Drama,Shounen,Kyoto Animation


In [3]:
# Dataset dimensions as (rows, columns).
clean_anime_df.shape

(9451, 17)

In [4]:
# Build the feature frame (feature selection is still being tested).

# Columns left out of the feature set; this includes the target column, "score".
excluded_columns = [
    "anime_id", "title", "score", "rating", "sfw",
    "demographics", "genres", "status", "start_year", "start_season",
]

# Remaining features: scored_by, episodes, source, members, favorites, type, studios.
feature_df = clean_anime_df.drop(columns=excluded_columns)

# X is an alias of feature_df, not a copy.
X = feature_df

In [5]:
# Target: the "score" column of the cleaned dataset.
y = clean_anime_df.loc[:, "score"]

# Show how many rows fall into each score class.
y.value_counts()

Average    5738
High       3713
Name: score, dtype: int64

In [6]:
# One-hot encode the feature frame with pandas' get_dummies,
# then show the resulting (rows, columns) shape.
X_encoded = pd.get_dummies(data=X)
X_encoded.shape

(9451, 696)

In [7]:
# Scaler instance (default settings) used to standardise the encoded features.
data_scaler = StandardScaler()

In [8]:
# Fit the scaler on the encoded feature matrix and transform it in one step.
# NOTE(review): the scaler is fit on every row here, i.e. before any
# train/test split, so its statistics also reflect rows that end up in the
# test set.
X_encoded_scaled = data_scaler.fit_transform(X_encoded)
# Uncomment to peek at the first scaled row:
# X_encoded_scaled[:1]

In [9]:
# Split the *unscaled* encoded features so that the scaler never sees the
# test rows. (X_encoded_scaled was produced by a scaler fit on all rows, so
# splitting it would leak test-set statistics into preprocessing.)
# stratify=y keeps the Average/High score-class ratio the same in both splits;
# random_state=1 makes the partition reproducible.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_encoded, y, random_state=1, stratify=y
)

# Fit the scaler on the training rows only, then apply that same
# transformation to both partitions.
X_train = data_scaler.fit_transform(X_train_unscaled)
X_test = data_scaler.transform(X_test_unscaled)

In [10]:
# Class counts in the training split.
y_train.value_counts()

Average    4303
High       2785
Name: score, dtype: int64

In [11]:
# Train a BalancedRandomForestClassifier on the training split.
# n_estimators=1500 sets the ensemble size; random_state=1 fixes the seed.
brfc_model = BalancedRandomForestClassifier(
    n_estimators=1500,
    random_state=1,
).fit(X_train, y_train)

# Predict the score class for every row of the held-out test split.
predictions = brfc_model.predict(X_test)


In [12]:
# Calculate the balanced accuracy of the test-set predictions.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.8266918478913854

In [13]:
# Build the imbalanced classification report for the test-set predictions,
# then print it as plain text.
report = classification_report_imbalanced(y_true=y_test, y_pred=predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.89      0.80      0.85      0.85      0.83      0.68      1435
       High       0.74      0.85      0.80      0.79      0.83      0.69       928

avg / total       0.83      0.82      0.83      0.82      0.83      0.68      2363



In [20]:
# Rank the encoded features by the fitted model's feature importances.
imp_features = brfc_model.feature_importances_
cols = X_encoded.columns

# Pair each encoded column name with its importance, highest first.
imp_features_df = (
    pd.DataFrame({"feature": cols, "importance": imp_features})
    .sort_values(by="importance", ascending=False)
)

# Show the five most important features.
imp_features_df.head(n=5)

# TODO: add in others

Unnamed: 0,feature,importance
3,favorites,0.225341
2,members,0.193152
0,scored_by,0.188275
1,episodes,0.065925
7,type_ova,0.01524
