In [1]:
pip install imbalanced-learn==0.9.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install scikit-learn==1.0.1

Note: you may need to restart the kernel to use updated packages.


In [3]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given, so nothing is exempt).
warnings.filterwarnings('ignore')

In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [5]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
from imblearn.ensemble import BalancedRandomForestClassifier


In [7]:
# Load the cleaned anime dataset (RTR) from the project repository.
url = 'https://raw.githubusercontent.com/Megreid23/final_project/main/clean_anime.csv'
anime_raw_df = pd.read_csv(url)

# Relabel `source` so that every adapted-work category collapses into a single
# 'Non-Original' class.
# NOTE(review): the 'x' key matches no label in the class counts printed for
# `source` (the minority class still appears as 'original'); it was probably
# meant to be 'original'. It is kept unchanged so the class labels stay the same.
source_relabel = {'x': 'Original'}
source_relabel.update(
    dict.fromkeys(
        ['manga', 'visual_novel', 'light_novel', 'game', 'novel', 'other', '4_koma_manga', 'web_manga'],
        'Non-Original',
    )
)

# Apply the mapping to the `source` column only: a frame-wide replace would
# also rewrite any other column that happens to contain one of these strings.
clean_anime_df = anime_raw_df.assign(
    source=anime_raw_df['source'].replace(source_relabel)
).reset_index(drop=True)

clean_anime_df.head()


Unnamed: 0,anime_id,type,score,scored_by,episodes,start_date,source,rating,sfw,start_season,demographics,studios,producers,licensors,keywords
0,5114,tv,9.13,1871705,64.0,2009-04-05,Non-Original,r,True,spring,['Shounen'],['Bones'],"['Aniplex', 'Square Enix', 'Mainichi Broadcast...","['Funimation', 'Aniplex of America']","['Action', 'Adventure', 'Drama', 'Fantasy', 'M..."
1,11061,tv,9.04,1509622,148.0,2011-10-02,Non-Original,pg_13,True,fall,['Shounen'],['Madhouse'],"['VAP', 'Nippon Television Network', 'Shueisha']",['VIZ Media'],"['Action', 'Adventure', 'Fantasy']"
2,38524,tv,9.07,1329500,10.0,2019-04-29,Non-Original,r,True,spring,['Shounen'],['Wit Studio'],"['Production I.G', 'Dentsu', 'Mainichi Broadca...",['Funimation'],"['Action', 'Drama', 'Gore', 'Military', 'Survi..."
3,9253,tv,9.08,1252286,24.0,2011-04-06,Non-Original,pg_13,True,spring,[],['White Fox'],"['Frontier Works', 'Media Factory', 'Movic', '...",['Funimation'],"['Drama', 'Sci-Fi', 'Suspense', 'Psychological..."
4,28851,movie,8.95,1398608,1.0,2016-09-17,Non-Original,pg_13,True,summer,['Shounen'],['Kyoto Animation'],"['Shochiku', 'Pony Canyon', 'Kodansha', 'ABC A...","['Eleven Arts', 'NYAV Post']","['Drama', 'Romantic Subtext']"


In [8]:
# Create our features (in testing)
# Keep every column except the target (`source`), `demographics` and `anime_id`.
# The former mid-cell `feature_df.value_counts()` call was removed: its result
# was never displayed or stored, so it only cost time.
feature_df = clean_anime_df.drop(columns=["source", "demographics", "anime_id"])

X = feature_df

In [9]:
# Target vector: the `source` label of every row.
y = clean_anime_df.loc[:, "source"]

# Show how many rows fall into each class.
y.value_counts()

Non-Original    7714
original        2892
Name: source, dtype: int64

In [10]:
# One-hot encode the feature frame (pandas expands object/category columns
# into indicator columns and leaves the remaining columns untouched).
X_encoded = pd.get_dummies(data=X)

# (rows, columns) after encoding.
X_encoded.shape

(10606, 13424)

In [11]:
# Scaler used to standardise the encoded feature columns; created with the
# library defaults.
data_scaler = StandardScaler()

In [12]:
# Fit the scaler on X_encoded and return the scaled values in a single step.
# NOTE(review): the fit uses every row of X_encoded, so the learned statistics
# are not restricted to any training subset.
X_encoded_scaled = data_scaler.fit_transform(X_encoded)
# X_encoded_scaled[:1]

In [13]:
# Split the *unscaled* encoded features first (same random_state, hence the
# same row partition as before), then learn the scaling statistics from the
# training rows only. Fitting the scaler on the full dataset before splitting
# lets information about the test rows leak into the training features, so
# X_encoded_scaled (scaled with full-dataset statistics) is deliberately not
# used here.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_encoded, y, random_state=1)

# Fresh scaler fitted on the training split; the test split is only transformed.
data_scaler = StandardScaler()
X_train = data_scaler.fit_transform(X_train_raw)
X_test = data_scaler.transform(X_test_raw)

In [14]:
# Class counts within the training labels.
y_train.value_counts()

Non-Original    5766
original        2188
Name: source, dtype: int64

### Balanced Random Forest Classifier

In [15]:
# Balanced Random Forest Classifier: build, train, predict.

# Build the ensemble (100 trees, fixed seed).
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

# Train on the training split; fit() returns the estimator itself, so the
# name keeps pointing at the same fitted object.
brf_model.fit(X_train, y_train)

# Predicted labels for the held-out split.
predictions = brf_model.predict(X_test)



In [16]:
# Balanced accuracy of the predictions on the held-out split.
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.7583827235392944

In [17]:
# Confusion matrix for the held-out split (rows: true labels, columns: predicted labels).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[1549,  399],
       [ 196,  508]])

In [18]:
# Per-class report for the held-out split; the function returns a formatted
# string, so it is printed rather than displayed.
print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

                    pre       rec       spe        f1       geo       iba       sup

Non-Original       0.89      0.80      0.72      0.84      0.76      0.58      1948
    original       0.56      0.72      0.80      0.63      0.76      0.57       704

 avg / total       0.80      0.78      0.74      0.78      0.76      0.58      2652



In [19]:
# Rank the encoded feature columns by the fitted forest's importance scores,
# largest first.
cols = X_encoded.columns
imp_features = brf_model.feature_importances_

features_df = pd.DataFrame(
    {"feature": cols, "importance": imp_features}
).sort_values(by="importance", ascending=False)

# Ten most important features.
features_df.head(10)

# add in others

Unnamed: 0,feature,importance
2,scored_by,0.056784
1,score,0.044522
0,anime_id,0.039403
3,episodes,0.025623
5355,rating_g,0.016197
6288,studios_['Sunrise'],0.012776
6476,studios_[],0.011058
5360,rating_rx,0.008816
9931,producers_[],0.008563
7,type_ova,0.007894
