In [1]:
pip install imbalanced-learn==0.9.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install scikit-learn==1.0.1

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install sqlalchemy

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install psycopg2

Note: you may need to restart the kernel to use updated packages.


In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.ensemble import BalancedRandomForestClassifier

In [6]:
# Import SQLAlchemy `automap` and other dependencies here
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

In [7]:
# Read the cleaned anime CSV and discard the free-text "title" column
# in a single step, so re-running this cell always starts from the file.
file_path = '../Data/anime_df.csv'
clean_anime_df = pd.read_csv(file_path).drop(columns=["title"])


In [8]:
clean_anime_df.shape

(9451, 16)

In [9]:
# Candidate feature frame (still experimental): everything in
# clean_anime_df except the target ("score") and the columns listed below.
feature_df = clean_anime_df.drop(
    columns=[
        "score",
        "anime_id",
        "sfw",
        "scored_by",
        "episodes",
        "members",
        "favorites",
        "start_year",
    ]
)

feature_df.head()



Unnamed: 0,type,status,source,rating,start_season,genres,demographics,studios
0,tv,finished_airing,Non-Original,r,spring,Action,Shounen,Bones
1,tv,finished_airing,Non-Original,pg_13,fall,Action,Shounen,Madhouse
2,tv,finished_airing,Non-Original,r,spring,Action,Shounen,Wit Studio
3,tv,finished_airing,Non-Original,pg_13,spring,Drama,,White Fox
4,movie,finished_airing,Non-Original,pg_13,summer,Drama,Shounen,Kyoto Animation


In [10]:
feature_df.dtypes

type            object
status          object
source          object
rating          object
start_season    object
genres          object
demographics    object
studios         object
dtype: object

In [11]:
# Collect the names of every object-dtype (categorical) column.
# Note: this rebinds `feature_df` from a DataFrame to a plain list of column
# names, which is how the following cells use it. To re-run this cell,
# first re-run the cell that builds the `feature_df` DataFrame.
feature_df = [
    column for column, dtype in feature_df.dtypes.items() if dtype == "object"
]

feature_df

['type',
 'status',
 'source',
 'rating',
 'start_season',
 'genres',
 'demographics',
 'studios']

In [12]:
# Create a OneHotEncoder instance; sparse=False makes fit_transform return a
# dense array that can be wrapped directly in a DataFrame.
enc = OneHotEncoder(sparse=False)

# Fit and transform the categorical columns named in `feature_df`.
# The source frame's index is reused so that the later index-based merge
# with clean_anime_df lines rows up even if that index is not 0..n-1.
encode_df = pd.DataFrame(
    enc.fit_transform(clean_anime_df[feature_df]),
    index=clean_anime_df.index,
)

# Label the encoded columns as "<column>_<category>".
# get_feature_names_out replaces get_feature_names, which is deprecated
# as of scikit-learn 1.0.
encode_df.columns = enc.get_feature_names_out(feature_df)
encode_df.head()



Unnamed: 0,type_Non-Original,type_movie,type_ona,type_ova,type_special,type_tv,status_currently_airing,status_finished_airing,source_Non-Original,source_original,...,studios_feel.,studios_happyproject,studios_helo.inc,studios_iDRAGONS Creative Studio,studios_ixtl,studios_l-a-unch・BOX,studios_monofilmo,studios_production doA,studios_studio MOTHER,studios_ufotable
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Join the one-hot encoded columns onto the original frame by index, then
# remove the raw categorical columns they replace.
# `columns=` is used instead of a positional axis argument, which pandas
# deprecated in 1.3 and no longer accepts in 2.0.
encoded_app_df = clean_anime_df.merge(
    encode_df, left_index=True, right_index=True
).drop(columns=feature_df)

encoded_app_df.head()

  


Unnamed: 0,anime_id,score,scored_by,episodes,members,favorites,sfw,start_year,type_Non-Original,type_movie,...,studios_feel.,studios_happyproject,studios_helo.inc,studios_iDRAGONS Creative Studio,studios_ixtl,studios_l-a-unch・BOX,studios_monofilmo,studios_production doA,studios_studio MOTHER,studios_ufotable
0,5114,High,1871705,64,2932347,204645,True,2009,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11061,High,1509622,148,2418883,185178,True,2011,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,38524,High,1329500,10,1881734,51931,True,2019,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9253,High,1252286,24,2269121,173088,True,2011,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,28851,High,1398608,1,2001335,77431,True,2016,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:

# Split the preprocessed data into the target array (y) and feature matrix (X).
# `columns=` is used instead of a positional axis argument, which pandas
# deprecated in 1.3 and no longer accepts in 2.0.
# NOTE(review): X keeps every remaining column of encoded_app_df, including
# anime_id, scored_by, episodes, members, favorites, sfw and start_year, which
# the feature-selection cell excluded from feature_df -- confirm that training
# on them is intended.
y = encoded_app_df["score"].values
X = encoded_app_df.drop(columns=["score"]).values

# Hold out a test set; stratify on y so both splits keep the class ratio,
# and fix random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [16]:
# Balanced random forest: build the model, fit it, then predict.
# The model is trained and evaluated on the unscaled X_train / X_test arrays.
brf_model = BalancedRandomForestClassifier(n_estimators=1000, random_state=1)

# fit() returns the estimator itself, so no re-assignment is needed.
brf_model.fit(X_train, y_train)

# Predicted class labels for the held-out test set.
predictions = brf_model.predict(X_test)


In [17]:
# Calculate the balanced accuracy score of the test-set predictions
balanced_accuracy_score(y_true=y_test, y_pred=predictions)

0.8364513246425568

In [18]:
# Show the per-class imbalanced classification report; the report is a
# pre-formatted string, so it is printed rather than displayed as a repr.
print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

                   pre       rec       spe        f1       geo       iba       sup

    Average       0.90      0.81      0.86      0.85      0.84      0.70      1435
       High       0.75      0.86      0.81      0.80      0.84      0.70       928

avg / total       0.84      0.83      0.84      0.83      0.84      0.70      2363

