In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.ensemble import BalancedRandomForestClassifier

In [2]:
# Import SQLAlchemy `automap` and other dependencies here
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, inspect, func

In [3]:
# Load the data
# import clean anime csv (RTR)
file_path = '../Data/anime_df.csv'
clean_anime_df = pd.read_csv(file_path)

# clean_anime_df.head()

clean_anime_df = clean_anime_df.drop(columns=["title"])


In [4]:
clean_anime_df.shape

(9451, 16)

In [5]:
# Create our features (in testing)

feature_df = clean_anime_df.drop(columns=["score", "anime_id", "sfw", "scored_by", "episodes", 
                                          "members", "favorites", "start_year", "studios", "demographics", "genres"])

feature_df.head()



Unnamed: 0,type,status,source,rating,start_season
0,tv,finished_airing,Non-Original,r,spring
1,tv,finished_airing,Non-Original,pg_13,fall
2,tv,finished_airing,Non-Original,r,spring
3,tv,finished_airing,Non-Original,pg_13,spring
4,movie,finished_airing,Non-Original,pg_13,summer


In [6]:
feature_df.dtypes

type            object
status          object
source          object
rating          object
start_season    object
dtype: object

In [7]:
# Generate our categorical variable lists
feature_df = feature_df.dtypes[feature_df.dtypes == "object"].index.tolist()

feature_df

['type', 'status', 'source', 'rating', 'start_season']

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(clean_anime_df[feature_df]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names(feature_df)
encode_df.head()



Unnamed: 0,type_Non-Original,type_movie,type_ona,type_ova,type_special,type_tv,status_currently_airing,status_finished_airing,source_Non-Original,source_original,rating_g,rating_pg,rating_pg_13,rating_r,rating_r+,rating_rx,start_season_fall,start_season_spring,start_season_summer,start_season_winter
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Merge one-hot encoded features and drop the originals
encoded_app_df = feature_df.merge(encode_df, left_index=True, right_index=True).drop(feature_df, 1)

encoded_app_df.head()

AttributeError: 'list' object has no attribute 'merge'

In [None]:

# Split our preprocessed data into our features and target arrays
y = encoded_app_df["score"].values
X = encoded_app_df.drop(["score"], 1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
# model / fit / predict

# model
brf_model = BalancedRandomForestClassifier(n_estimators= 1000,
                                           random_state=1)
# fit model
brf_model = brf_model.fit(X_train, y_train)

predictions = brf_model.predict(X_test)


In [None]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, predictions)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))