# Capstone Project Data Science (mdavap)
# Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

# Import the dataset

In [2]:
# Read the raw anime table from its relative path and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../dataset/anime.csv")
df.head()

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,popularity,favorites,members,synopsis,season,year,start,ending,studios,genres
0,1,Cowboy Bebop,Cowboy Bebop,TV,Original,26,Finished Airing,False,R - 17+ (violence & profanity),8.75,...,43,85049,1935105,"Crime is timeless. By the year 2071, humanity ...",spring,1998,1998-04-03,1999-04-24,Sunrise,Action;Award Winning;Sci-Fi
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Movie,Original,1,Finished Airing,False,R - 17+ (violence & profanity),8.38,...,635,1661,390938,"Another day, another bounty—such is the life o...",,2001,2001-09-01,,Bones,Action;Sci-Fi
2,6,Trigun,Trigun,TV,Manga,26,Finished Airing,False,PG-13 - Teens 13 or older,8.22,...,259,16535,789238,"Vash the Stampede is the man with a $$60,000,0...",spring,1998,1998-04-01,1998-09-30,Madhouse,Action;Adventure;Sci-Fi
3,7,Witch Hunter Robin,Witch Hunter Robin,TV,Original,26,Finished Airing,False,PG-13 - Teens 13 or older,7.24,...,1921,658,120993,"Though hidden away from the general public, Wi...",summer,2002,2002-07-03,2002-12-25,Sunrise,Action;Drama;Mystery;Supernatural
4,8,Bouken Ou Beet,Beet the Vandel Buster,TV,Manga,52,Finished Airing,False,PG - Children,6.93,...,5557,16,16091,It is the dark century and the people are suff...,fall,2004,2004-09-30,2005-09-29,Toei Animation,Action;Adventure;Fantasy


In [3]:
# Show the entry (or entries) whose `rank` equals 1.
df.loc[df['rank'].eq(1)]

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,popularity,favorites,members,synopsis,season,year,start,ending,studios,genres
23087,52991,Sousou no Frieren,Frieren: Beyond Journey's End,TV,Manga,28,Finished Airing,False,PG-13 - Teens 13 or older,9.31,...,171,60283,995965,During their decade-long quest to defeat the D...,fall,2023,2023-09-29,2024-03-22,Madhouse,Adventure;Drama;Fantasy


# Classification
- Create `success`
    - 0: `Not successful`
    - 1: `Successful` if at least one of the criteria below is met:
        - `rank` is in the top 500
        - `popularity` is in the top 500
        - `score` is at or above `7.5`

In [4]:
def DecideSuccess(anime, max_rank=500, max_popularity=500, min_score=7.5):
    """Flag an anime as successful when it meets at least one criterion.

    An entry is successful if its ``rank`` is at most ``max_rank``, OR its
    ``popularity`` is at most ``max_popularity``, OR its ``score`` is at
    least ``min_score``.

    Parameters
    ----------
    anime : row-like or DataFrame
        Any object indexable by ``'rank'``, ``'popularity'`` and ``'score'``:
        a single row (as passed by ``df.apply(DecideSuccess, axis=1)``), a
        plain dict, or a whole DataFrame for a vectorised evaluation.
    max_rank : number, default 500
        Largest ``rank`` value that still counts as successful.
    max_popularity : number, default 500
        Largest ``popularity`` value that still counts as successful.
    min_score : number, default 7.5
        Smallest ``score`` value that counts as successful.

    Returns
    -------
    bool-like for a single row, or a boolean Series (one value per row)
    when a DataFrame is passed.
    """
    # `|` rather than `or`: it gives the same answer on scalar booleans but
    # also works element-wise on Series, where `or` raises
    # "truth value of a Series is ambiguous".
    # A missing (NaN) value compares as False, so it never satisfies its
    # own criterion.
    return (
        (anime['rank'] <= max_rank)
        | (anime['popularity'] <= max_popularity)
        | (anime['score'] >= min_score)
    )

# Label every title by evaluating DecideSuccess on one row at a time.
df['success'] = df.apply(DecideSuccess, axis='columns')

In [5]:
# Preview the first rows again to confirm the new `success` column.
df.head()

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,favorites,members,synopsis,season,year,start,ending,studios,genres,success
0,1,Cowboy Bebop,Cowboy Bebop,TV,Original,26,Finished Airing,False,R - 17+ (violence & profanity),8.75,...,85049,1935105,"Crime is timeless. By the year 2071, humanity ...",spring,1998,1998-04-03,1999-04-24,Sunrise,Action;Award Winning;Sci-Fi,True
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Movie,Original,1,Finished Airing,False,R - 17+ (violence & profanity),8.38,...,1661,390938,"Another day, another bounty—such is the life o...",,2001,2001-09-01,,Bones,Action;Sci-Fi,True
2,6,Trigun,Trigun,TV,Manga,26,Finished Airing,False,PG-13 - Teens 13 or older,8.22,...,16535,789238,"Vash the Stampede is the man with a $$60,000,0...",spring,1998,1998-04-01,1998-09-30,Madhouse,Action;Adventure;Sci-Fi,True
3,7,Witch Hunter Robin,Witch Hunter Robin,TV,Original,26,Finished Airing,False,PG-13 - Teens 13 or older,7.24,...,658,120993,"Though hidden away from the general public, Wi...",summer,2002,2002-07-03,2002-12-25,Sunrise,Action;Drama;Mystery;Supernatural,False
4,8,Bouken Ou Beet,Beet the Vandel Buster,TV,Manga,52,Finished Airing,False,PG - Children,6.93,...,16,16091,It is the dark century and the people are suff...,fall,2004,2004-09-30,2005-09-29,Toei Animation,Action;Adventure;Fantasy,False


In [6]:
# Count each class by summing boolean masks instead of filtering the frame.
n_successful = int(df['success'].eq(True).sum())
n_not_successful = int(df['success'].eq(False).sum())
print(f"Number of successful anime: {n_successful}")
print(f"Number of not successful anime: {n_not_successful}")

Number of successful anime: 2179
Number of not successful anime: 25791


# Preprocessing

## One hot encoding for genres and studios

In [7]:
# Expand the ';'-separated `genres` and `studios` strings into 0/1 indicator
# columns (one column per distinct value) and append them to the original frame.
df_encoded = (
    df
    .join(df['genres'].str.get_dummies(sep=';'))
    .join(df['studios'].str.get_dummies(sep=';'))
)

In [8]:
# Preview the widened frame with the new indicator columns.
df_encoded.head()

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,studio NAGURI,studio YOG,studio hb,studio2 Animation Lab,team Yamahitsuji,teamKG,trenova,ufotable,uzupiyo Animation & Digital Works,yell
0,1,Cowboy Bebop,Cowboy Bebop,TV,Original,26,Finished Airing,False,R - 17+ (violence & profanity),8.75,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Movie,Original,1,Finished Airing,False,R - 17+ (violence & profanity),8.38,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,Trigun,TV,Manga,26,Finished Airing,False,PG-13 - Teens 13 or older,8.22,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,Witch Hunter Robin,TV,Original,26,Finished Airing,False,PG-13 - Teens 13 or older,7.24,...,0,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,Beet the Vandel Buster,TV,Manga,52,Finished Airing,False,PG - Children,6.93,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Summarise the encoded frame: row/column counts, dtype breakdown and memory use.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27970 entries, 0 to 27969
Columns: 1254 entries, id to yell
dtypes: bool(2), float64(1), int64(1239), object(12)
memory usage: 267.2+ MB


In [10]:
# Turn the `airing` flag into 0/1 integers: truthy values become 1, falsy become 0.
df_encoded['airing'] = df_encoded['airing'].astype(bool).astype('int64')
df_encoded.head()

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,studio NAGURI,studio YOG,studio hb,studio2 Animation Lab,team Yamahitsuji,teamKG,trenova,ufotable,uzupiyo Animation & Digital Works,yell
0,1,Cowboy Bebop,Cowboy Bebop,TV,Original,26,Finished Airing,0,R - 17+ (violence & profanity),8.75,...,0,0,0,0,0,0,0,0,0,0
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,Movie,Original,1,Finished Airing,0,R - 17+ (violence & profanity),8.38,...,0,0,0,0,0,0,0,0,0,0
2,6,Trigun,Trigun,TV,Manga,26,Finished Airing,0,PG-13 - Teens 13 or older,8.22,...,0,0,0,0,0,0,0,0,0,0
3,7,Witch Hunter Robin,Witch Hunter Robin,TV,Original,26,Finished Airing,0,PG-13 - Teens 13 or older,7.24,...,0,0,0,0,0,0,0,0,0,0
4,8,Bouken Ou Beet,Beet the Vandel Buster,TV,Manga,52,Finished Airing,0,PG - Children,6.93,...,0,0,0,0,0,0,0,0,0,0


## Label encoding
- `status`
- `type`
- `source`
- `rating`

In [11]:
# Inspect the distinct `status` values and their frequencies before label encoding.
df_encoded['status'].value_counts()

status
Finished Airing     27034
Not yet aired         544
Currently Airing      392
Name: count, dtype: int64

In [12]:
# Inspect the distinct `type` values and their frequencies before label encoding.
df_encoded['type'].value_counts()

type
TV            8203
Movie         4766
OVA           4157
ONA           3861
Music         3731
Special       1769
TV Special     737
CM             434
PV             231
unknown         81
Name: count, dtype: int64

In [13]:
# Inspect the distinct `source` values and their frequencies before label encoding.
df_encoded['source'].value_counts()

source
Original        11786
Manga            5396
Unknown          2854
Game             1413
Other            1253
Visual novel     1153
Light novel      1151
Novel             805
Web manga         606
4-koma manga      330
Picture book      269
Music             261
Mixed media       223
Book              217
Web novel         163
Card game          76
Radio              14
Name: count, dtype: int64

In [17]:
# Inspect the distinct `rating` values and their frequencies before label encoding.
df_encoded['rating'].value_counts()

rating
PG-13 - Teens 13 or older         9984
G - All Ages                      8761
PG - Children                     4327
Rx - Hentai                       1553
R - 17+ (violence & profanity)    1551
R+ - Mild Nudity                  1197
no_rating                          597
Name: count, dtype: int64

In [None]:
# One dedicated encoder per categorical column, kept under its own name so the
# fitted string-to-integer mapping of each column stays available afterwards.
status_encoder, type_encoder, source_encoder, rating_encoder = (
    LabelEncoder() for _ in range(4)
)

# Fit each encoder on its column and overwrite the column with integer codes.
# NOTE: this replaces the string labels in place, so re-running the cell would
# fit the encoders on the integer codes instead of the original labels.
for column_name, column_encoder in (
    ('status', status_encoder),
    ('type', type_encoder),
    ('source', source_encoder),
    ('rating', rating_encoder),
):
    df_encoded[column_name] = column_encoder.fit_transform(df_encoded[column_name])

In [19]:
# Spot-check one encoded row. A fixed random_state makes the displayed row the
# same on every Restart & Run All instead of changing with each execution.
df_encoded.sample(n=1, random_state=42)

Unnamed: 0,id,title,title_english,type,source,episodes,status,airing,rating,score,...,studio NAGURI,studio YOG,studio hb,studio2 Animation Lab,team Yamahitsuji,teamKG,trenova,ufotable,uzupiyo Animation & Digital Works,yell
12522,34746,Fussa-shi PR Anime,Fussa City Promotion Animation,5,9,2,1,0,0,5.5,...,0,0,0,0,0,0,0,0,0,0


# Modeling

## Split the dataset into training and test sets