Preprocessing Anime dataset from Kaggle

In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib as plt
import re
import string

1. Data preprocessing

1.1 Data Cleaning

In [2]:
# remove unwanted features (columns) from the dataset
anime_df = pd.read_csv("anime.csv")
anime_df.drop(columns=["members"], axis = 1, inplace = True)

#removing unwanted characters from the anime name strings
def text_cleaning(text):
    text = re.sub(r'&quot;', '', text)
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'.hack//', '', text)
    text = re.sub(r'&#039;', '', text)
    text = re.sub(r'A&#039;s', '', text)
    text = re.sub(r'I&#039;', 'I\'', text)
    text = re.sub(r'&amp;', 'and', text)
    text = re.sub(r'Â°', '',text)

    return text

anime_df['name'] = anime_df['name'].apply(text_cleaning)
anime_df.head(10)

Unnamed: 0,anime_id,name,genre,type,episodes,rating
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25
3,9253,SteinsGate,"Sci-Fi, Thriller",TV,24,9.17
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16
5,32935,Haikyuu Karasuno Koukou VS Shiratorizawa Gakue...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15
6,11061,Hunter x Hunter 2011,"Action, Adventure, Shounen, Super Power",TV,148,9.13
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11
8,15335,Gintama Movie Kanketsuhen Yorozuya yo Eien Nare,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.1
9,15417,Gintama039 Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11


In [56]:
anime_df.episodes.replace({'Unknown':np.nan},inplace=True)

In [4]:
user_ratings_df = pd.read_csv("rating.csv")
user_ratings_df.head(10)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
5,1,355,-1
6,1,356,-1
7,1,442,-1
8,1,487,-1
9,1,846,-1


1.2 Merging the datasets

In [5]:
anime_rating = pd.merge(anime_df, user_ratings_df, on='anime_id')
anime_rating.rename(columns={'rating_x':'avg_rating','rating_y':'user_rating'},inplace=True)
anime_rating.head()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,user_id,user_rating
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,99,5
1,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,152,10
2,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,244,10
3,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,271,10
4,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37,278,-1


In [6]:
# anime_id and user_id are both unique values so convert them from int to objects
anime_rating['anime_id'] = anime_rating.anime_id.astype('object')
anime_rating['user_id'] = anime_rating.user_id.astype('object')

1.3 Deal with missing values

In [13]:
# see how many null values are present within our merged dataframe (anime_rating)
missing_value = pd.DataFrame({
    'Missing Value': anime_rating.isnull().sum()
})
display(missing_value)

Unnamed: 0,Missing Value
anime_id,0
name,0
genre,110
type,4
episodes,8
avg_rating,6
user_id,0
user_rating,0


In [11]:
# see how many null values are present within our initial dataset (anime_df)
anime_df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes    340
rating      230
dtype: int64

In [22]:
# remove all null values from both dataframes
anime_rating.dropna(inplace=True)
anime_df.dropna(inplace=True)

anime_rating.isnull().sum()

anime_id       0
name           0
genre          0
type           0
episodes       0
avg_rating     0
user_id        0
user_rating    0
dtype: int64

In [23]:
anime_df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
dtype: int64

1.4 Duplicate Data

In [17]:
duplicate = anime_rating.duplicated(subset=['anime_id','user_id']).sum()
print('There are {} duplicated rows in anime_rating'.format(duplicate))

duplicate = anime_df.duplicated().sum()
print('There are {} duplicated rows in anime_df'.format(duplicate))

There are 7 duplicated rows in anime_rating
There are 0 duplicated rows in anime_df


In [18]:
anime_rating.drop_duplicates(subset=['anime_id','user_id'],inplace=True)

#checking to see if duplicates have been removed
duplicate = anime_rating.duplicated(subset=['anime_id','user_id']).sum()
print('There are {} duplicated rows in anime_rating'.format(duplicate))

There are 0 duplicated rows in anime_rating


1.5 Indexing

In [25]:
anime_rating.shape

(7813600, 8)

In [27]:
# we see that the indexes for the last 5 entries range from 7813722-7813726, but there are only 7813600 
# entries so the indexes need to be reset
anime_rating.reset_index(drop=True,inplace=True)
anime_rating.tail()

Unnamed: 0,anime_id,name,genre,type,episodes,avg_rating,user_id,user_rating
7813595,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,39532,-1
7813596,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,48766,-1
7813597,6133,Violence Gekiga Shin David no Hoshi Inma Densetsu,Hentai,OVA,1,4.98,60365,4
7813598,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,27364,-1
7813599,26081,Yasuji no Pornorama Yacchimae,Hentai,Movie,1,5.46,48766,-1


1.6 Ratings

In [30]:
# user_rating entries with a -1 indicate that a user has not rated that anime yet, so we remove them
anime_rating.user_rating.replace({-1:np.nan},inplace=True)
anime_rating.dropna(inplace=True)

1.7 Categorical Encoding

In [31]:
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating
0,32281,Kimi no Na wa,"Drama, Romance, School, Supernatural",Movie,1,9.37
1,5114,Fullmetal Alchemist Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25
3,9253,SteinsGate,"Sci-Fi, Thriller",TV,24,9.17
4,9969,Gintama039,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16


1.7.1 Genres

In [35]:
genres = anime_df['genre'].str.split(',', expand=True)
genres.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,Drama,Romance,School,Supernatural,,,,,,,,,
1,Action,Adventure,Drama,Fantasy,Magic,Military,Shounen,,,,,,
2,Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen,,,,,,
3,Sci-Fi,Thriller,,,,,,,,,,,
4,Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen,,,,,,


In [43]:
# considering only first four genres columns to limit the dimensions
genres = genres.iloc[:,:4]
genres.columns = ['genre1', 'genre2', 'genre3','genre4']
genres.head()

Unnamed: 0,genre1,genre2,genre3,genre4
0,Drama,Romance,School,Supernatural
1,Action,Adventure,Drama,Fantasy
2,Action,Comedy,Historical,Parody
3,Sci-Fi,Thriller,,
4,Action,Comedy,Historical,Parody


In [57]:
updated_rating = anime_df[:]

# removing the singular genre column to assign a column to each specific genre which 
# will be useful when calculating similarity later on
updated_rating = updated_rating.drop('genre', axis=1) 
updated_rating = pd.concat([updated_rating, genres], axis=1)

updated_rating.episodes=updated_rating.episodes.astype('int')

updated_rating.head()

Unnamed: 0,anime_id,name,type,episodes,rating,genre1,genre2,genre3,genre4
0,32281,Kimi no Na wa,Movie,1,9.37,Drama,Romance,School,Supernatural
1,5114,Fullmetal Alchemist Brotherhood,TV,64,9.26,Action,Adventure,Drama,Fantasy
2,28977,Gintama°,TV,51,9.25,Action,Comedy,Historical,Parody
3,9253,SteinsGate,TV,24,9.17,Sci-Fi,Thriller,,
4,9969,Gintama039,TV,51,9.16,Action,Comedy,Historical,Parody


In [58]:
categorical_df = updated_rating.select_dtypes('object')
numerical_df = updated_rating.select_dtypes('number')

categorical_df.head()

Unnamed: 0,name,type,genre1,genre2,genre3,genre4
0,Kimi no Na wa,Movie,Drama,Romance,School,Supernatural
1,Fullmetal Alchemist Brotherhood,TV,Action,Adventure,Drama,Fantasy
2,Gintama°,TV,Action,Comedy,Historical,Parody
3,SteinsGate,TV,Sci-Fi,Thriller,,
4,Gintama039,TV,Action,Comedy,Historical,Parody


In [59]:
# convert the categorical data into numerical indicators
# here, a 1 indicates an anime has that attribute, a 0 means they do not
dummy_df = pd.get_dummies(categorical_df[['type','genre1','genre2','genre3','genre4']])
dummy_df.head()

Unnamed: 0,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,genre1_Action,genre1_Adventure,genre1_Cars,genre1_Comedy,...,genre4_ Shounen Ai,genre4_ Slice of Life,genre4_ Space,genre4_ Sports,genre4_ Super Power,genre4_ Supernatural,genre4_ Thriller,genre4_ Vampire,genre4_ Yaoi,genre4_ Yuri
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


1.8 Feature Scaling

The dataset consists of features that have a very large range, in particular the numerical features. Examples of this will be shown below.

In order for the machine learning models to interpret these features on the same scale, feature scaling will be necessary.

In [62]:
numerical_df.head()

Unnamed: 0,anime_id,episodes,rating
0,32281,1,9.37
1,5114,64,9.26
2,28977,51,9.25
3,9253,24,9.17
4,9969,51,9.16


In [61]:
print('Min rating: ', min(numerical_df['rating']), 'Max rating: ', max(numerical_df['rating']))
print('Min episodes: ', min(numerical_df['episodes']), 'Max episodes: ', max(numerical_df['episodes']))

Min rating:  1.67 Max rating:  10.0
Min episodes:  1 Max episodes:  1818


In [64]:
# set anime_id as the index so it doesnt get scaled as well
numerical_df.set_index('anime_id',inplace=True)
numerical_df.head()

Unnamed: 0_level_0,episodes,rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
32281,1,9.37
5114,64,9.26
28977,51,9.25
9253,24,9.17
9969,51,9.16


1.8.1 Scaling

In [65]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(numerical_df),columns=numerical_df.columns)

scaled_df.head()

Unnamed: 0,episodes,rating
0,-0.243905,2.831301
1,1.093813,2.723363
2,0.817776,2.713551
3,0.244468,2.63505
4,0.817776,2.625238


In [67]:
dummy_df.reset_index(drop=True,inplace=True)

complete_df = pd.concat([scaled_df, dummy_df], axis = 1)
complete_df.head()

Unnamed: 0,episodes,rating,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,genre1_Action,genre1_Adventure,...,genre4_ Shounen Ai,genre4_ Slice of Life,genre4_ Space,genre4_ Sports,genre4_ Super Power,genre4_ Supernatural,genre4_ Thriller,genre4_ Vampire,genre4_ Yaoi,genre4_ Yuri
0,-0.243905,2.831301,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1.093813,2.723363,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.817776,2.713551,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.244468,2.63505,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.817776,2.625238,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
