# ETL Project

## Step 1 Data Cleaning

In [19]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine

In [167]:
#load data file 1 anime list
anime_file = "resources/anime.csv"
anime_file_pd =  pd.read_csv(anime_file)
#copy the file into a cleaned file for later data cleaning
anime_cleaned=anime_file_pd

In [73]:
#load data file 2 rating per user
rating_file = "resources/rating.csv"
rating_file_pd =  pd.read_csv(rating_file)
#copy the file into a cleaned file for later data cleaning
rating_cleaned=rating_file_pd

In [75]:
#display the anime file
anime_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [76]:
#display the rating file
rating_cleaned.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


### 1) Clean anime file

In [98]:
#check data type of anime
anime_cleaned.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes     object
rating      float64
members       int64
dtype: object

In [78]:
#understanding if there's NA in the column episodes before turning it into a integer
anime_cleaned['episodes'].unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       'Unknown', '39', '101', '47', '50', '62', '33', '112', '23', '3',
       '94', '6', '8', '14', '7', '40', '15', '203', '77', '291', '120',
       '102', '96', '38', '79', '175', '103', '70', '153', '45', '5',
       '21', '63', '52', '28', '145', '36', '69', '60', '178', '114',
       '35', '61', '34', '109', '20', '9', '49', '366', '97', '48', '78',
       '358', '155', '104', '113', '54', '167', '161', '42', '142', '31',
       '373', '220', '46', '195', '17', '1787', '73', '147', '127', '16',
       '19', '98', '150', '76', '53', '124', '29', '115', '224', '44',
       '58', '93', '154', '92', '67', '172', '86', '30', '276', '59',
       '72', '330', '41', '105', '128', '137', '56', '55', '65', '243',
       '193', '18', '191', '180', '91', '192', '66', '182', '32', '164',
       '100', '296', '694', '95', '68', '117', '151', '130',

In [99]:
#drop rows with 'Unknown' in the column episodes
anime_cleaned = anime_cleaned[~anime_file_pd.episodes.str.contains("Unknown")]
anime_cleaned['episodes'].unique()

array(['1', '64', '51', '24', '10', '148', '110', '13', '201', '25', '22',
       '75', '4', '26', '12', '27', '43', '74', '37', '2', '11', '99',
       '39', '101', '47', '50', '62', '33', '112', '23', '3', '94', '6',
       '8', '14', '7', '40', '15', '203', '77', '291', '120', '102', '96',
       '38', '79', '175', '103', '70', '153', '45', '5', '21', '63', '52',
       '28', '145', '36', '69', '60', '178', '114', '35', '61', '34',
       '109', '20', '9', '49', '366', '97', '48', '78', '358', '155',
       '104', '113', '54', '167', '161', '42', '142', '31', '373', '220',
       '46', '195', '17', '1787', '73', '147', '127', '16', '19', '98',
       '150', '76', '53', '124', '29', '115', '224', '44', '58', '93',
       '154', '92', '67', '172', '86', '30', '276', '59', '72', '330',
       '41', '105', '128', '137', '56', '55', '65', '243', '193', '18',
       '191', '180', '91', '192', '66', '182', '32', '164', '100', '296',
       '694', '95', '68', '117', '151', '130', '87', '170

In [100]:
#turn episodes into integer
anime_cleaned['episodes']=pd.to_numeric(anime_cleaned['episodes'])
#confirm the change
anime_cleaned.dtypes

anime_id      int64
name         object
genre        object
type         object
episodes      int64
rating      float64
members       int64
dtype: object

In [95]:
#check descriptives of anime
anime_file_pd.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [101]:
#turn type into category
anime_cleaned['type']=anime_cleaned['type'].astype('category')
anime_cleaned.dtypes

anime_id       int64
name          object
genre         object
type        category
episodes       int64
rating       float64
members        int64
dtype: object

In [91]:
#turn genre into two columns through sparsing to make sure each row has at least one genre
anime_cleaned=anime_cleaned.join(anime_cleaned['genre'].str.split(',', 1, expand=True).add_prefix('genre'))

ValueError: columns overlap but no suffix specified: Index(['genre0', 'genre1'], dtype='object')

In [94]:
#checking if column genre0 has NA
anime_cleaned['genre0'].unique()

array(['Drama', 'Action', 'Sci-Fi', 'Comedy', 'Adventure', 'Fantasy',
       'Mystery', 'Psychological', 'Ecchi', 'Josei', 'Military',
       'Romance', 'Demons', 'Dementia', 'Music', 'Game', 'Cars', 'Mecha',
       'Horror', 'School', 'Historical', 'Kids', 'Shounen', 'Shoujo',
       'Magic', 'Harem', 'Martial Arts', 'Sports', 'Slice of Life',
       'Seinen', nan, 'Parody', 'Police', 'Thriller', 'Supernatural',
       'Samurai', 'Super Power', 'Vampire', 'Space', 'Hentai', 'Yaoi'],
      dtype=object)

In [96]:
#drop the newly created two columns
anime_cleaned=anime_cleaned.drop(['genre0','genre1'],axis=1)
anime_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [102]:
#basic descriptives for the anime file
anime_cleaned.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,11954.0,11954.0,11876.0,11954.0
mean,13589.70512,12.38255,6.480333,18368.56
std,11209.052798,46.865352,1.021995,55110.78
min,1.0,1.0,1.67,5.0
25%,3380.25,1.0,5.89,229.0
50%,9934.5,2.0,6.57,1579.5
75%,23644.5,12.0,7.1825,9755.5
max,34527.0,1818.0,10.0,1013917.0


In [103]:
#drop columns in rating that has null value
anime_cleaned=anime_cleaned.dropna()
anime_cleaned.describe()

Unnamed: 0,anime_id,episodes,rating,members
count,11830.0,11830.0,11830.0,11830.0
mean,13404.150211,12.486729,6.484609,18511.0
std,11110.087616,47.097131,1.019147,55371.44
min,1.0,1.0,1.67,12.0
25%,3326.25,1.0,5.8925,232.25
50%,9820.5,2.0,6.57,1589.5
75%,23302.5,12.0,7.19,9832.0
max,34519.0,1818.0,10.0,1013917.0


In [104]:
#check if there is duplicated anime_id
anime_cleaned[anime_cleaned.duplicated('anime_id')]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members


In [149]:
#reset index
anime_cleaned=anime_cleaned.reset_index(drop=True)

### 2) Clean rating file

In [107]:
#understand data type of rating file
rating_cleaned.dtypes

user_id     int64
anime_id    int64
rating      int64
dtype: object

In [106]:
#understand descriptives of rating file
rating_cleaned.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [111]:
#drop the rows with negative ratings
rating_cleaned=rating_cleaned[rating_cleaned['rating']>=0]

In [112]:
rating_cleaned.describe()

Unnamed: 0,user_id,anime_id,rating
count,6337241.0,6337241.0,6337241.0
mean,36747.91,8902.866,7.808497
std,21013.4,8882.0,1.572496
min,1.0,1.0,1.0
25%,18984.0,1239.0,7.0
50%,36815.0,6213.0,8.0
75%,54873.0,14075.0,9.0
max,73516.0,34475.0,10.0


## Manipulate cleaned anime file for further analysis

In [105]:
anime_cleaned[anime_cleaned.duplicated('name')]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
10141,30059,Saru Kani Gassen,Drama,Movie,1,4.75,76
10194,33195,Shi Wan Ge Leng Xiaohua,"Action, Adventure, Comedy, Fantasy, Parody",Movie,1,7.07,110


In [168]:
#parse and store the genre data into different columns  multiple response

AttributeError: 'DataFrame' object has no attribute 'str'

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351
6,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855
7,820,Ginga Eiyuu Densetsu,"Drama, Military, Sci-Fi, Space",OVA,110,9.11,80679
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",Movie,1,9.10,72534
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",TV,13,9.11,81109


In [145]:
anime_cleaned1 = anime_cleaned[~anime_cleaned.index.duplicated(keep='first')]

In [None]:
anime_cleaned = anime_cleaned.join(anime_cleaned['genre'].str.get_dummies(sep=','))

In [155]:
anime_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [166]:
anime_cleaned.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,'Adventure','Adventure'],'Cars',...,['Shounen'],['Slice of Life',['Slice of Life'],['Space'],['Sports'],['Super Power',['Supernatural'],['Thriller'],['Vampire'],['Yaoi']
0,32281,Kimi no Na wa.,"[Drama, Romance, School, Supernatural]",Movie,1,9.37,200630,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"[Action, Adventure, Drama, Fantasy, Magic, Mil...",TV,64,9.26,793665,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.25,114262,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"[Sci-Fi, Thriller]",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"[Action, Comedy, Historical, Parody, Samurai, ...",TV,51,9.16,151266,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
anime_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Columns: 159 entries, anime_id to ['Yaoi']
dtypes: float64(1), int64(154), object(4)
memory usage: 14.9+ MB


In [133]:
#step 2. create dumy matrix for each genre
anime_cleaned1['genre'].str.split(',\s*').apply(lambda x: pd.Series(1,index=x))


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [135]:
df = pd.DataFrame({'Movies': ['Harry Potter', 'Toy Story'],
                   'Genres': ['fantasy,adventure', 
                              'adventure,animation,children,comedy,fantasy']})
df

Unnamed: 0,Movies,Genres
0,Harry Potter,"fantasy,adventure"
1,Toy Story,"adventure,animation,children,comedy,fantasy"


In [None]:
anime_cleaned