In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
from pylab import rcParams

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# keras imports - comment them out or do `pip install keras`
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding

# gensim for pretrained word2vec model
import gensim

# for synopsis clean up
import string

# list of stopwords used by MySQL in MyISAM
stop_words = ["a's" , "able" , "about" , "above" , "according" , "accordingly" , "across" , "actually" , "after" , "afterwards" , "again" , "against" , "ain't" , "all" , "allow" , "allows" , "almost" , "alone" , "along" , "already" , "also" , "although" , "always" , "am" , "among" , "amongst" , "an" , "and" , "another" , "any" , "anybody" , "anyhow" , "anyone" , "anything" , "anyway" , "anyways" , "anywhere" , "apart" , "appear" , "appreciate" , "appropriate" , "are" , "aren't" , "around" , "as" , "aside" , "ask" , "asking" , "associated" , "at" , "available" , "away" , "awfully" , "be" , "became" , "because" , "become" , "becomes" , "becoming" , "been" , "before" , "beforehand" , "behind" , "being" , "believe" , "below" , "beside" , "besides" , "best" , "better" , "between" , "beyond" , "both" , "brief" , "but" , "by" , "c'mon" , "c's" , "came" , "can" , "can't" , "cannot" , "cant" , "cause" , "causes" , "certain" , "certainly" , "changes" , "clearly" , "co" , "com" , "come" , "comes" , "concerning" , "consequently" , "consider" , "considering" , "contain" , "containing" , "contains" , "corresponding" , "could" , "couldn't" , "course" , "currently" , "definitely" , "described" , "despite" , "did" , "didn't" , "different" , "do" , "does" , "doesn't" , "doing" , "don't" , "done" , "down" , "downwards" , "during" , "each" , "edu" , "eg" , "eight" , "either" , "else" , "elsewhere" , "enough" , "entirely" , "especially" , "et" , "etc" , "even" , "ever" , "every" , "everybody" , "everyone" , "everything" , "everywhere" , "ex" , "exactly" , "example" , "except" , "far" , "few" , "fifth" , "first" , "five" , "followed" , "following" , "follows" , "for" , "former" , "formerly" , "forth" , "four" , "from" , "further" , "furthermore" , "get" , "gets" , "getting" , "given" , "gives" , "go" , "goes" , "going" , "gone" , "got" , "gotten" , "greetings" , "had" , "hadn't" , "happens" , "hardly" , "has" , "hasn't" , "have" , "haven't" , "having" , "he" , "he's" , "hello" , "help" , "hence" , "her" , "here" , "here's" , "hereafter" , "hereby" , "herein" , "hereupon" , "hers" , "herself" , "hi" , "him" , "himself" , "his" , "hither" , "hopefully" , "how" , "howbeit" , "however" , "i'd" , "i'll" , "i'm" , "i've" , "ie" , "if" , "ignored" , "immediate" , "in" , "inasmuch" , "inc" , "indeed" , "indicate" , "indicated" , "indicates" , "inner" , "insofar" , "instead" , "into" , "inward" , "is" , "isn't" , "it" , "it'd" , "it'll" , "it's" , "its" , "itself" , "just" , "keep" , "keeps" , "kept" , "know" , "known" , "knows" , "last" , "lately" , "later" , "latter" , "latterly" , "least" , "less" , "lest" , "let" , "let's" , "like" , "liked" , "likely" , "little" , "look" , "looking" , "looks" , "ltd" , "mainly" , "many" , "may" , "maybe" , "me" , "mean" , "meanwhile" , "merely" , "might" , "more" , "moreover" , "most" , "mostly" , "much" , "must" , "my" , "myself" , "name" , "namely" , "nd" , "near" , "nearly" , "necessary" , "need" , "needs" , "neither" , "never" , "nevertheless" , "new" , "next" , "nine" , "no" , "nobody" , "non" , "none" , "noone" , "nor" , "normally" , "not" , "nothing" , "novel" , "now" , "nowhere" , "obviously" , "of" , "off" , "often" , "oh" , "ok" , "okay" , "old" , "on" , "once" , "one" , "ones" , "only" , "onto" , "or" , "other" , "others" , "otherwise" , "ought" , "our" , "ours" , "ourselves" , "out" , "outside" , "over" , "overall" , "own" , "particular" , "particularly" , "per" , "perhaps" , "placed" , "please" , "plus" , "possible" , "presumably" , "probably" , "provides" , "que" , "quite" , "qv" , "rather" , "rd" , "re" , "really" , "reasonably" , "regarding" , "regardless" , "regards" , "relatively" , "respectively" , "right" , "said" , "same" , "saw" , "say" , "saying" , "says" , "second" , "secondly" , "see" , "seeing" , "seem" , "seemed" , "seeming" , "seems" , "seen" , "self" , "selves" , "sensible" , "sent" , "serious" , "seriously" , "seven" , "several" , "shall" , "she" , "should" , "shouldn't" , "since" , "six" , "so" , "some" , "somebody" , "somehow" , "someone" , "something" , "sometime" , "sometimes" , "somewhat" , "somewhere" , "soon" , "sorry" , "specified" , "specify" , "specifying" , "still" , "sub" , "such" , "sup" , "sure" , "t's" , "take" , "taken" , "tell" , "tends" , "th" , "than" , "thank" , "thanks" , "thanx" , "that" , "that's" , "thats" , "the" , "their" , "theirs" , "them" , "themselves" , "then" , "thence" , "there" , "there's" , "thereafter" , "thereby" , "therefore" , "therein" , "theres" , "thereupon" , "these" , "they" , "they'd" , "they'll" , "they're" , "they've" , "think" , "third" , "this" , "thorough" , "thoroughly" , "those" , "though" , "three" , "through" , "throughout" , "thru" , "thus" , "to" , "together" , "too" , "took" , "toward" , "towards" , "tried" , "tries" , "truly" , "try" , "trying" , "twice" , "two" , "un" , "under" , "unfortunately" , "unless" , "unlikely" , "until" , "unto" , "up" , "upon" , "us" , "use" , "used" , "useful" , "uses" , "using" , "usually" , "value" , "various" , "very" , "via" , "viz" , "vs" , "want" , "wants" , "was" , "wasn't" , "way" , "we" , "we'd" , "we'll" , "we're" , "we've" , "welcome" , "well" , "went" , "were" , "weren't" , "what" , "what's" , "whatever" , "when" , "whence" , "whenever" , "where" , "where's" , "whereafter" , "whereas" , "whereby" , "wherein" , "whereupon" , "wherever" , "whether" , "which" , "while" , "whither" , "who" , "who's" , "whoever" , "whole" , "whom" , "whose" , "why" , "will" , "willing" , "wish" , "with" , "within" , "without" , "won't" , "wonder" , "would" , "wouldn't" , "yes" , "yet" , "you" , "you'd" , "you'll" , "you're" , "you've" , "your" , "yours" , "yourself" , "yourselves" , "zero"]

rcParams['figure.figsize'] = 10, 6
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
df = pd.read_csv("data/tidy_anime.csv")
df.shape

(77911, 28)

In [3]:
desired_cols = ['animeID', 'title_english', 'type', 'source', 'producers', 'genre', 'studio',
               'episodes', 'premiered', 'rating', 'score', 'scored_by', 'rank', 'popularity',
               'members', 'favorites', 'synopsis']
truncated_df = df[desired_cols]

In [4]:
desired_cols = ['animeID', 'title_english', 'type', 'source', 'producers', 'genre', 'studio',
               'episodes', 'premiered', 'rating', 'score', 'scored_by', 'rank', 'popularity',
               'members', 'favorites', 'synopsis']
truncated_df = df[desired_cols]
truncated_df.head()

Unnamed: 0,animeID,title_english,type,source,producers,genre,studio,episodes,premiered,rating,score,scored_by,rank,popularity,members,favorites,synopsis
0,1,Cowboy Bebop,TV,Original,Bandai Visual,Action,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."
1,1,Cowboy Bebop,TV,Original,Bandai Visual,Adventure,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."
2,1,Cowboy Bebop,TV,Original,Bandai Visual,Comedy,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."
3,1,Cowboy Bebop,TV,Original,Bandai Visual,Drama,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."
4,1,Cowboy Bebop,TV,Original,Bandai Visual,Sci-Fi,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."


In [5]:
#change NaN to 0 in Premiered column

truncated_df.loc[:,'premiered'] = truncated_df.loc[:,'premiered'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
# filter out bad titles. Only want titles that have an english name

orig_len = len(truncated_df)
filtered_df = truncated_df[truncated_df['title_english'].notnull()]
new_len = len(filtered_df)
print ("removed {} bad anime after filtering for english titled anime only".format(orig_len - new_len))

# drop NaN rows
filtered_df.dropna(inplace=True)
print ("removed {} bad anime after dropping NaN rows".format(new_len - len(filtered_df)))


removed 30430 bad anime after filtering for english titled anime only
removed 8678 bad anime after dropping NaN rows


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [7]:
# currently the anime is duplicated, one row per genre per studio. We need to flatten all to one row
# also need to do this for type, source, producers, rating

all_ids = set(filtered_df['animeID'].unique()) # 1.8K anime IDs
print ("{} unique anime".format(len(all_ids)))

id_genre_mapping = {}
for each_id in all_ids:
    genre_list = list(filtered_df[truncated_df['animeID'] == each_id]['genre'])
    id_genre_mapping[each_id] = genre_list
    
id_studio_mapping = {}
for each_id in all_ids:
    id_studio_mapping[each_id] = list(filtered_df[truncated_df['animeID'] == each_id]['studio'])
    
id_source_mapping = {}
for each_id in all_ids:
    id_source_mapping[each_id] = list(filtered_df[truncated_df['animeID'] == each_id]['source'])
    
id_producers_mapping = {}
for each_id in all_ids:
    id_producers_mapping[each_id] = list(filtered_df[truncated_df['animeID'] == each_id]['producers'])
    
id_rating_mapping = {}
for each_id in all_ids:
    id_rating_mapping[each_id] = list(filtered_df[truncated_df['animeID'] == each_id]['rating'])
    
id_type_mapping = {}
for each_id in all_ids:
    id_type_mapping[each_id] = list(filtered_df[truncated_df['animeID'] == each_id]['type'])

2855 unique anime


  if __name__ == '__main__':
  


In [22]:
# get distinct df, remove duplicates
reduced_df = filtered_df.groupby('animeID').head(1)

In [23]:
# will add 40 columns to the data
all_genres = sorted(list(set([item for sublist in id_genre_mapping.values() for item in sublist])))
all_studios = sorted(list(set([item for sublist in id_studio_mapping.values() for item in sublist])))
all_sources = sorted(list(set([item for sublist in id_source_mapping.values() for item in sublist])))
all_producers = sorted(list(set([item for sublist in id_producers_mapping.values() for item in sublist])))
all_ratings = sorted(list(set([item for sublist in id_rating_mapping.values() for item in sublist])))
all_types = sorted(list(set([item for sublist in id_type_mapping.values() for item in sublist])))

In [24]:
anime_IDs = reduced_df.animeID.tolist()
genres_new = []
studios_new = []
sources_new = []
producers_new = []
ratings_new = []
types_new = []
for each_id in anime_IDs:
    genres_new.append(id_genre_mapping[each_id])
    studios_new.append(id_studio_mapping[each_id])
    sources_new.append(id_source_mapping[each_id])
    producers_new.append(id_producers_mapping[each_id])
    ratings_new.append(id_rating_mapping[each_id])
    types_new.append(id_type_mapping[each_id])
reduced_df.head()

Unnamed: 0,animeID,title_english,type,source,producers,genre,studio,episodes,premiered,rating,score,scored_by,rank,popularity,members,favorites,synopsis
0,1,Cowboy Bebop,TV,Original,Bandai Visual,Action,Sunrise,26.0,Spring 1998,R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460,"In the year 2071, humanity has colonized sever..."
6,5,Cowboy Bebop: The Movie,Movie,Original,Sunrise,Action,Bones,1.0,0,R - 17+ (violence & profanity),8.41,120243,164,449,197791,776,"Another day, another bounty—such is the life o..."
16,6,Trigun,TV,Manga,Victor Entertainment,Action,Madhouse,26.0,Spring 1998,PG-13 - Teens 13 or older,8.3,212537,255,146,408548,10432,"Vash the Stampede is the man with a $$60,000,0..."
22,7,Witch Hunter Robin,TV,Original,Bandai Visual,Action,Sunrise,26.0,Summer 2002,PG-13 - Teens 13 or older,7.33,32837,2371,1171,79397,537,Witches are individuals with special powers li...
28,8,Beet the Vandel Buster,TV,Manga,TV Tokyo,Adventure,Toei Animation,52.0,Fall 2004,PG - Children,7.03,4894,3544,3704,11708,14,It is the dark century and the people are suff...


In [25]:
mlb = MultiLabelBinarizer()
encoded_genres = mlb.fit_transform(genres_new)
encoded_studios = mlb.fit_transform(studios_new)
encoded_sources = mlb.fit_transform(sources_new)
encoded_producers = mlb.fit_transform(producers_new)
encoded_ratings = mlb.fit_transform(ratings_new)
encoded_types = mlb.fit_transform(types_new)

In [26]:
genre_columns_added = encoded_genres.shape[1]

# for col_idx in range(genre_columns_added):
#     reduced_df.insert(len(reduced_df.columns), "genre_{}".format(col_idx+1), encoded_genres[:, col_idx])
    
for col_idx in range(genre_columns_added):
    reduced_df.insert(len(reduced_df.columns), "genre_{}".format(all_genres[col_idx]), encoded_genres[:, col_idx])

for col_idx in range(encoded_studios.shape[1]):
    reduced_df.insert(len(reduced_df.columns), "studio_{}".format(all_studios[col_idx]), encoded_studios[:, col_idx])
    
for col_idx in range(encoded_sources.shape[1]):
    reduced_df.insert(len(reduced_df.columns), "source_{}".format(all_sources[col_idx]), encoded_sources[:, col_idx])
    
for col_idx in range(encoded_producers.shape[1]):
    reduced_df.insert(len(reduced_df.columns), "producer_{}".format(all_producers[col_idx]), encoded_producers[:, col_idx])
    
for col_idx in range(encoded_ratings.shape[1]):
    reduced_df.insert(len(reduced_df.columns), "rating_{}".format(all_ratings[col_idx]), encoded_ratings[:, col_idx])
    
for col_idx in range(encoded_types.shape[1]):
    reduced_df.insert(len(reduced_df.columns), "type_{}".format(all_types[col_idx]), encoded_types[:, col_idx])

In [27]:
reduced_df = reduced_df.drop(['type', 'source', 'producers', 'genre', 'studio', 'rating'], axis = 1)
reduced_df.to_csv(r'data/one_hot_encode_complete.csv')
reduced_df.head()

Unnamed: 0,animeID,title_english,episodes,premiered,score,scored_by,rank,popularity,members,favorites,...,rating_PG - Children,rating_PG-13 - Teens 13 or older,rating_R - 17+ (violence & profanity),rating_R+ - Mild Nudity,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,1,Cowboy Bebop,26.0,Spring 1998,8.81,405664,26,39,795733,43460,...,0,0,1,0,0,0,0,0,0,1
6,5,Cowboy Bebop: The Movie,1.0,0,8.41,120243,164,449,197791,776,...,0,0,1,0,1,0,0,0,0,0
16,6,Trigun,26.0,Spring 1998,8.3,212537,255,146,408548,10432,...,0,1,0,0,0,0,0,0,0,1
22,7,Witch Hunter Robin,26.0,Summer 2002,7.33,32837,2371,1171,79397,537,...,0,1,0,0,0,0,0,0,0,1
28,8,Beet the Vandel Buster,52.0,Fall 2004,7.03,4894,3544,3704,11708,14,...,1,0,0,0,0,0,0,0,0,1


In [76]:
#### Decreasing Count of Entries for each column value

#set options to see full dataframe
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

reduced_df.groupby('genre').count().sort_values(by=["animeID"], ascending=False)['animeID']
    #Keep top 15? Add Horror, Historical?

#reduced_df.groupby('studio').count().sort_values(by=["animeID"], ascending=False)['animeID']
    #Keep studios with 20+ records?

#reduced_df.groupby('source').count().sort_values(by=["animeID"], ascending=False)['animeID']
    #Keep all 15?

#reduced_df.groupby('producers').count().sort_values(by=["animeID"], ascending=False)['animeID']
    #Keep Producers with 30+ records?

#reduced_df.groupby('rating').count().sort_values(by=["animeID"], ascending=False)['animeID']
    #Keep all except 'None'?

genre
Action           1122
Comedy           520 
Adventure        287 
Slice of Life    179 
Drama            136 
Sci-Fi           111 
Mystery          69  
Music            59  
Fantasy          54  
Game             43  
Harem            42  
Military         25  
Ecchi            22  
Kids             21  
Romance          21  
Sports           21  
Magic            16  
Dementia         16  
Historical       16  
Psychological    13  
Horror           12  
Supernatural     9   
Mecha            8   
Demons           8   
Space            6   
School           5   
Cars             3   
Parody           2   
Martial Arts     2   
Seinen           2   
Police           1   
Samurai          1   
Shounen          1   
Josei            1   
Thriller         1   
Name: animeID, dtype: int64

# Encoding textual data - tokenization approach
* May want to look into cleaning all the synopsis first, ex: remove (), lower case, etc (disregarding this for now)

In [28]:
synopsis_list = reduced_df['synopsis'].tolist()
# synopsis_list

In [64]:
VOCAB_SIZE = None
MAX_SEQ_LEN = 0

# find vocab_size
all_words = {}

# DEBUG
debug = False
init = 0
count = 100

for synopsis in synopsis_list:

    # =========================
    if debug:
        if init == count - 1:
            print("Original Synopsis:")
            print(synopsis + "\n")
    # =========================
    
    # deletes synopsis credits e.g. [Written by MAL Rewrite], (Source: ANN)
    if synopsis[-1] == ")":
        idx = synopsis.rfind("(")
        synopsis = synopsis[:idx]
    elif synopsis[-1] == "]":
        idx = synopsis.rfind("[")
        synopsis = synopsis[:idx]
    
    # deletes punctuation
    punctuation = string.punctuation
    for c in punctuation:
        synopsis = synopsis.replace(c, "")
    
    synopsis = synopsis.lower()
    
    # =========================
    if debug:
        if init == count - 1:
            print("Lower Case Synopsis Without Punctuation:")
            print(synopsis + "\n")
    # =========================

    word_list = synopsis.split(" ")
    
    
    # =========================
    if debug:
        if init == count - 1:
            print("Original Word List:")
            print(str(word_list) + "\n")
    # =========================
    
    # remove stop words
    word_list = [x for x in word_list if x not in stop_words]
    
    
    # =========================
    if debug:
        if init == count - 1:
            print("Word List Without Stop Words:")
            print(str(word_list) + "\n")
        init += 1
    # =========================
    
    # find max seq len
    if len(word_list) > MAX_SEQ_LEN:
        MAX_SEQ_LEN = len(word_list)
        sent = word_list

    for ea_word in word_list:
        
        if ea_word in all_words:
            all_words[ea_word] += 1
        else:
            all_words[ea_word] = 1
VOCAB_SIZE = len(all_words.keys())
print ('vocab_size = ', VOCAB_SIZE)
print ('max_seq_len = ', MAX_SEQ_LEN)
# print (MAX_SEQ_LEN, sent)

vocab_size =  21259
max_seq_len =  290


In [63]:
encoded_synopsis = [one_hot(x, VOCAB_SIZE) for x in synopsis_list]
padded_synopsis = pad_sequences(encoded_synopsis, maxlen=MAX_SEQ_LEN, padding='pre')
padded_synopsis.shape

(1867, 290)

In [61]:
# load pretrained google word2vec model
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin.gz', binary=True)

FileNotFoundError: [Errno 2] No such file or directory: './data/GoogleNews-vectors-negative300.bin.gz'

In [None]:
# get averaged word embedding
EMBED_DIM = 300

all_synopsis_vectors = np.empty((0, EMBED_DIM))
print (all_synopsis_vectors.shape)
for each_synopsis in synopsis_list:
    synopsis_words = each_synopsis.split(" ")
    word_count = len(synopsis_words)
    synopsis_sum_vector = np.zeros((EMBED_DIM,))
    for each_word in synopsis_words:
        try:
            synopsis_sum_vector += model[each_word]
        except:
            # word not in pretrained vocab
            pass
    synopsis_avg_vector = (synopsis_sum_vector / word_count).reshape(1, -1)
    all_synopsis_vectors = np.append(all_synopsis_vectors, synopsis_avg_vector, axis=0)
all_synopsis_vectors.shape

In [None]:
synopsis_columns_added = all_synopsis_vectors.shape[1]
for col_idx in range(synopsis_columns_added):
    reduced_df.insert(len(reduced_df.columns), "synopsis_embedded_{}".format(col_idx+1), all_synopsis_vectors[:, col_idx])
reduced_df.head()