In [251]:
#Import package
import re
import pandas as pd
import numpy as np
import warnings
from string import digits
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import statistics
warnings.filterwarnings('ignore')

## Load Data and Preprocessing

### Load data from specific csv files

In [442]:
def read_data(filename1='anime.csv', filename2='anime_with_synopsis.csv', filename3='rating_complete.csv'):
    '''
    This function is used to read the data and combine them.
    :param filename1: data path
    :param filename2: data path
    :param filename3: data path
    :return: Merged DataFrame
    '''
    df1=pd.read_csv(filename1)
    df2=pd.read_csv(filename2)
    df3=pd.read_csv(filename3)
    new_df=df1.merge(df2, on=['MAL_ID','Genres'])
    return new_df

### Preprocessing on raw data

In [446]:
def get_UnknownName(x):
    '''
    This function is used to get all animes with English name "Unknown".
    :param filename1: Raw DataFrame
    :return: list of anime with English name 'Unknown'
    '''
    Unknown=[]
    for i in range(x.shape[0]):
        if x['English name'][i] == 'Unknown':
            Unknown.append(i)
        else:
            continue
    return Unknown

In [453]:
def create_soup(x):
    '''
    This function is used to combine features 'Genres', 'Type' and 'Producers'.
    :param x: sentence 
    :return: combined new feature 'soup'
    '''
    return  ' '.join(x['Genres']) + ',' + ' '.join(x['Type']) + ',' + ' '.join(x['Producers']) #+ str(x['sypnopsis'])

In [454]:
def bag_words(x):
    '''
    This function is used to combine all features together.
    :param x: merged data
    :return: combined new feature 'word_features'
    '''
    return ''.join(x['soup']) + str(x['English name'])  + str(x['sypnopsis'])  

In [455]:
def remove_stopwords(x):
    '''
    This function is used to remove stopwords in merged features.
    :param x: merged feature
    :return: combined feature we will use 'word_features_tokenized'
    '''
    sentence_depart=word_tokenize(x)
    stop_words = set(stopwords.words('english'))
    outstr = ''
    for word in sentence_depart:
        if word not in stop_words:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr

In [463]:
def preprocessing(raw_data, features=['MAL_ID','English name','Genres','Type','Producers','sypnopsis'], user_id=64807):
    '''
    This function is used to do preprocessing work.
    :param raw_data: combined data of csv file
    :param features: features we will use in our recommender system
    :param user_id: the target user id
    :return: data after preprocessing
    '''
    data=raw_data[features]
    
    # Clean Raw Data 
    # Remove Unknown English name
    for i in get_UnknownName(data):
        data=data.drop(i,axis=0)
    
    # Lower all strings of selected features
    features2=['English name','Genres','Type','Producers','sypnopsis']
    for feature in features2:
        data[feature]=data[feature].str.lower()
    
    # Remove space
    data['Producers']=data['Producers'].str.replace(' ', '')
    
    #Delete all numbers, we only use words in our reconnendation system
    table = str.maketrans('', '', digits)
    data['sypnopsis']=data['sypnopsis'].str.translate(table)
    
    #Remove punctuation in feature 'sypnopsis'
    try:
        for i in list(data.index):
            data['sypnopsis'][i]=re.sub('[\W_]+', ' ', data['sypnopsis'][i])
    except:
        "TypeError: expected string or bytes-like object"
    
    #Apply function 'create_soup' to combine features 'Genres', 'Type' and 'Producers'
    data['soup'] = data.apply(create_soup, axis=1)
    data['soup']=data['soup'].str.replace(' ','')
    
    #Apply function 'bag_words' to combine all features together and get new column to store it
    data['words_features'] = data.apply(bag_words, axis=1)
    
    #Remove all ',' i new features
    data['words_features']=data['words_features'].str.replace(',',' ')
    
    #Apply function 'remove_stopwords' to remove stopwords in merged feature and get final merged feature
    data['words_features_tokenized'] = data['words_features'].apply(lambda x : remove_stopwords(x))
    
    #select target user whose id is 64807 and add user's rating into current data to form new dataframe 
    df4=df3[df3['user_id']==user_id]
    rating_df=data.merge(df4, left_on='MAL_ID', right_on='anime_id')
    
    return rating_df
        
    

## Building our system
Here we use word2vec to calculate similarity matrix and implement our recommender system

### calculating similarity

In [459]:
def word2vec_similarity(data):
    '''
    This function is used to use word2vec to get similarity matrix for anime.
    :param data: Processed data
    :return: similarity matrix 
    '''
    #get word vector
    words = []
    for num, vector in enumerate(rating_df['words_features_tokenized'].tolist()):
        vector = [ str(x) for x in vector.split() ]
#     document = TaggededDocument(title, tags=[num])
        words.append(vector)
    
    model=Word2Vec(words,min_count=1,vector_size=100,window=6,workers=4,negative=5,sg=1)
    
    #get targets that will be applied to calculate cosine similarity
    items = []
    # for title in x_train_creative_id:
    for num,vector in enumerate(rating_df['words_features_tokenized'].tolist()):
        ss_product_id = []
        vector1 = [ str(x) for x in vector.split() ]
        for i in vector1:
            ss_product_id.append(model.wv[str(i)])
        ss = sum(ss_product_id)/len(ss_product_id)
    #     print(type(model_dm.infer_vector(title)))
        items.append(ss)
        
    #cosmine Similarity using word2vec, we call it w2v
    #every anime we recommend top 10 similar animes
    w2v = cosine_similarity(items)
    for num,j in enumerate(w2v):
        item = rating_df['English name'][num]
        recs = []
        for i in j.argsort()[::-1][1:10]:
            recs.append(rating_df['English name'][i])
    
    return w2v


### main funciton

In [504]:
def content_recommender(title, cosine_sim=w2v, data=rating_df):
    '''
    This function is used to use word2vec to make recommendations.
    :param title: the input anime
    :param cosine_sim: the similartiry matrix
    :param data: Processed data
    :return: top 10 related animes and their 'English name', 'Genres' and 'Rating'
    '''
    # get the index of the movie that matches the title
    idx = data.index[data['English name']==title].tolist()[0]

    # get the pairwsie similarity scores of all movies with input movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # sort the movies based on the cosine similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # get the scores of the 10 most similar movies (ignore the first movie).
    sim_scores = sim_scores[1:11]

    # get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # return the top 10 most similar movies
    return data['English name'].iloc[movie_indices], data['rating'].iloc[movie_indices], data['Genres'].iloc[movie_indices] 

In [505]:
def recommend(user_id=64807, title='great teacher onizuka', filename1='anime.csv', filename2='anime_with_synopsis.csv', filename3='rating_complete.csv'):
    '''
    This function is the main function of our recommender system.
    :param user_id: the target user id
    :param title: the input anime
    :param filename1: data path
    :param filename2: data path
    :param filename3: data path
    :return: top 10 related animes and their 'English name', 'Genres' and 'Rating'
    '''
    # read raw data
    rating_df = read_data(filename1=filename1, filename2=filename2, filename3=filename3)
    # preprocess raw data
    processed_df = preprocessing(rating_df, user_id=user_id)
    # calculate similarity matrix
    w2v = word2vec_similarity(processed_df)
    # make recommendations
    name, rating, genre = content_recommender(title=title)
    
    return name, rating, genre

### test on user 64807

In [506]:
name, rating, genre = recommend()

In [507]:
for i in range(10):
    print(name.iloc[i],"\n", rating.iloc[i],"\n", genre.iloc[i])
    print("\n")

brother, dear brother 
 9 
 psychological, drama, school, shoujo, shoujo ai


his and her circumstances 
 10 
 comedy, drama, romance, school, shoujo, slice of life


haruka nogizaka's secret 
 7 
 comedy, romance


miss machiko 
 7 
 comedy, ecchi, school


ouran high school host club 
 8 
 comedy, harem, romance, school, shoujo


chihayafuru 
 8 
 drama, game, josei, school, slice of life, sports


tenjho tenge 
 9 
 action, ecchi, martial arts, comedy, super power, school, shounen


encouragement of climb season 2 
 6 
 adventure, comedy, slice of life


citrus 
 6 
 drama, romance, school, shoujo ai


attacker you! 
 6 
 action, romance, shoujo, sports




In [514]:
#get rank padas to apply spearman
new_series=pd.Series([10,9,8,7,6,5,4,3,2,1])
rank_series=pd.concat([rating.reset_index(drop=True),new_series],axis=1)

In [515]:
rank_series.corr('spearman')

Unnamed: 0,rating,0
rating,1.0,0.712242
0,0.712242,1.0


## Evaluation
Here we will compare performence of word2vec and CountVectorizer, and evaluation our system

In [508]:
def recommend_CV(user_id=64807, title='great teacher onizuka', filename1='anime.csv', filename2='anime_with_synopsis.csv', filename3='rating_complete.csv'):
    '''
    This function is the main function of our recommender system.
    :param user_id: the target user id
    :param title: the input anime
    :param filename1: data path
    :param filename2: data path
    :param filename3: data path
    :return: top 10 related animes and their 'English name', 'Genres' and 'Rating'
    '''
    rating_df = read_data(filename1=filename1, filename2=filename2, filename3=filename3)
    processed_df = preprocessing(rating_df, user_id=user_id)
    
    count = CountVectorizer(stop_words='english')
    count_matrix = count.fit_transform(processed_df['words_features_tokenized'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    
    name, rating, genre = content_recommender(title=title, cosine_sim=cosine_sim)
    
    return name, rating, genre

In [509]:
name_CV, rating_CV, genre_CV=recommend_CV()

In [510]:
for i in range(10):
    print(name_CV.iloc[i],"\n", rating_CV.iloc[i],"\n", genre_CV.iloc[i])
    print("\n")

assassination classroom second season 
 8 
 action, comedy, school, shounen


classroom of the elite 
 6 
 slice of life, psychological, drama, school


please teacher! 
 9 
 sci-fi, comedy, drama, romance, school


baka & test - summon the beasts 
 7 
 comedy, romance, school, super power


sayonara, zetsubou-sensei 
 9 
 comedy, parody, school, shounen


nobunaga teacher's young bride 
 3 
 comedy, ecchi, harem, romance, school


azumanga daioh:the animation 
 8 
 slice of life, comedy, school


hello!! kinmoza! 
 6 
 slice of life, comedy, school, seinen


hell teacher nube 
 6 
 action, supernatural, comedy, school, demons, horror, shounen


assassination classroom 
 7 
 action, comedy, school, shounen




### Compare Diversity 

In [511]:
# get numebr of all genres. we can see there are 45 kinds of genres in total
data = read_data()
all_genre=set()
for i in data['Genres']:
    a=word_tokenize(i)
    for j in a:
        if j == ',':
            pass
        else:
            all_genre.add(j)
len(all_genre)

45

In [512]:
#compare original genres and current genres, this is cosine
processed_df = preprocessing(data, user_id=64807)
original_genre=rating_df[processed_df['English name']=='great teacher onizuka']['Genres']
new_genre=set()
sentence=list(genre_CV.values)
for i in sentence:
    a=word_tokenize(i)
    for j in a:
        if j == ',':
            pass
        else:
            new_genre.add(j)
current_genre=[]
new_genre=list(new_genre)
en_stop=set(stopwords.words('english'))
for word in new_genre:
    if word not in en_stop:
        current_genre.append(word)
print(original_genre.values)
x=original_genre.values
a=x.tolist()
a=a[0].split(",")
print("original genres number is ",len(a))
print(current_genre)
print("current genres length is",len(current_genre))

['slice of life, comedy, drama, school, shounen']
original genres number is  5
['psychological', 'supernatural', 'power', 'slice', 'demons', 'super', 'school', 'parody', 'harem', 'shounen', 'action', 'ecchi', 'romance', 'horror', 'seinen', 'sci-fi', 'life', 'drama', 'comedy']
current genres length is 19


In [513]:
#compare original genres and current genres, this is cosine
original_genre=rating_df[processed_df['English name']=='great teacher onizuka']['Genres']
new_genre=set()
sentence=list(genre.values)
for i in sentence:
    a=word_tokenize(i)
    for j in a:
        if j == ',':
            pass
        else:
            new_genre.add(j)
current_genre=[]
new_genre=list(new_genre)
en_stop=set(stopwords.words('english'))
for word in new_genre:
    if word not in en_stop:
        current_genre.append(word)
print(original_genre.values)
x=original_genre.values
a=x.tolist()
a=a[0].split(",")
print("original genres number is ",len(a))
print(current_genre)
print("current genres length is",len(current_genre))

['slice of life, comedy, drama, school, shounen']
original genres number is  5
['sports', 'psychological', 'power', 'slice', 'shoujo', 'super', 'school', 'adventure', 'ai', 'arts', 'harem', 'shounen', 'action', 'ecchi', 'romance', 'josei', 'game', 'martial', 'life', 'drama', 'comedy']
current genres length is 21


### Compare Spearman correlation 

In [516]:
#random select five animes
name=[]
for i in processed_df['English name']:
    name.append(i)
random.seed(5)
name_slice = random.sample(name, 5)
rating_list=[]
for i in name_slice:
    l_name , l_rating, l_genre = recommend(title=i)
    rank_series=pd.concat([l_rating.reset_index(drop=True),new_series],axis=1)
    rating_list.append(rank_series.corr('spearman')[0]['rating'])
mean_value = statistics.mean(rating_list)
mean_value

0.3742494935411322

In [517]:
#random select five animes
name=[]
for i in rating_df['English name']:
    name.append(i)
random.seed(10)
name_slice = random.sample(name, 5)
rating_list=[]
for i in name_slice:
    l_name , l_rating, l_genre =recommend_CV(title=i)
    rank_series=pd.concat([l_rating.reset_index(drop=True),new_series],axis=1)
    rating_list.append(rank_series.corr('spearman')[0]['rating'])
mean_value = statistics.mean(rating_list)
mean_value

0.14680503798279895