In [1]:
import pandas as pd

# Load the user ratings table from rating.csv and preview its first rows.
ratings_df = pd.read_csv("rating.csv")
ratings_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [2]:
# Distinct values found in the rating column (note the -1 entries, which are
# filtered out before the top-5 selection further down).
ratings_df["rating"].unique()

array([-1, 10,  8,  6,  9,  7,  3,  5,  4,  1,  2])

In [3]:
# (number of rows, number of columns) of the ratings table.
ratings_df.shape

(7813737, 3)

In [4]:
# Load the anime metadata table from anime.csv and preview its first rows.
anime_df = pd.read_csv("anime.csv")
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
# Flatten the genre column into individual genre names: every distinct value
# is a ", "-separated string; non-string values (missing genre) are skipped.
genres_lst = [
    genre
    for grp in anime_df.genre.unique()
    if isinstance(grp, str)
    for genre in grp.split(", ")
]

# De-duplicate the names, then sort the resulting array in place.
genres = pd.DataFrame({"genres": genres_lst})["genres"].unique()
genres.sort()
genres

In [7]:
# Attach each anime's genre string to every rating row.
# how="inner" is the pandas default, spelled out here on purpose: rating rows
# whose anime_id does not appear in anime_df are dropped by this join.
# reset_index(drop=True) yields a plain 0..n-1 index directly instead of first
# materialising an "index" column and then deleting it again.
merged_df = pd.merge(
    ratings_df[["user_id", "anime_id", "rating"]],
    anime_df[["anime_id", "genre"]],
    on="anime_id",
    how="inner",
).reset_index(drop=True)

In [10]:
# Preparation for preference generation:
# drop the rating rows whose value is -1, i.e. entries that carry no actual
# score. The surviving rows keep their original index labels, which the
# top-5 selection below uses to look rows up again.
merged_df = merged_df.loc[merged_df["rating"].ne(-1)]
merged_df.head()

Unnamed: 0,user_id,anime_id,rating,genre
1,3,20,8,"Action, Comedy, Martial Arts, Shounen, Super P..."
2,5,20,6,"Action, Comedy, Martial Arts, Shounen, Super P..."
5,21,20,8,"Action, Comedy, Martial Arts, Shounen, Super P..."
6,28,20,9,"Action, Comedy, Martial Arts, Shounen, Super P..."
7,34,20,9,"Action, Comedy, Martial Arts, Shounen, Super P..."


In [11]:
# Select each user's 5 highest-rated anime (fewer if the user rated fewer).
# Caveat: ties are resolved first-come-first-served -- among equal ratings the
# rows that appear first in merged_df are the ones kept.
# nlargest() returns a (user_id, original row label) index; the last level is
# the label used to pull the full rows back out of merged_df.
top5 = merged_df.loc[
    merged_df.groupby("user_id")["rating"].nlargest(5).index.get_level_values(-1)
]
top5.tail()

In [12]:
# Persist the per-user top-rated rows as CSV, without writing the index.
# NOTE(review): "amime" in the file name looks like a typo for "anime"; it is
# left unchanged here because other code may read this exact path -- confirm
# before renaming.
top5.to_csv("top5_amime_per_user.csv", index=False)

Unnamed: 0,user_id,anime_id,rating,genre
262222,73515,4898,10,"Action, Comedy, Demons, Fantasy, Historical, S..."
350078,73515,6707,10,"Action, Comedy, Demons, Fantasy, Shounen, Supe..."
1270186,73515,1535,10,"Mystery, Police, Psychological, Supernatural, ..."
413771,73516,8074,9,"Action, Ecchi, Horror, Supernatural"
4055601,73516,790,9,"Mystery, Psychological, Sci-Fi"


In [20]:
# Lookup table genre name -> position of that genre in `genres`; this position
# is the character index used in the preference strings.
genre_dict = dict(zip(genres, range(len(genres))))

def get_preference(genres, genre_index=None, top_n=8):
    """
    Creates a preference profile for the user from the top_n most frequently
    occurring genres in the given genre strings (in this notebook: the genres
    of the user's top-rated anime). The result has one character per genre in
    genre_index: "1" at the position of every selected genre, "0" elsewhere.

    Input: genres (iterable of str) - one ", "-separated genre string per
               anime, e.g. "Action, Comedy". Entries that are not strings
               (such as the NaN of a missing genre) are skipped, mirroring
               how the genre vocabulary itself is built.
           genre_index (dict, optional) - maps genre name -> position in the
               output string. Defaults to the notebook-level genre_dict.
           top_n (int, optional) - how many of the most frequent genres get
               flagged. Defaults to 8.
    Output: gen_str (string) of length len(genre_index)
    Raises: KeyError if a selected genre is not a key of genre_index
    """
    if genre_index is None:
        # Resolved at call time so the notebook-level mapping is picked up
        # even if it is (re)defined after this function.
        genre_index = genre_dict

    names = [
        name
        for entry in genres
        if isinstance(entry, str)
        for name in entry.split(", ")
    ]

    gen_str = ["0"] * len(genre_index)
    if not names:
        # Nothing to count: the profile is all zeros.
        return "".join(gen_str)

    # value_counts() orders genres by descending frequency.
    top_genres = pd.Series(names).value_counts().head(top_n).index.tolist()
    for name in top_genres:
        gen_str[genre_index[name]] = "1"

    return "".join(gen_str)
        
        
# Build one preference string per user from the genres of that user's
# top-rated anime, then expose user_id as a regular column again.
preference_df = (
    top5.groupby("user_id")["genre"]
    .apply(get_preference)
    .rename("preference")
    .reset_index()
)
preference_df.tail()

Unnamed: 0,user_id,preference
69595,73512,0100101000000000000001001100100000000010000
69596,73513,0001001000000000000000100100100010100010000
69597,73514,0101001010000000100000000100000000000000000
69598,73515,1001010010000000000001000000100010000010000
69599,73516,1000000100000100000001001000100000000010000


In [21]:
# Persist the per-user preference strings as CSV, without writing the index.
# NOTE(review): the preference values consist only of the digits 0/1 and can
# start with "0"; a reader that infers column types would likely parse them as
# integers and lose the leading zeros -- read the column back as text.
preference_df.to_csv("user_preference.csv", index=False)