In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
import seaborn as sns

In [2]:
conn = sqlite3.connect("anime.db")
curr = conn.cursor()

In [3]:
query = "SELECT ratings.username, anime.title, ratings.my_score FROM ratings \
INNER JOIN anime ON anime.anime_id = ratings.anime_id \
WHERE anime.scored_by>4999 \
AND anime.episodes>=6 \
AND anime.start_year>1999 \
AND anime.score > (SELECT AVG(score) FROM anime) \
AND anime.duration > 7"

df = pd.read_sql(query, conn)
# Each user should rate at least 10 anime
# Typecast to float32 to save memory
df = df.pivot_table("my_score", "username", "title").dropna(thresh=10, axis=0).astype(np.float32)
# For every user, rate each anime with the user average
df = df.apply(lambda x: x.fillna(x.mean()), axis=0)
# For every user we are subtracting the mean
# df = df.apply(lambda x: x - x.mean(), axis=1)
# For every user subtract the minimum and divide by the maximum
# so we get data in the range of 0-1 which can be interpreted
# as a percentage rating
df = df.apply(lambda x: x - x.min(), axis=1)
df = df.apply(lambda x: x / x.max(), axis=1)
# drop (some) second seasons
df.drop([x for x in df.columns if '2nd' in x], 1, inplace=True)

df.head()

title,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,18if,3-gatsu no Lion,30-sai no Hoken Taiiku,3D Kanojo: Real Girl,...,Zoids Shinseiki/Zero,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
----phoebelyn,0.414166,0.430188,0.441617,0.453597,0.420317,0.344706,0.265262,0.381603,0.38204,0.148032,...,0.506485,0.549878,0.477392,0.427016,0.427934,0.0,0.429056,0.436356,0.519429,0.285577
--AnimeBoy--,0.414166,0.430188,0.441617,0.453597,0.420317,0.344706,0.265262,0.381603,0.38204,0.148032,...,0.506485,0.549878,0.477392,0.427016,0.427934,0.468103,0.429056,0.436356,0.519429,0.285577
--CyberShadow--,0.414166,0.430188,0.441617,0.453597,0.420317,0.344706,0.265262,0.381603,0.38204,0.148032,...,0.506485,0.7,0.477392,0.0,0.427934,0.468103,0.429056,0.436356,0.519429,0.285577
--Etsuko--,0.414166,0.430188,0.441617,0.453597,0.420317,0.344706,0.265262,0.381603,0.38204,0.148032,...,0.506485,0.549878,0.477392,0.8,0.427934,0.468103,0.429056,0.436356,0.519429,0.285577
--FallenAngel--,0.414166,0.7,0.441617,0.453597,0.0,0.344706,0.265262,0.381603,0.38204,0.148032,...,0.506485,0.9,0.477392,0.7,1.0,0.9,0.429056,0.9,0.9,0.285577


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 107837 entries, ----phoebelyn to zzzzz-chan
Columns: 1586 entries, .hack//Roots to ēlDLIVE
dtypes: float32(1586)
memory usage: 653.2+ MB


In [303]:
df2 = pd.read_sql("SELECT title, genre FROM anime", conn)
df2["genre"] = df2["genre"].fillna("No category")
df3 = pd.read_excel("anime_additional_features.xlsx")

genres = [x.split(", ") if x is not None else "No Category" for x in df2["genre"].values ]
unique_genres = list(set([genre if isinstance(genre_list, list) else genre_list
                          for genre_list in genres
                          for genre in genre_list]))

for genre in unique_genres:
    df2[genre] = df2["genre"].apply(lambda x: 1 if genre in x else 0, 1)
    
df2 = df2.set_index("title")
df3.index.name = "title"


joined = df3.join(df2, "title").fillna(0).reset_index()
joined.to_sql("features", conn, index=False, if_exists="replace")

  dtype=dtype)


In [305]:
feats = pd.read_sql("SELECT * FROM features", conn).set_index('title')
feats = feats.loc[df.columns]
feats.head(3)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


Unnamed: 0_level_0,Reviwer1,Reviwer2,Reviwer3,Reviwer4,Overall,Story,Animation,Sound,Character,Enjoyment,...,Hentai,Martial Arts,Fantasy,Yuri,Horror,Shounen,Shoujo,Magic,Music,Romance
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,.hack//Roots is a fine sequel that's kind of b...,.hack//Roots is a 26-episode anime created by....,This anime's about as messy as a landslide. Al...,"To start off, .hack is well-known by fans as a...",5.75,5.75,6.75,6.75,5.25,5.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,.Hack//Sign is a single part in the Project .H...,"To the many people who play them, video games,...","Overview: Before SAO, before Log Horizon, ther...","**THIS REVIEW CONTAINS MAJOR SPOILERS** Okay, ...",7.5,8.0,7.75,9.75,8.0,7.5,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,Story Legend of the Twilight is the next anime...,I&rsquo;d like to start off by saying that I a...,Almsot any anime watcher who stumbles upon .ha...,First off I would like to point out that I hav...,6.0,6.0,7.5,5.5,5.75,5.25,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [322]:
numeric_feats = feats.drop(["Reviwer1", "Reviwer2", "Reviwer3", "Reviwer4", "genre",
                           "Overall", "Story", "Animation", "Sound", "Character",
                            "Enjoyment", "Psychological"], 1)
numeric_feats = numeric_feats.apply(lambda x: x.fillna(x.mean()) 
                            if x.dtype is float else x.fillna(0), 0)
numeric_feats.head()

Unnamed: 0_level_0,Adventure,No category,Kids,Seinen,Dementia,Action,Sports,Sci-Fi,Josei,Historical,...,Hentai,Martial Arts,Fantasy,Yuri,Horror,Shounen,Shoujo,Magic,Music,Romance
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
.hack//Tasogare no Udewa Densetsu,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Since we drop some labels, we should check how the model performs in estimating the scores of those labels. We can do that by masking for where `X==0` and then select only those that the user found as positive, i.e. where `Y>0.5` (or use harsher criteria, e.g. 0.6).

### It turns out the MSE loss is .01883 which is ~2.5x higher than the model reports for this first user (0.00770), but still seems very good.

These are some personal ratings. Note that I don't scale ratings on a typical 0-10 scale but after normalization it won't matter. Generally speaking, a rating of 10 means `awesome`, 9 means `good`, 8 means `slightly better than average`, 7 means `meh`, 5.3 means `boring`, and 3 means `actively disliked it` (and usually dropped).

In [192]:
test = ['Ano Hi Mita Hana no Namae wo Bokutachi wa Mada Shiranai.', 'Angel Beats!', 'Toradora!',
        'Tengen Toppa Gurren Lagann', 'Shigatsu wa Kimi no Uso', 'No Game No Life', 'Death Parade','Hyouka']

In [307]:
me = df[:1].copy()
me[:] = np.nan
me.index = pd.Index(["kmourat"])

# 7
me['Akatsuki no Yona'] = 10
me["Usagi Drop"] = 10
me['Sakurasou no Pet na Kanojo'] = 10
me['Shingeki no Kyojin'] = 10
me['Ao Haru Ride'] = 10
me['Log Horizon'] = 10
me['Yahari Ore no Seishun Love Comedy wa Machigatteiru.'] = 10

# 7
me['Tonari no Kaibutsu-kun'] = 9
me['Samurai Champloo'] = 9
me['One Punch Man'] = 9
me['Ergo Proxy'] = 9
me['Kuzu no Honkai'] = 9
me['Sword Art Online'] = 9
me['Shirobako'] = 9

# 7
me['Brave 10'] = 8
me['Kiss x Sis (TV)'] = 8
me['Black Lagoon'] = 8
me['Servant x Service'] = 8
me['Mahou Shoujo Madoka★Magica'] = 8
me['Guilty Crown'] = 8
me['Sengoku Basara'] = 8

# 7
me["Death Note"] = 7
me['Another'] = 7
me['Blade & Soul'] = 7
me['Boku dake ga Inai Machi'] = 7
me['Trinity Seven'] = 7
me['Hyakka Ryouran: Samurai Girls'] = 7
me['Oniichan dakedo Ai sae Areba Kankeinai yo ne!'] = 7

# 4
me['Elfen Lied'] = 5.3
me['Aoi Bungaku Series'] = 5.3
me['Hentai Ouji to Warawanai Neko.'] = 5.3
me['Fairy Tail'] = 5.3

# 4
me['Sakamoto desu ga?'] = 3
me['Prison School'] = 3
me['Acchi Kocchi (TV)'] = 3
me['Saenai Heroine no Sodatekata'] = 3
me['Hataraki Man'] = 3

In [194]:
given_columns = me[me>0].dropna(1).columns

In [195]:
me = me - me.min(1).values[0]
me = me / me.max(1).values[0]
me = me.fillna(me.mean(1).values[0])

In [326]:
numeric_feats.to_excel("numeric_features.xlsx")

In [329]:
def get_user_preferences(anime_list):
    gen = np.zeros(43)
    for anime in anime_list:
        # get numerical features for each of the anime
        gen += numeric_feats.loc[anime]
                
    return gen / len(anime_list)

def get_suggestions_for_user(anime_list, n=10):
    user_vec = get_user_preferences(anime_list)
    corrs = np.corrcoef(numeric_feats, user_vec)[:-1,-1]
    udf = pd.DataFrame(corrs, columns=["Correlations"], index=numeric_feats.index)
    udf = udf[~udf.index.isin(anime_list)]
    
    return udf.sort_values("Correlations", ascending=False).head(n)


get_suggestions_for_user(['Ao Haru Ride', 'Akatsuki no Yona', 'Lovely★Complex'], 20)

  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0_level_0,Correlations
title,Unnamed: 1_level_1
Ore Monogatari!!,0.903511
Itazura na Kiss,0.903511
Aishiteruze Baby★★,0.857309
Kirarin☆Revolution,0.857309
Skip Beat!,0.857309
Kaichou wa Maid-sama!,0.857309
Ookami Shoujo to Kuro Ouji,0.857309
Special A,0.857309
Nijiiro Days,0.835611
Tonari no Kaibutsu-kun,0.835611


### Let's see... 

#### Seen (ratings) -- remember, 9-10 are good, 8 is about average:
* Fullmetal Alchemist (Brotherhood): 9
* Angel Beats: 10
* No Game no Life: 10
* Steins;Gate: 10
* Toradora: 10
* AnoHana: 10
* Tengen Toppa: 10
* Mirai Nikki: 8
* Shigatsu wa Kimi no Uso: 10
* Black Lagoon: 8 
* K-On!: 9
* Higashi no Eden: 7
* Boku no Hero: 10

#### Not watch (reason)
* Naruto (length)
* Naruto Shippuuden (length)
* Bleach (length)

#### Probably watch:
* Code Geass
* Noragami
* Detroit Metal City
* Nagi no Asakura
* Highschool of the Dead
* Tokyo Ghoul

#### Maybe:
* Owari no Seraph
* Ao no Exorcist
* Ouran Koukou Host Club
* FLCL


## FOR NEURAL NET THE DF SHOULD BE THE OUTPUT (BUT MAYBE ALSO THE INPUT)

# FOR THE DF:
## 1) FILL NaNs WITH USER MEAN
## 2) SUBTRACK USER MEAN
## 3) 

In [112]:
corrs = np.corrcoef(df.values.T)

In [114]:
corrs = pd.DataFrame(corrs, index=df.columns, columns=df.columns)

In [119]:
corrs

title,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,18if,3-gatsu no Lion,30-sai no Hoken Taiiku,3D Kanojo: Real Girl,...,Zoids Shinseiki/Zero,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,ēlDLIVE
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,1.000000,0.408861,0.510519,0.125317,0.121478,0.121293,0.100477,0.052756,0.097276,0.062360,...,0.171196,0.090101,0.107662,0.116172,0.099967,0.095801,0.168859,0.119892,0.135454,0.104547
.hack//Sign,0.408861,1.000000,0.394380,0.116070,0.111772,0.073068,0.072622,0.053656,0.081699,0.032477,...,0.130468,0.090885,0.099261,0.115131,0.094864,0.113829,0.187509,0.136164,0.108062,0.069295
.hack//Tasogare no Udewa Densetsu,0.510519,0.394380,1.000000,0.109444,0.112901,0.136499,0.107431,0.048032,0.107543,0.062511,...,0.191513,0.088041,0.102411,0.110790,0.089492,0.087048,0.172998,0.102928,0.122836,0.109521
07-Ghost,0.125317,0.116070,0.109444,1.000000,0.182043,0.079185,0.064226,0.049087,0.091516,0.035535,...,0.084062,0.141764,0.080974,0.184219,0.087367,0.090414,0.098153,0.134308,0.132061,0.074805
11eyes,0.121478,0.111772,0.112901,0.182043,1.000000,0.091621,0.079307,0.054928,0.128439,0.049402,...,0.082358,0.092086,0.077698,0.146902,0.123897,0.126896,0.096766,0.106576,0.100259,0.089372
12-sai.: Chicchana Mune no Tokimeki,0.121293,0.073068,0.136499,0.079185,0.091621,1.000000,0.272530,0.134526,0.183632,0.179142,...,0.246811,0.076644,0.086020,0.075873,0.071143,0.052886,0.110745,0.045371,0.088458,0.271209
18if,0.100477,0.072622,0.107431,0.064226,0.079307,0.272530,1.000000,0.148321,0.153442,0.198787,...,0.185608,0.068310,0.075650,0.064800,0.057272,0.042699,0.095843,0.047626,0.073444,0.297872
3-gatsu no Lion,0.052756,0.053656,0.048032,0.049087,0.054928,0.134526,0.148321,1.000000,0.077047,0.097071,...,0.069988,0.112090,0.062009,0.038956,0.074530,0.071558,0.044916,0.063950,0.060995,0.147886
30-sai no Hoken Taiiku,0.097276,0.081699,0.107543,0.091516,0.128439,0.183632,0.153442,0.077047,1.000000,0.086225,...,0.149423,0.094207,0.114164,0.094172,0.092793,0.084824,0.095526,0.081070,0.100014,0.141947
3D Kanojo: Real Girl,0.062360,0.032477,0.062511,0.035535,0.049402,0.179142,0.198787,0.097071,0.086225,1.000000,...,0.104775,0.033090,0.040717,0.040051,0.026594,0.017476,0.051059,0.023373,0.037926,0.171364


In [259]:
df2.shape

(14478, 44)

In [261]:
feats.shape

(1587, 55)

In [258]:
def get_user_preferences(anime_list):
    gen = np.zeros(44)
    for anime in anime_list:
        # get genres for each of the anime
        gen += numeric_feats.loc[anime]
        
    return gen / len(anime_list)


def get_suggestions_for_user(anime_list, n=10):
    user_vec = get_user_preferences(anime_list)
    corrs = np.corrcoef(df2, user_vec)[:-1,-1]
    udf = pd.DataFrame(corrs, columns=["Correlations"], index=df2.index)
    udf = udf[~udf.index.isin(anime_list)]
    
    return udf.sort_values("Correlations", ascending=False).head(n)

In [256]:
user = get_suggestions_for_user(["Ao Haru Ride", "Akatsuki no Yona"])
user.head()

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [133]:
anime_list = ["Ao Haru Ride", "Akatsuki no Yona"]

In [148]:
results = corrs[anime_list]

results = results[~results.index.isin(anime_list
                    )].mean(1).sort_values(ascending=False).head(20)
results = pd.DataFrame(results, columns=["kmourat"])
results

Unnamed: 0_level_0,kmourat
title,Unnamed: 1_level_1
Ookami Shoujo to Kuro Ouji,0.274365
Akagami no Shirayuki-hime,0.271796
Gekkan Shoujo Nozaki-kun,0.240971
Soredemo Sekai wa Utsukushii,0.228616
Tonari no Kaibutsu-kun,0.22774
Sukitte Ii na yo.,0.226321
Kamisama Hajimemashita,0.222015
Kamisama Hajimemashita◎,0.214993
Owari no Seraph,0.211014
Bokura wa Minna Kawai-sou,0.205748
