In [None]:
#General libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Data cleanup
import re
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

#Misc
from datetime import datetime
import os
import warnings

warnings.filterwarnings('ignore')

# Part 1. Data exploration and processing

## 1.1 Anime dataset

In [None]:
# Load the cleaned anime table; the path is relative to the notebook's working directory.
anime = pd.read_csv('../input/anime_cleaned.csv')

In [None]:
# Quick sanity check: frame dimensions and the available column names.
print(anime.shape)
print(anime.columns)

In [None]:
# Preview the first three rows.
anime.head(3)

In [None]:
# Drop columns that are redundant or reserved for later feature work.
# Each label is listed exactly once (the original list repeated
# 'opening_theme' and 'ending_theme').
anime.drop(['title_english',   # text features later (original note: "tfdf")
            'title_japanese',  # use later
            'title_synonyms',  # use later
            'image_url',       # scrape the images and use later
            'status',          # covered by the bool column 'airing'
            'aired_string',    # same info as 'aired'
            'aired',           # only 'aired_from_year' is used, to calculate the 'age' of a series
            'duration',        # cleaner version available as 'duration_min'
            'rank',            # use later, has missing values
            'background',      # use later - even better, scrape synopses and do text analysis
            'premiered',       # cleaner version available as 'aired_from_year'
            'broadcast',       # use later
            'opening_theme',   # use later - parse out the artist
            'ending_theme',    # use later
            'related',         # use later
            'licensor',        # use later, has ~20% missing
           ], axis=1, inplace=True)

In [None]:
#Clean up category rating to extract category
def getRating(st):
    """Return the part of a rating string that precedes the first " - ".

    Parameters
    ----------
    st : str
        Raw rating text of the form "<code> - <description>".

    Returns
    -------
    str or float
        The text before the first " - " (the whole string when the separator
        is absent), or np.nan when `st` cannot be split as text, e.g. a
        missing value.
    """
    try:
        return st.split(" - ")[0]
    except (AttributeError, TypeError):
        # Non-text input (NaN, None, ...): report it as missing instead of
        # hiding every possible error behind a bare except.
        return np.nan

# Derive the cleaned rating column, then discard the raw one.
anime['rating_clean'] = anime['rating'].apply(getRating)
anime.drop(columns=['rating'], inplace=True)

In [None]:
# Preview the first rows of the updated frame.
anime.head()

In [None]:
#Producer - add bool column for multiple producers and add column for each
def isMultProducer(st):
    """Flag whether a producer string names more than one producer.

    Parameters
    ----------
    st : str
        Producer text; several producers are separated by commas.

    Returns
    -------
    int
        1 when `st` contains a comma, 0 when it does not, and the sentinel
        9999 when `st` does not support a membership test (e.g. a missing
        value stored as NaN).
    """
    try:
        return 1 if ',' in st else 0
    except TypeError:
        # `in` raises TypeError for non-container input such as NaN or None.
        return 9999
    
# Row-wise flag: 1 = several producers, 0 = one, 9999 = value could not be checked.
anime['hasMultiple_producers'] = anime['producer'].apply(isMultProducer)

In [None]:
#Split producer into one indicator column per producer
# Remove the blank that follows each comma so the names split cleanly.
anime['producer'] = anime['producer'].str.replace(", ", ",")

producer_to_dummies = anime['producer'].str.get_dummies(sep=',').add_prefix('producer_')

# Keep only producers that are set on at least 50 rows.
producer_counts = producer_to_dummies.astype(bool).sum(axis=0)
producer_to_dummies = producer_to_dummies.loc[:, producer_counts >= 50]

# Append the indicators and retire the raw text column.
anime = pd.concat([anime, producer_to_dummies], axis=1).drop(columns=['producer'])

In [None]:
#Split genre into columns
# Normalise the separators first (the producer column gets the same treatment):
# without this, "A, B" produces a ' B' column that is distinct from 'B'.
genre_clean = anime['genre'].str.replace(r'\s*,\s*', ',', regex=True).str.strip()
genre_to_dummies = genre_clean.str.get_dummies(sep=',').add_prefix('genre_')

# Keep only genres that are set on at least 50 rows.
genre_counts = genre_to_dummies.astype(bool).sum(axis=0)
genre_to_dummies = genre_to_dummies.loc[:, genre_counts >= 50]

#Add to main df
anime = pd.concat([anime, genre_to_dummies], axis=1)
anime.drop(['genre'], axis=1, inplace=True)

In [None]:
#Report every column that contains at least one missing value
for col in anime:
    is_missing = anime[col].isna()
    if is_missing.any():
        print(f"Column *{col}* has missing")
        print(is_missing.value_counts())

In [None]:
#Label encode categorical (every object column except the title)
for col in anime:
    if anime[col].dtype == 'object' and col != 'title':
        print(f"Column {col} is object, performing label encoding")
        # Fill missing values on a returned copy and assign the result back:
        # anime[col].fillna(..., inplace=True) is a chained operation that is
        # not guaranteed to modify `anime` itself.
        filled = anime[col].fillna('9999')
        le = LabelEncoder()
        anime[col] = le.fit_transform(filled)
        

In [None]:
#Keep a separate frame linking anime_id to title, then remove both columns
#from the main table [anime_id is temporarily dropped]
anime_idTitleLink = anime.loc[:, ['anime_id', 'title']]
anime = anime.drop(columns=['title', 'anime_id'])

In [None]:
#Scale every column with MinMaxScaler and round the result to two decimals
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(anime)
animeFinal = pd.DataFrame(np.round(scaled_values, 2), columns=anime.columns)

In [None]:
# Re-attach anime_id (as int) to the scaled features and use it as the row index.
anime_ids = anime_idTitleLink['anime_id'].astype(int)
animeFinal = pd.concat([animeFinal, anime_ids], axis=1)
animeFinal['anime_id'] = animeFinal['anime_id'].astype(int)
animeFinal = animeFinal.set_index('anime_id')

In [None]:
#Final anime dataset: print its dimensions, then display the frame
print(animeFinal.shape)
animeFinal

## 1.2 Users

In [None]:
# Load the cleaned users table; the path is relative to the notebook's working directory.
users = pd.read_csv('../input/users_cleaned.csv')

In [None]:
# Preview the first rows of the users table.
users.head()

In [None]:
# Quick sanity check: frame dimensions and the available column names.
print(users.shape)
print(users.columns)

In [None]:
# Remove the columns that are not used in this analysis.
users = users.drop(columns=[
    'username',     # not needed at this time
    'access_rank',  # all missing
])

In [None]:
#Clean up location to split into country and city
import re

# Matches every character that is not an ASCII letter; compiled once rather
# than on each call.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def getLocation(location):
    """Reduce a free-text location to one lowercase, letters-only token.

    The text is split on commas. When there is no comma the whole text is
    used; otherwise the second comma-separated part is used. The chosen part
    is stripped, lower-cased and cleared of every non-ASCII-letter character.

    Parameters
    ----------
    location : str
        Raw location text.

    Returns
    -------
    str or float
        The cleaned token, or np.nan when nothing is left after cleaning or
        when `location` is not text (e.g. a missing value).
    """
    try:
        parts = location.split(',')
    except (AttributeError, TypeError):
        # Non-text input such as NaN or None is treated as missing.
        return np.nan

    chosen = parts[0] if len(parts) == 1 else parts[1]
    cleaned = _NON_LETTERS.sub('', chosen.strip().lower())
    return cleaned if cleaned else np.nan

# Replace each raw location with its cleaned token (NaN when nothing usable remains).
users['location'] = users['location'].apply(getLocation)

In [None]:
#users['location'].value_counts()

In [None]:
#Label encode location
# Assign the filled column back instead of calling fillna(inplace=True) on
# users['location']: that chained call is not guaranteed to modify `users`.
users['location'] = users['location'].fillna('9999')
le = LabelEncoder()
users['location'] = le.fit_transform(users['location'])

In [None]:
#Reduce each date column to its year
dateCols = ['birth_date','join_date','last_online']
users = users.assign(**{col: pd.to_datetime(users[col]).dt.year for col in dateCols})

In [None]:
#Report every column that contains at least one missing value
for col in users:
    is_missing = users[col].isna()
    if is_missing.any():
        print(f"Column *{col}* has missing")
        print(is_missing.value_counts())

In [None]:
#Label encode gender: fit and transform in a single step
le = LabelEncoder()
users['gender'] = le.fit_transform(users['gender'])

In [None]:
# Preview the users table after the clean-up steps above.
users.head()

In [None]:
#usersAnime = pd.read_csv('../input/animelists_cleaned.csv', usecols=['username','anime_id','my_status','my_score'])

In [None]:
#avg_anime_watched = usersAnime.groupby('username')['anime_id'].count()
#avg_anime_watched.describe()

In [None]:
"""
Anime: Initially 6668 cleaned up to 6132
Users: 108711
UsersAnime: Mean 287, max 6536 .. :O

what can be done:

Path 1:
work only with the 6k anime dataset to group similar anime
for each user, for each of their liked anime, recommend similar ones

Path 2:
find user similarities and make a general anime recommendation (tons of data, needs more filtering):
find preferred genre, type, length, etc.
"""

# Part 2: Model building

## 2.1 Content based recommendation

In [None]:
from sklearn.neighbors import NearestNeighbors, BallTree, KDTree

In [None]:
# Build a KD-tree over the rows of animeFinal using Euclidean distance, then
# query it with those same rows for the 10 nearest entries of each row.
# `distances` and `indeces` hold one row of results per queried row.
# NOTE(review): because the tree is queried with its own points, each row's
# neighbor list presumably contains the row itself — confirm.
kdt = KDTree(animeFinal, leaf_size=30, metric='euclidean')
distances, indeces = kdt.query(animeFinal, k=10, return_distance=True)

In [None]:
# Inspect the neighbor distances returned by the query.
distances

In [None]:
def getIdFromName(inputTitle):
    """Look up the anime_id whose title equals `inputTitle` exactly.

    Parameters
    ----------
    inputTitle : str
        Title to search for in `anime_idTitleLink`.

    Returns
    -------
    int or None
        The anime_id of the first exact title match. When there is none, the
        titles that contain `inputTitle` are printed as suggestions and None
        is returned.
    """
    titles = anime_idTitleLink['title']

    exact_ids = anime_idTitleLink.loc[titles == inputTitle, 'anime_id'].values
    # Test the number of matches, not the truth value of the array itself
    # (ambiguous for several matches, invalid for none).
    if len(exact_ids) > 0:
        return exact_ids[0]

    # Literal substring search: titles may contain regex metacharacters such
    # as '(' or '?'. Missing titles never match.
    df_found = anime_idTitleLink[titles.str.contains(inputTitle, regex=False, na=False)]
    print("Choose Id or exact name from below\n", df_found)
    return None

In [None]:
def printNeighbors(indeces, foundId):
    """Print the titles of the neighbors recorded for the anime `foundId`.

    Parameters
    ----------
    indeces : sequence of sequences of int
        Neighbor table: entry i lists the row positions of the neighbors of
        row i. Row positions are assumed to coincide with the index labels
        of `anime_idTitleLink`, as the title lookup below already requires.
    foundId : int
        anime_id to look up in `anime_idTitleLink`.

    Returns
    -------
    None
        Titles are printed, one per line; the anime itself is skipped.
    """
    matches = anime_idTitleLink.index[anime_idTitleLink['anime_id'] == foundId]
    if len(matches) == 0:
        # Unknown id: report it instead of failing with an IndexError.
        print(f"No anime found for Id {foundId}")
        return

    foundIdIndex = matches[0]
    # Read this row's own neighbor list directly. Scanning every list for one
    # that merely contains the row picks up another anime's neighbors, and a
    # truthiness test on the row position would skip row 0.
    for val in indeces[foundIdIndex]:
        if val != foundIdIndex:
            print(anime_idTitleLink.loc[val, 'title'])
    

def getSimiliarByName(indeces, anime_title = None, anime_id = None):
    """Print the neighbors of an anime given by title or by id.

    A non-empty `anime_title` takes precedence and is resolved with
    getIdFromName; otherwise `anime_id` is used as given. The resolved id is
    printed, and printNeighbors is called only when that id is truthy.
    """
    resolved_id = getIdFromName(anime_title) if anime_title else anime_id
    print("Anime Id is", resolved_id)
    if not resolved_id:
        return
    printNeighbors(indeces, resolved_id)

In [None]:
# Example: print the titles recorded as neighbors of 'Bleach'.
getSimiliarByName(indeces,anime_title='Bleach')

In [None]:
# KDTree has no kneighbors_graph method and `X` was never defined, so the
# graph is built with the NearestNeighbors estimator on the same features and
# settings used for the tree (10 neighbors, Euclidean distance).
nn_model = NearestNeighbors(n_neighbors=10, algorithm='kd_tree', metric='euclidean').fit(animeFinal)
# Keep the graph sparse: .toarray() would allocate a dense n x n matrix.
neighbor_graph = nn_model.kneighbors_graph(animeFinal)
neighbor_graph

In [None]:
#Ideas:
"""
-Use the 'related' column to exclude parent stories
-Scrape descriptions/synopses
-Add image features
-Scrape recommendations and build a supervised learning model
"""