
# <p style="background-color:lightgray; font-family:verdana; font-size:250%; text-align:center; border-radius: 15px 20px;">🟠Libraries and Data import -- First look at Data o.O 🟠</p>

In [31]:
import numpy as np
import pandas as pd 
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
anime = pd.read_csv("/kaggle/input/anime-recommendations-database/anime.csv")
rating = pd.read_csv("/kaggle/input/anime-recommendations-database/rating.csv")

<div style="border-radius: 10px; border: #6B8E23 solid; padding: 15px; background-color: #F5F5DC; font-size: 100%; text-align: left">

<h3 align="left"><font color='#556B2F'>👀 Features : </font></h3>
    
* **Anime.csv**
    * **anime_id** - myanimelist.net's unique id identifying an anime.
    * **name** - full name of anime.
    * **genre** - comma separated list of genres for this anime.
    * **type** - movie, TV, OVA, etc.
    * **episodes** - how many episodes in this show. (1 if movie).
    * **rating** - average rating out of 10 for this anime.
    * **members** - number of community members that are in this anime's "group".   
* **Rating.csv**
    * **user_id** - non identifiable randomly generated user id.
    * **anime_id** - the anime that this user has rated.
    * **rating** - rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

In [15]:
anime.head().T

Unnamed: 0,0,1,2,3,4
anime_id,32281,5114,28977,9253,9969
name,Kimi no Na wa.,Fullmetal Alchemist: Brotherhood,Gintama°,Steins;Gate,Gintama&#039;
genre,"Drama, Romance, School, Supernatural","Action, Adventure, Drama, Fantasy, Magic, Mili...","Action, Comedy, Historical, Parody, Samurai, S...","Sci-Fi, Thriller","Action, Comedy, Historical, Parody, Samurai, S..."
type,Movie,TV,TV,TV,TV
episodes,1,64,51,24,51
animerating,9.37,9.26,9.25,9.17,9.16
members,200630,793665,114262,673572,151266


In [16]:
rating.head().T

Unnamed: 0,0,1,2,3,4
user_id,1,1,1,1,1
anime_id,20,24,79,226,241
userrating,-1,-1,-1,-1,-1


In [3]:
anime.rename(columns={"rating": "animerating"}, inplace=True)
rating.rename(columns={"rating": "userrating"}, inplace=True)

In [4]:
rating = rating[rating["userrating"] != -1]

In [5]:
df = pd.merge(anime, rating, on="anime_id", how="inner",)

In [35]:
df.head().T

Unnamed: 0,0,1,2,3,4
anime_id,32281,32281,32281,32281,32281
name,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.,Kimi no Na wa.
genre,"Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural","Drama, Romance, School, Supernatural"
type,Movie,Movie,Movie,Movie,Movie
episodes,1,1,1,1,1
animerating,9.37,9.37,9.37,9.37,9.37
members,200630,200630,200630,200630,200630
user_id,99,152,244,271,322
userrating,5,10,10,10,10


In [29]:
df.shape

(6337239, 9)

In [31]:
df.isnull().sum()

anime_id        0
name            0
genre          88
type            4
episodes        0
animerating     5
members         0
user_id         0
userrating      0
dtype: int64

In [32]:
anime['name'].value_counts()

name
Shi Wan Ge Leng Xiaohua                           2
Saru Kani Gassen                                  2
Bakabon Osomatsu no Karee wo Tazunete Sansenri    1
Backkom Meogeujan Yeohaeng                        1
Backkom Mission Impossible                        1
                                                 ..
Yoroiden Samurai Troopers Kikoutei Densetsu       1
Yuu☆Yuu☆Hakusho: Mu Mu Hakusho                    1
3-gatsu no Lion meets Bump of Chicken             1
Bannou Bunka Neko-Musume                          1
Yasuji no Pornorama: Yacchimae!!                  1
Name: count, Length: 12292, dtype: int64

In [34]:
anime[anime['name'].isin(['Shi Wan Ge Leng Xiaohua', 'Saru Kani Gassen'])]

Unnamed: 0,anime_id,name,genre,type,episodes,animerating,members
10140,22399,Saru Kani Gassen,Kids,OVA,1,5.23,62
10141,30059,Saru Kani Gassen,Drama,Movie,1,4.75,76
10193,33193,Shi Wan Ge Leng Xiaohua,"Comedy, Parody",ONA,12,6.67,114
10194,33195,Shi Wan Ge Leng Xiaohua,"Action, Adventure, Comedy, Fantasy, Parody",Movie,1,7.07,110


# <p style="background-color:lightgray; font-family:verdana; font-size:250%; text-align:center; border-radius: 15px 20px;">Association Rule Learning</p>

<div style="border-radius:10px; border:#D0C2F0 solid; padding: 15px; background-color: #FFF0F4; font-size:100%; text-align:left">

<h3 align="left"><font color='#5E5273'>🔍     Association rule learning is a machine learning technique that is used to discover interesting relationships, or associations, between variables in large datasets. It is particularly useful for uncovering relationships between different items in transactional data, such as items frequently bought together by customers in a store.</font></h3>


In [6]:
import statsmodels.stats.api as sms
low_conf, up_conf = sms.DescrStatsW(df["anime_id"].value_counts()).tconfint_mean()
print(f"Lower Confidence Interval: {low_conf:.0f}")
print(f"Upper Confidence Interval: {up_conf:.0f}")

Lower Confidence Interval: 603
Upper Confidence Interval: 674


<div style="border-radius:10px; border:#DEB887 solid; padding: 15px; background-color: #FFFAF0; font-size:100%; text-align:left">

<h3 align="left"><font color='#DEB887'>💡 Notes:</font></h3>

 Since there is a lot of low votes, we choose our rating according to lowe confidence interval

In [7]:
rating_counts = df["anime_id"].value_counts()
rare_animes = rating_counts[rating_counts < low_conf].index
df_ = df[~df["anime_id"].isin(rare_animes)]
df_["anime_id"].value_counts()

anime_id
1535     34226
11757    26310
16498    25290
1575     24126
6547     23565
         ...  
7222       605
175        605
7375       604
32668      604
2132       604
Name: count, Length: 2007, dtype: int64

In [8]:
#user - anime  matrix
matrix = df_.groupby(["user_id","anime_id"])["animerating"].count().unstack().notnull()
matrix.head()

anime_id,1,5,6,7,15,16,18,19,20,22,...,32668,32681,32729,32828,32935,32998,33028,33558,34103,34240
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,True,False,True,False,True,False,True,True,...,False,False,False,True,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [14]:
frequent_itemsets = apriori(matrix,min_support=0.1,use_colnames=True,low_memory=True)
frequent_itemsets.sort_values("support", ascending=False)
rules = association_rules(frequent_itemsets,metric="support",min_threshold=0.01)

In [17]:
def arl_recommender(rules_df, product_id, rec=1):
    sorted_rules = rules_df.sort_values("lift", ascending=False)
    recommendation_list = []
    for i, product in enumerate(sorted_rules["antecedents"]):
        for j in list(product):
            if j == product_id :
                for k in list(sorted_rules.iloc[i]["consequents"]):
                    if k not in recommendation_list:
                        recommendation_list.append(k)

    return recommendation_list[0:rec]

In [18]:
df_[["anime_id","name"]][df_["name"].str.contains('Naruto')].drop_duplicates().head(5)

Unnamed: 0,anime_id,name
1946235,28755,Boruto: Naruto the Movie
2444742,16870,The Last: Naruto the Movie
2597857,13667,Naruto: Shippuuden Movie 6 - Road to Ninja
2744301,20,Naruto
3471644,10589,Naruto: Shippuuden Movie 5 - Blood Prison


In [27]:
suggest = arl_recommender(rules,20,3)
suggest

[2904, 1535, 1575]

In [28]:
def check_id(data, id):
    name = data["name"][data["anime_id"] == id].iloc[0]
    return name

In [30]:
for suggest in suggest:
    print(check_id(anime, id = suggest))

Code Geass: Hangyaku no Lelouch R2
Death Note
Code Geass: Hangyaku no Lelouch


# <p style="background-color:lightgray; font-family:verdana; font-size:250%; text-align:center; border-radius: 15px 20px;">Content Based Recommendation</p>

<div style="border-radius:10px; border:#D0C2F0 solid; padding: 15px; background-color: #FFF0F4; font-size:100%; text-align:left">

<h3 align="left"><font color='#5E5273'>🔍 Content-based recommendation is a type of recommendation system in the field of information filtering and machine learning that suggests items to users based on their preferences and interests. It relies on the characteristics of the items themselves rather than on the behavior of other users. The system recommends items that are similar to those the user has liked in the past.

In [32]:
#We removed common words such as 'and,' 'the,' 'on,' 'in' as they do not carry significant meaning.
tfidf = TfidfVectorizer(stop_words="english")

In [34]:
#We replaced the NaNs with blank spaces; NaNs can cause issues in calculations.
anime['genre'] = anime['genre'].fillna('')

In [35]:
#we fit and transform
tfidf_matrix = tfidf.fit_transform(anime['genre'])

In [36]:
tfidf_matrix.shape

(12294, 46)

In [37]:
tfidf.get_feature_names_out()

array(['action', 'adventure', 'ai', 'arts', 'cars', 'comedy', 'dementia',
       'demons', 'drama', 'ecchi', 'fantasy', 'fi', 'game', 'harem',
       'hentai', 'historical', 'horror', 'josei', 'kids', 'life', 'magic',
       'martial', 'mecha', 'military', 'music', 'mystery', 'parody',
       'police', 'power', 'psychological', 'romance', 'samurai', 'school',
       'sci', 'seinen', 'shoujo', 'shounen', 'slice', 'space', 'sports',
       'super', 'supernatural', 'thriller', 'vampire', 'yaoi', 'yuri'],
      dtype=object)

In [38]:
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.29464923, 0.31760665, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.25063144, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [39]:
#The part where we mathematically find which movies are similar to each other using text vectors.
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)

In [41]:
# We took the names to evaluate the scores we calculated, creating a series for each movie with its name and corresponding indices.
indices = pd.Series(anime.index, index=anime['name'])
indices.index.value_counts()

name
Shi Wan Ge Leng Xiaohua                           2
Saru Kani Gassen                                  2
Bakabon Osomatsu no Karee wo Tazunete Sansenri    1
Backkom Meogeujan Yeohaeng                        1
Backkom Mission Impossible                        1
                                                 ..
Yoroiden Samurai Troopers Kikoutei Densetsu       1
Yuu☆Yuu☆Hakusho: Mu Mu Hakusho                    1
3-gatsu no Lion meets Bump of Chicken             1
Bannou Bunka Neko-Musume                          1
Yasuji no Pornorama: Yacchimae!!                  1
Name: count, Length: 12292, dtype: int64

In [42]:
#We retain one of the duplicate movies and remove the others, keeping the most recent one for freshness.
indices = indices[~indices.index.duplicated(keep='last')]

In [43]:
#i choose the best anime ever One piece
anime_index = indices["One Piece"]

In [44]:
cosine_sim[anime_index]

array([0.14071617, 0.5004482 , 0.21511727, ..., 0.        , 0.        ,
       0.        ])

In [47]:
#Created a dataframe called 'sim_score' and retrieve the ones with similarity, evaluating them as scores.
similarity_scores = pd.DataFrame(cosine_sim[anime_index],
                                 columns=["score"])

similarity_scores

Unnamed: 0,score
0,0.140716
1,0.500448
2,0.215117
3,0.000000
4,0.215117
...,...
12289,0.000000
12290,0.000000
12291,0.000000
12292,0.000000


In [49]:
#Retrieve the top 10 anime with the highest scores. Since the anime itself is present in observation 0, we indicate 1-11.
anime_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index
#We are obtaining the titles of the anime with index information.
anime['name'].iloc[anime_indices]

231     One Piece: Episode of Merry - Mou Hitori no Na...
74                                              One Piece
896     One Piece: Episode of Sabo - 3 Kyoudai no Kizu...
1171    One Piece Movie 9: Episode of Chopper Plus - F...
1576                   One Piece: Adventure of Nebulandia
1793                 One Piece Movie 5: Norowareta Seiken
2723    One Piece Movie 3: Chinjuu-jima no Chopper Oukoku
1795              One Piece: Umi no Heso no Daibouken-hen
3202           Dragon Ball Movie 1: Shen Long no Densetsu
1829    One Piece: Oounabara ni Hirake! Dekkai Dekkai ...
Name: name, dtype: object

<img src="https://i.imgur.com/GNnbyOg.png" style ="text-align: center;">