In [5]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
anime_df = pd.read_csv("anime.csv")

In [7]:
anime_df.head(5)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [8]:
anime_df.tail(5)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",Unknown,盗墓笔记之秦岭神树,ONA,Unknown,"Apr 4, 2021 to ?",Unknown,...,Unknown,Unknown,Unknown,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",Unknown,見える子ちゃん,TV,Unknown,2021 to ?,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Higurashi:When They Cry – SOTSU,ひぐらしのなく頃に卒,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,1.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",Unknown,ヤマノススメ Next Summit,TV,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
17561,48492,Scarlet Nexus,Unknown,"Action, Fantasy",Unknown,SCARLET NEXUS,TV,Unknown,"Jul, 2021 to ?",Summer 2021,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


In [9]:
anime_df.shape

(17562, 35)

In [10]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [11]:
#Look for null values
anime_df.isnull().sum()

MAL_ID           0
Name             0
Score            0
Genres           0
English name     0
Japanese name    0
Type             0
Episodes         0
Aired            0
Premiered        0
Producers        0
Licensors        0
Studios          0
Source           0
Duration         0
Rating           0
Ranked           0
Popularity       0
Members          0
Favorites        0
Watching         0
Completed        0
On-Hold          0
Dropped          0
Plan to Watch    0
Score-10         0
Score-9          0
Score-8          0
Score-7          0
Score-6          0
Score-5          0
Score-4          0
Score-3          0
Score-2          0
Score-1          0
dtype: int64

In [12]:
#Checking how rows containe the word 'Unknown'
unknown_rows = anime_df[anime_df.apply(lambda row: 'Unknown' in row.values, axis=1)]

# Get the count of rows with 'Unknown' information
num_unknown_rows = len(unknown_rows)

print(num_unknown_rows)

15978


After initially reviewing the data, it became apparent that numerous rows contained missing information, denoted as 'Unknown' in our dataset. To gain a clearer understanding of the extent of this issue, I conducted a mapping exercise, revealing that 15,978 rows were affected by missing information. Unfortunately, this problem impacts a substantial number of rows. Furthermore, since our Recommendation system requires as much information as possible, deleting these rows would result in a system too weak to operate effectively.

On the other hand, utilizing imputation, statistical analysis, or any other technique to replace the missing data poses a risk to both the integrity of the data and the accuracy of the results. Recognizing that similar issues may arise in real-life scenarios, I have chosen to proceed with the Recommendation system using the existing rows. This decision ensures that our system remains robust and capable of providing meaningful results to users.

In [13]:
#Look for duplicates
anime_df.duplicated().sum()

0

In [14]:
#create a new column named Tags
#The column will include important info such as Genre and Type
anime_df['Tags'] = anime_df['Genres'] + anime_df['Type']

In [15]:
#check changes 
anime_df.head(5)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,Tags
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,"Action, Adventure, Comedy, Drama, Sci-Fi, SpaceTV"
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,"Action, Drama, Mystery, Sci-Fi, SpaceMovie"
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,"Action, Sci-Fi, Adventure, Comedy, Drama, Shou..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,"Adventure, Fantasy, Shounen, SupernaturalTV"


In [16]:
new_df = anime_df[['MAL_ID', 'Name', 'Tags']]
new_df.head()

Unnamed: 0,MAL_ID,Name,Tags
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, SpaceTV"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, SpaceMovie"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shou..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, SupernaturalTV"


In [17]:
#Standardised the words we are going to use in our systems by turning them into small cases only
new_df['Tags']=new_df['Tags'].apply(lambda X:X.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags']=new_df['Tags'].apply(lambda X:X.lower())


In [18]:
#check if changes apply
new_df.head()

Unnamed: 0,MAL_ID,Name,Tags
0,1,Cowboy Bebop,"action, adventure, comedy, drama, sci-fi, spacetv"
1,5,Cowboy Bebop: Tengoku no Tobira,"action, drama, mystery, sci-fi, spacemovie"
2,6,Trigun,"action, sci-fi, adventure, comedy, drama, shou..."
3,7,Witch Hunter Robin,"action, mystery, police, supernatural, drama, ..."
4,8,Bouken Ou Beet,"adventure, fantasy, shounen, supernaturaltv"


In [19]:
#Call our vectorizer model
cv = CountVectorizer(max_features= 500, stop_words='english')

In [20]:
#check our df array size
cv.fit_transform(new_df['Tags']).toarray().shape

(17562, 296)

In [21]:
#create an array based on our new_df
vectors = cv.fit_transform(new_df['Tags']).toarray()

In [22]:
vectors[0]

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [23]:
#Check how many feature names we have in our array
len(cv.get_feature_names())



296

In [24]:
#Call our natural processing language model
ps = PorterStemmer()

In [25]:
#create a function that will split our Tags into a list of words
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [26]:
#Create a list and simplify the words present under the Tags column
new_df['Tags'] = new_df['Tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Tags'] = new_df['Tags'].apply(stem)


In [27]:
# computing the cosine similarity matrix for a set of vectors
cosine_similarity(vectors)

array([[1.        , 0.6172134 , 0.85714286, ..., 0.        , 0.18898224,
        0.26726124],
       [0.6172134 , 1.        , 0.6172134 , ..., 0.16666667, 0.        ,
        0.28867513],
       [0.85714286, 0.6172134 , 1.        , ..., 0.        , 0.18898224,
        0.26726124],
       ...,
       [0.        , 0.16666667, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.18898224, 0.        , 0.18898224, ..., 0.        , 1.        ,
        0.        ],
       [0.26726124, 0.28867513, 0.26726124, ..., 0.        , 0.        ,
        1.        ]])

In [28]:
#Check shape of our matrix
cosine_similarity(vectors).shape

(17562, 17562)

In [29]:
#Assigne variable similarity 
similarity = cosine_similarity(vectors)

In [30]:
#Check first col of our array
similarity[0]

array([1.        , 0.6172134 , 0.85714286, ..., 0.        , 0.18898224,
       0.26726124])

In [31]:
#indicate top 5 most similar items based on index 0 values  
sorted(list(enumerate(similarity[0])), reverse= True, key=lambda x:x[1])[1:6]

[(1352, 0.9258200997725515),
 (2016, 0.9258200997725515),
 (2, 0.8571428571428569),
 (927, 0.8571428571428569),
 (1079, 0.8571428571428569)]

In [45]:
#create the recomendation sys function + reverse previous matrix 
def recommend(anime):
    anime_index = new_df[new_df['Name']==anime].index[0]
    distances = similarity[anime_index]
    anime_list = sorted(list(enumerate(distances)), reverse= True, key = lambda x:x[1])[1:6]

    for i in anime_list:
        print(new_df.iloc[i[0]]['Name'])
        

In [46]:
#Test 1
recommend('Naruto')

Naruto: Shippuuden
Dragon Ball Kai (2014)
Dragon Ball Super
Boruto: Naruto Next Generations
Boruto: Jump Festa 2016 Special


In [47]:
#Test 2
recommend('Blood+')

Blade
Tokyo Ghoul
Tokyo Ghoul √A
Tokyo Ghoul:re
Night Walker: Mayonaka no Tantei


In [48]:
#Test 3
recommend('Cowboy Bebop')

Ginga Tetsudou Monogatari
Waga Seishun no Arcadia: Mugen Kidou SSX
Trigun
Generator Gawl
Lost Universe


In [53]:
# Panda test
recommend('Fullmetal Alchemist: Brotherhood')

Fullmetal Alchemist: Brotherhood
Fullmetal Alchemist: The Sacred Star of Milos
Digimon Frontier
Fairy Tail
Fairy Tail (2014)
