In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline


In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
anime = pd.read_csv("anime.csv")


In [None]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
anime.isnull().sum()
#returns the number of missing values in the dataset


anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [None]:
anime[anime['episodes']=='Unknown'].head(5)
#We have to deal with unknown episodes as many anime are not finished and currently ongoing


Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
74,21,One Piece,"Action, Adventure, Comedy, Drama, Fantasy, Sho...",TV,Unknown,8.58,504862
252,235,Detective Conan,"Adventure, Comedy, Mystery, Police, Shounen",TV,Unknown,8.25,114702
615,1735,Naruto: Shippuuden,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,Unknown,7.94,533578
991,966,Crayon Shin-chan,"Comedy, Ecchi, Kids, School, Shounen, Slice of...",TV,Unknown,7.73,26267
1021,33157,Tanaka-kun wa Itsumo Kedaruge Specials,"Comedy, School, Slice of Life",Special,Unknown,7.72,5400


In [None]:
anime.loc[(anime["genre"]=="Hentai") & (anime["episodes"]=="Unknown"),"episodes"] = "1"
anime.loc[(anime["type"]=="OVA") & (anime["episodes"]=="Unknown"),"episodes"] = "1"

anime.loc[(anime["type"] == "Movie") & (anime["episodes"] == "Unknown")] = "1"
#accessing groups of rows and columns by labels


In [None]:
known_animes = {"Naruto Shippuuden":500, "One Piece":1054,"Detective Conan":1075, "Dragon Ball Super":131,
                "Crayon Shin chan":1122, "Yu Gi Oh Arc V":148}

In [None]:
for k,v in known_animes.items():
    anime.loc[anime["name"]==k,"episodes"] = v

In [None]:
anime["episodes"] = anime["episodes"].map(lambda x:np.nan if x=="Unknown" else x)
#not a number


In [None]:
anime["episodes"].fillna(anime["episodes"].median(),inplace = True)
#nan not specified filled with some info

In [None]:
pd.get_dummies(anime[["type"]]).head()
#for converting categorical features to dataframes with dummy/indicator variables.



Unnamed: 0,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


In [None]:
anime["rating"] = anime["rating"].astype(float)
anime["rating"].fillna(anime["rating"].median(),inplace = True)
anime["members"] = anime["members"].astype(float)


In [None]:
# Scaling

anime_features = pd.concat([anime["genre"].str.get_dummies(sep=","),
                            pd.get_dummies(anime[["type"]]),
                            anime[["rating"]],anime[["members"]],anime["episodes"]],axis=1)
anime["name"] = anime["name"].map(lambda name:re.sub('[^A-Za-z0-9]+', " ", name))
anime_features.head()


Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [None]:
anime_features.columns


Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', '1', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery',
       'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School',
       'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
       'Sports', 'Super Power', 'Supernat

To address potential bias in the distance metric of k-nearest neighbors (KNN) caused by features with varying scales, I opted to utilize the MinMaxScaler from scikit-learn. This scaler ensures that values are scaled to a range between 0 and 1, preventing features with larger numbers from dominating the distance calculation.

The MinMaxScaler transforms the data using the following formula:



X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min
where min, max = feature_range.
  represent the minimum and maximum values of the feature along the specified axis. The scaling is performed within the range defined by
min
min and
max
max.

This transformation serves as an effective strategy to mitigate the impact of disparate feature scales, providing a viable alternative to standardization (zero mean, unit variance scaling)

In [None]:
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
anime_features = min_max_scaler.fit_transform(anime_features)


In [None]:
from sklearn.neighbors import NearestNeighbors


In [None]:
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(anime_features)


In [None]:
distances, indices = nbrs.kneighbors(anime_features)


In [None]:
def get_index_from_name(name):
    return anime[anime["name"]==name].index.tolist()[0]

In [None]:
all_anime_names = list(anime.name.values)


In [None]:
def get_id_from_partial_name(partial):
    for name in all_anime_names:
        if partial in name:
            print(name,all_anime_names.index(name))

In [None]:
get_index_from_name("Naruto")
distances[841]


array([0.        , 0.19080893, 1.0861954 , 1.41778433, 1.44803221,
       1.47423227, 1.51479076, 1.51983293, 1.52215612, 1.52400017])

In [None]:
def print_similar_animes(query=None,id=None):
    if id:
        for id in indices[id][1:]:
            print(anime.iloc[id]["name"])
    if query:
        found_id = get_index_from_name(query)
        for id in indices[found_id][1:]:
            print(anime.iloc[id]["name"])

In [None]:

print_similar_animes(query="Detective Conan")


Kaitou Joker 2nd Season
Kaitou Joker 3rd Season
Kaitou Joker
Kaitou Joker 4th Season
Meitantei Holmes
Detective Conan Movie 03 The Last Wizard of the Century
Detective Conan Movie 05 Countdown to Heaven
Detective Conan Movie 04 Captured in Her Eyes
Detective Conan Movie 01 The Timed Skyscraper


In [None]:
print_similar_animes(query="Naruto")


Naruto Shippuuden
Katekyo Hitman Reborn 
Bleach
Dragon Ball Z
Boku no Hero Academia
Ben To
Shijou Saikyou no Deshi Kenichi
Dragon Ball Kai
Dragon Ball Super


In [None]:
print_similar_animes(query="One Punch Man")


One Punch Man 2
One Punch Man Road to Hero
One Punch Man Specials
Choujigen Game Neptune The Animation
Tokyo ESP
Kiddy GiRL AND
Genji Tsuushin Agedama
Oh Super Milk chan
Super Milk chan
