In [147]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Data Preprocessing:

Load the dataset into a suitable data structure (e.g., pandas DataFrame).

Handle missing values, if any.

Explore the dataset to understand its structure and attributes.

In [148]:
df = pd.read_csv('anime.csv')
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [149]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [150]:
df['genre'] = df['genre'].fillna(df['genre'].mode()[0])
df['type'] = df['type'].fillna(df['type'].mode()[0])
df['rating'] = df['rating'].fillna(df['rating'].mean())
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [151]:
len(df.name.unique())

12292

In [152]:
len(df.anime_id.unique())

12294

In [153]:
len(df.rating.unique())

599

In [154]:
df.groupby('name')['rating'].count().sort_values(ascending=False)

name
Shi Wan Ge Leng Xiaohua       2
Saru Kani Gassen              2
&quot;0&quot;                 1
Otoshidama                    1
Otome Nadeshiko Koi Techou    1
                             ..
Himegoto                      1
Himekishi Angelica            1
Himekishi Lilia               1
Himekishi Olivia              1
◯                             1
Name: rating, Length: 12292, dtype: int64

In [155]:
df1 = df.pivot_table(index='anime_id',columns='name',values='rating')
df1

name,&quot;0&quot;,"&quot;Aesop&quot; no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,&quot;Eiji&quot;,&quot;Eiyuu&quot; Kaitai,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,...,s.CRY.ed,vivi,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,Üks Uks,ēlDLIVE,◯
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34514,,,,,,,,,,,...,,,,,,,,,,
34519,,,,,,,,,,,...,,,,,,,,,,
34522,,,,,,,,,,,...,,,,,,,,,,
34525,,,,,,,,,,,...,,,,,,,,,,


In [156]:
df2 = df1.fillna(0,axis=1)

## Recommendation System:

Design a function to recommend anime based on cosine similarity.

Given a target anime, recommend a list of similar anime based on cosine similarity scores.

Experiment with different threshold values for similarity scores to adjust the recommendation list size.

In [159]:
df_arr = cosine_similarity(df2)
df_arr

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [160]:
df_new = pd.DataFrame(df_arr)

In [161]:
df_new.index = df.anime_id.unique()
df_new.columns = df.anime_id.unique()

In [162]:
np.fill_diagonal(df_arr,0)

In [163]:
df[(df.anime_id==32281) | (df.anime_id==32281)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630


In [164]:
df[(df.anime_id==5114) | (df.anime_id==32281)]

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665


## Evaluation:

Split the dataset into training and testing sets.

Evaluate the recommendation system using appropriate metrics such as precision, recall, and F1-score.

Analyze the performance of the recommendation system and identify areas of improvement

In [165]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report 

In [213]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
lab_enc = LabelEncoder()

In [244]:
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')

In [245]:
df['type'] = lab_enc.fit_transform(df['type'])
df['genre'] = lab_enc.fit_transform(df['genre'])

In [246]:
target = df[['type']]
feature = df.drop('type',axis=1)

In [247]:
feature

Unnamed: 0,anime_id,genre,episodes,rating,members
0,32281,2686,1.0,9.37,200630
1,5114,161,64.0,9.26,793665
2,28977,534,51.0,9.25,114262
3,9253,3240,24.0,9.17,673572
4,9969,534,51.0,9.16,151266
...,...,...,...,...,...
12289,9316,2903,1.0,4.15,211
12290,5543,2903,1.0,4.28,183
12291,5621,2903,4.0,4.88,219
12292,6133,2903,1.0,4.98,175


In [248]:
target 

Unnamed: 0,type
0,0
1,5
2,5
3,5
4,5
...,...
12289,3
12290,3
12291,3
12292,3


In [249]:
x_train,x_test,y_train,y_test = train_test_split(feature,target,train_size=0.75,random_state=100)

In [250]:
xg_boost = xgb.XGBClassifier(n_estimators=60, learning_rate=0.01, max_depth=3, min_child_weight=2)

In [251]:
xg_boost.fit(x_train,y_train)

In [252]:
y_pred = xg_boost.predict(x_test)

In [253]:
accuracy_score(y_test,y_pred)

0.6870527000650618

In [254]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.52      0.84      0.64       606
           1       0.63      0.78      0.70       118
           2       0.28      0.03      0.05       177
           3       0.64      0.70      0.67       822
           4       0.50      0.02      0.05       409
           5       0.90      0.98      0.94       942

    accuracy                           0.69      3074
   macro avg       0.58      0.56      0.51      3074
weighted avg       0.66      0.69      0.63      3074



## Interview Questions:

1. Can you explain the difference between user-based and item-based collaborative filtering?

2. What is collaborative filtering, and how does it work?

## Answer 1:

In user-based filtering the recommendation is based on the similarites between users, where as the item-based filtering recommends based on the similarites between items.

Answer 2:

The collaborative filtering is a technique that is used to a sugguestion a user might like based on the behaviour of the users.
It has two types:
    1.) User-based collaborative filtering.
    
    2.) Item-based collaborative filtering.
