In [1]:
import pandas as pd

# Load datasets
animes = pd.read_csv('animes.csv')
user_scores = pd.read_csv('user_rates_test.csv')

In [2]:
user_scores.drop(["Username","Anime Title"], axis=1, inplace=True)
animes.drop(["Image URL"], axis=1, inplace=True)

In [3]:
# Merge datasets
df = anime_fulldata=pd.merge(animes,user_scores,on='anime_id',suffixes= ['', '_user'])
anime_fulldata = anime_fulldata.rename(columns={'Name': 'anime_title', 'rating': 'user_rating'})
anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,Genres,Type,Studios,Source,Duration,user_id,user_rating
0,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24 min per ep,114480,10
1,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24 min per ep,480255,10
2,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24 min per ep,488782,9
3,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24 min per ep,497210,10
4,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24 min per ep,601187,10


In [4]:
import re

def convert_to_minutes(duration_str):
    minutes = -1  # Default value for unparseable strings or unknowns
    # Check if duration is in hours and minutes
    if 'hr' in duration_str and 'min' in duration_str:
        time = re.findall(r'(\d+)\s*hr\s*(\d+)\s*min', duration_str)
        minutes = int(time[0][0]) * 60 + int(time[0][1])
    # Check if duration is in hours
    elif 'hr' in duration_str:
        time = re.findall(r'(\d+)\s*hr', duration_str)
        minutes = int(time[0]) * 60
    # Check if duration is in minutes
    elif 'min' in duration_str:
        time = re.findall(r'(\d+)\s*min', duration_str)
        minutes = int(time[0])
    # Check if duration is in seconds
    elif 'sec' in duration_str:
        time = re.findall(r'(\d+)\s*sec', duration_str)
        # Convert seconds to minutes;
        minutes = int(time[0]) / 60.0  
    # For 'Unknown' or other unhandled cases, we keep the default value of -1

    return minutes



# Convert duration to minutes
anime_fulldata['Duration'] = anime_fulldata['Duration'].apply(convert_to_minutes)

In [5]:
anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,Genres,Type,Studios,Source,Duration,user_id,user_rating
0,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24,114480,10
1,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24,480255,10
2,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24,488782,9
3,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24,497210,10
4,1,Cowboy Bebop,"Action, Award Winning, Sci-Fi",TV,Sunrise,Original,24,601187,10


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# First, handle the 'Genres' column
anime_fulldata['Genres'] = anime_fulldata['Genres'].apply(lambda x: x.split(', ') if x != 'UNKNOWN' else [])

# Use MultiLabelBinarizer to one-hot encode the 'Genres'
mlb = MultiLabelBinarizer()
anime_fulldata = anime_fulldata.join(pd.DataFrame(mlb.fit_transform(anime_fulldata.pop('Genres')),
                          columns=mlb.classes_,
                          index=anime_fulldata.index))


In [7]:
anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,Type,Studios,Source,Duration,user_id,user_rating,Action,Adventure,...,Girls Love,Gourmet,Horror,Mystery,Romance,Sci-Fi,Slice of Life,Sports,Supernatural,Suspense
0,1,Cowboy Bebop,TV,Sunrise,Original,24,114480,10,1,0,...,0,0,0,0,0,1,0,0,0,0
1,1,Cowboy Bebop,TV,Sunrise,Original,24,480255,10,1,0,...,0,0,0,0,0,1,0,0,0,0
2,1,Cowboy Bebop,TV,Sunrise,Original,24,488782,9,1,0,...,0,0,0,0,0,1,0,0,0,0
3,1,Cowboy Bebop,TV,Sunrise,Original,24,497210,10,1,0,...,0,0,0,0,0,1,0,0,0,0
4,1,Cowboy Bebop,TV,Sunrise,Original,24,601187,10,1,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
import pandas as pd

# One-hot encode 'Type'
type_encoded = pd.get_dummies(anime_fulldata['Type'], prefix='type')

# One-hot encode 'Studios'
studios_encoded = pd.get_dummies(anime_fulldata['Studios'], prefix='studio')

# One-hot encode 'Source'
source_encoded = pd.get_dummies(anime_fulldata['Source'], prefix='source')

# Concatenate the original DataFrame with the one-hot encoded variables
anime_fulldata_encoded = pd.concat([anime_fulldata, type_encoded, studios_encoded, source_encoded], axis=1)

# Drop the original 'Type', 'Studios', and 'Source' columns as they're now one-hot encoded
anime_fulldata_encoded = anime_fulldata_encoded.drop(['Type', 'Studios', 'Source'], axis=1)

# Convert boolean values to integers (0 and 1)
anime_fulldata_encoded[type_encoded.columns] = anime_fulldata_encoded[type_encoded.columns].astype(int)
anime_fulldata_encoded[studios_encoded.columns] = anime_fulldata_encoded[studios_encoded.columns].astype(int)
anime_fulldata_encoded[source_encoded.columns] = anime_fulldata_encoded[source_encoded.columns].astype(int)

# Now, anime_fulldata_encoded contains your original data plus the one-hot encoded columns, with 0s and 1s instead of False and True
anime_fulldata = anime_fulldata_encoded

anime_fulldata.head()

Unnamed: 0,anime_id,anime_title,Duration,user_id,user_rating,Action,Adventure,Avant Garde,Award Winning,Boys Love,...,source_4-koma manga,source_Game,source_Light novel,source_Manga,source_Novel,source_Original,source_Other,source_Unknown,source_Visual novel,source_Web manga
0,1,Cowboy Bebop,24,114480,10,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,1,Cowboy Bebop,24,480255,10,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,1,Cowboy Bebop,24,488782,9,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,1,Cowboy Bebop,24,497210,10,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,1,Cowboy Bebop,24,601187,10,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [9]:
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.filterwarnings(action='ignore', category=UserWarning, module='sklearn')

In [10]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler

# Normalize the features
features = anime_fulldata.drop(['anime_id', 'user_id', 'anime_title', 'user_rating'], axis=1)
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)
normalized_df = pd.DataFrame(normalized_features, columns=features.columns)

normalized_df.head()

Unnamed: 0,Duration,Action,Adventure,Avant Garde,Award Winning,Boys Love,Comedy,Drama,Ecchi,Erotica,...,source_4-koma manga,source_Game,source_Light novel,source_Manga,source_Novel,source_Original,source_Other,source_Unknown,source_Visual novel,source_Web manga
0,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [11]:
normalized_df['user_id'] = anime_fulldata['user_id']
normalized_df['user_rating'] = anime_fulldata['user_rating']

In [12]:
normalized_df.head()

Unnamed: 0,Duration,Action,Adventure,Avant Garde,Award Winning,Boys Love,Comedy,Drama,Ecchi,Erotica,...,source_Light novel,source_Manga,source_Novel,source_Original,source_Other,source_Unknown,source_Visual novel,source_Web manga,user_id,user_rating
0,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,114480,10
1,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,480255,10
2,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,488782,9
3,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,497210,10
4,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,601187,10


In [13]:
from sklearn.model_selection import KFold
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Prepare your data
X = normalized_df.drop(['user_id', 'user_rating'], axis=1)  # features
y = normalized_df['user_rating']  # target variable

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Store all predictions
all_predictions = []

# Go through all k-folds
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Train KNN
    knn = NearestNeighbors(n_neighbors=5, metric='cosine')
    knn.fit(X_train)
    
    # Predict ratings
    test_user_indices = normalized_df.iloc[test_index]['user_id']
    for user_id in test_user_indices:
        user_features = X_test.loc[normalized_df['user_id'] == user_id]
        distances, indices = knn.kneighbors(user_features)
        similar_users = indices.flatten()
        predicted_rating = np.mean(y_train.iloc[similar_users])
        all_predictions.append(predicted_rating)

# Add predictions as a new column
normalized_df['predicted'] = all_predictions


In [14]:
normalized_df.head()

Unnamed: 0,Duration,Action,Adventure,Avant Garde,Award Winning,Boys Love,Comedy,Drama,Ecchi,Erotica,...,source_Manga,source_Novel,source_Original,source_Other,source_Unknown,source_Visual novel,source_Web manga,user_id,user_rating,predicted
0,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,114480,10,8.733333
1,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,480255,10,8.333333
2,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,488782,9,8.7
3,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,497210,10,8.475
4,0.14375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,601187,10,8.866667


In [15]:
from sklearn.metrics import mean_absolute_error

# Ensure that the length of actual and predicted ratings are the same
assert len(normalized_df['user_rating']) == len(normalized_df['predicted'])

# Calculate the MAE
mae = mean_absolute_error(normalized_df['user_rating'], normalized_df['predicted'])

print(f"Mean Absolute Error (MAE): {mae}")


Mean Absolute Error (MAE): 1.206204774574216
