In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
from sys import exc_info

import sqlite3

class InvalidInputError(Exception):
    """Signals an argument value the notebook helpers refuse to accept.

    Raised by ``loadDatabase_dataFrame`` when ``limit`` is negative.
    """

# Root folder of the project; the loader below appends "\\input" to it.
# Every backslash is escaped explicitly: the previous literal relied on the
# invalid escape sequences "\M" and "\R", which Python only tolerates with a
# warning. The resulting string value is unchanged.
# NOTE(review): this is an absolute, machine-specific Windows path — consider
# making it configurable so the notebook runs elsewhere.
path = "D:\\Coding\\Machine_Learning\\Recommendation_System\\"

In [2]:
def loadDatabase_dataFrame(table_name, from_loc = path+"\\input", limit=None):
    """Load one table from its own SQLite file into a pandas DataFrame.

    The database file is looked up inside ``from_loc`` under the name
    ``<table_name>.db`` and is expected to contain a table called
    ``table_name``.

    Parameters
    ----------
    table_name : str
        Name of both the ``.db`` file (without extension) and the table.
        It is interpolated into the SQL text, so it must come from trusted
        code, never from user input.
    from_loc : str, optional
        Directory that holds the ``.db`` files.
    limit : int or None, optional
        Maximum number of rows to read; ``None`` reads the whole table.

    Returns
    -------
    list
        ``[True, DataFrame]`` on success, ``[False, error_message]`` on any
        failure (the message is also printed).
    """
    db_file = from_loc + "\\" + table_name + ".db"
    conn = None

    try:
        # Note: 0 passes this check even though the message says
        # "greater than 0"; only negative limits are rejected.
        if limit is not None and limit < 0:
            raise InvalidInputError("Limit must be greater than 0")

        conn = sqlite3.connect(db_file)

        query = f'SELECT * FROM {table_name}'
        if limit is not None:
            query += f' LIMIT {limit}'

        # read_sql already returns a DataFrame; no re-wrapping needed.
        df = pd.read_sql(query, conn)
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate. Message format is unchanged.
        err = 'Error: {0}, {1}'.format(type(exc), exc)
        print('LoadDatabase -> Load: ', err)
        return [False, err]
    else:
        return [True, df]
    finally:
        # Close on every path; previously the connection leaked whenever
        # the query failed after a successful connect.
        if conn is not None:
            conn.close()

In [3]:
def _load_table_or_raise(table_name):
    """Return the DataFrame for ``table_name`` or raise if loading failed.

    ``loadDatabase_dataFrame`` reports failures as ``[False, message]``.
    Taking element ``[1]`` blindly would bind that error string to the
    variable and make later cells fail with confusing errors, so the status
    flag is checked here.
    """
    ok, payload = loadDatabase_dataFrame(table_name)
    if not ok:
        raise RuntimeError(f"Could not load table '{table_name}': {payload}")
    return payload


movies = _load_table_or_raise("movies")
users = _load_table_or_raise("users")
ratings = _load_table_or_raise("ratings")

In [4]:
# Turn the pipe-separated genre string into a list, once.
# The first row is inspected so that re-running the cell is a no-op.
first_genres_value = movies['Genres'][0]
genres_already_split = type(first_genres_value) is list

if not genres_already_split:
    movies['Genres'] = movies['Genres'].str.split('|')
    print("Change complete")

Change complete


In [5]:
# Maps each genre name to its position in the genre vectors built below.
# The order of this list defines the vector layout, so do not reorder it.
genre_dict = {
    genre: position
    for position, genre in enumerate([
        'Adventure',
        'Animation',
        'Children',
        'Comedy',
        'Fantasy',
        'Romance',
        'Drama',
        'Action',
        'Crime',
        'Thriller',
        'Horror',
        'Mystery',
        'Sci-Fi',
        'War',
        'Musical',
        'Documentary',
        'IMAX',
        'Western',
        'Film-Noir',
        '(no genres listed)',
    ])
}

In [6]:
# Encode every movie's genre list as a 0/1 vector laid out in genre_dict order.
# Each vector is built once per movie; the previous loop wrote the cell again
# on every genre iteration (the assignment sat inside the inner loop).
_genre_names = list(genre_dict)

genres_Movies = movies[["MovieID"]].reset_index(drop=True)
genres_Movies["Movie_Vector"] = pd.Series(
    [
        np.array([1 if genre in genres_list else 0 for genre in _genre_names])
        for genres_list in movies["Genres"]
    ],
    index=genres_Movies.index,
    dtype=object,  # keep each cell as a whole numpy array
)

genres_Movies.head(2)

Unnamed: 0,MovieID,Movie_Vector
0,1,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Attach the genre vector to each movie, order by MovieID and keep only the
# columns needed downstream (MovieID, Title, Movie_Vector).
vector_movies = (
    pd.merge(movies, genres_Movies, on='MovieID')
    .sort_values('MovieID')
    .reset_index(drop=True)
    .drop(columns=['Genres', 'Years'])
)
vector_movies.head(5)

Unnamed: 0,MovieID,Title,Movie_Vector
0,1,Toy Story,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,Jumanji,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,Grumpier Old Men,"[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,Waiting to Exhale,"[0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,Father of the Bride Part II,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [8]:
# Spot-check the genre vector of MovieID 2. The first match is taken by
# position instead of the hard-coded index label 1, which only worked because
# MovieID 2 happens to sit on row 1.
vector_movies.loc[vector_movies['MovieID'] == 2, 'Movie_Vector'].iloc[0]

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [21]:
# Attach each rated movie's genre vector to the rating rows and order them by
# user, then movie.
vector_ratings = (
    pd.merge(ratings, genres_Movies, on='MovieID')
    .sort_values(['UserID', 'MovieID'])
    .reset_index(drop=True)
)
vector_ratings.head(5)

Unnamed: 0,UserID,MovieID,userRating,Movie_Vector
0,3,1,4.0,"[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,3,29,4.5,"[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, ..."
2,3,32,4.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, ..."
3,3,50,5.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, ..."
4,3,111,4.0,"[0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, ..."


In [10]:
# Spot-check: genre vector stored on the first rating row (index label 0).
vector_ratings.loc[0, 'Movie_Vector']

array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [68]:
import numpy as np

def user_vector(ratings_df=None, max_rating=5):
    """Build one genre-preference vector per user from their rated movies.

    For each user, every vector component is the mean ``userRating`` over the
    rated movies that carry that genre, divided by ``max_rating``. Genres the
    user never rated stay at 0.

    Parameters
    ----------
    ratings_df : pandas.DataFrame, optional
        Frame with ``UserID``, ``userRating`` and ``Movie_Vector`` columns,
        where ``Movie_Vector`` holds one equal-length numeric array per row
        (0/1 genre flags in this notebook). Defaults to the notebook-level
        ``vector_ratings``.
    max_rating : float, optional
        Divisor applied to the per-genre mean rating. The default of 5
        reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``UserID``, ``User_Vector`` (numpy array) and
        ``Count_Movies`` (number of rating rows of that user), one row per
        user in ascending ``UserID`` order.
    """
    if ratings_df is None:
        ratings_df = vector_ratings

    user_ids, vectors, movie_counts = [], [], []

    # groupby visits each user's rows once, instead of filtering the whole
    # frame per user; sort=True keeps the ascending UserID order.
    for uid, user_rows in ratings_df.groupby('UserID', sort=True):
        genre_matrix = np.vstack(user_rows['Movie_Vector'].to_numpy())
        rating_values = user_rows['userRating'].to_numpy(dtype=float)

        # Per genre: how many rated movies carry it, and the sum of their ratings.
        genre_counts = genre_matrix.sum(axis=0)
        rating_sums = (genre_matrix * rating_values[:, np.newaxis]).sum(axis=0)

        # Avoid 0/0 for genres the user never rated (their sum is 0 anyway).
        safe_counts = np.where(genre_counts == 0, 1, genre_counts)
        preference = rating_sums / safe_counts / max_rating

        user_ids.append(uid)
        vectors.append(preference)
        movie_counts.append(len(user_rows))

    # Build the frame once; growing it with pd.concat per user is quadratic.
    return pd.DataFrame(
        {
            'UserID': user_ids,
            'User_Vector': pd.Series(vectors, dtype=object),
            'Count_Movies': movie_counts,
        },
        columns=['UserID', 'User_Vector', 'Count_Movies'],
    )


In [69]:
# Build the per-user genre-preference table (one row per UserID).
df_users_3 = user_vector()

In [70]:
# Display the per-user vectors as the cell output.
df_users_3

Unnamed: 0,UserID,User_Vector,Count_Movies
0,3,"[0.7402061855670103, 0.8045454545454545, 0.754...",634
1,4,"[0.6149122807017544, 0.6966666666666667, 0.644...",233
2,33,"[0.7, 0.6, 0.6, 0.8, 0.7, 0.8, 0.8411764705882...",23
3,63,"[0.95, 0.9400000000000001, 0.8444444444444444,...",40
4,76,"[0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.8, 0.8, ...",2
...,...,...,...
10440,162457,"[0.6931034482758621, 0.5375, 0.588235294117647...",343
10441,162482,"[0.6636363636363637, 0.6833333333333333, 0.666...",254
10442,162516,"[0.6846938775510204, 0.6833333333333333, 0.657...",355
10443,162521,"[0.7875, 0.8375, 0.8, 0.8150000000000001, 0.77...",53


หาความคล้ายคลึง ระหว่าง User กับ Movie โดยใช้ Genres เป็นตัวเปรียบเทียบ

In [71]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [72]:
def cosine(df_users):
    """Cosine similarity between every movie and every user.

    Movie vectors come from the notebook-level ``vector_movies`` frame and
    user vectors from ``df_users['User_Vector']``. Each column of arrays is
    stacked into a 2-D matrix before ``cosine_similarity`` is applied.

    Returns the result of ``cosine_similarity(movies, users)``; downstream
    code indexes it as ``[movie_position, user_position]``.
    """
    movie_vectors = vector_movies['Movie_Vector'].to_numpy()
    user_vectors = df_users['User_Vector'].to_numpy()

    movie_matrix = np.vstack(movie_vectors)
    user_matrix = np.vstack(user_vectors)

    # Compute the cosine similarity between movies and users.
    return cosine_similarity(movie_matrix, user_matrix)

# Movie-by-user similarity scores, computed from the per-user vectors in df_users_3.
similarity_matrix = cosine(df_users_3)

In [99]:
def get_recommendations(User_index, cosine_sim=None, top_n=10, movies_df=None):
    """Return the movies whose genre vectors best match one user.

    Parameters
    ----------
    User_index : int
        Column position of the user in ``cosine_sim``. This is a position,
        not a ``UserID``.
    cosine_sim : 2-D numpy array, optional
        Similarity scores indexed ``[movie_position, user_position]``.
        Defaults to the notebook-level ``similarity_matrix``, looked up at
        call time so a recomputed matrix is picked up without re-running
        this definition.
    top_n : int, optional
        Number of movies to return (default 10, the previous fixed value).
    movies_df : pandas.DataFrame, optional
        Frame with a ``Title`` column whose row order matches the rows of
        ``cosine_sim``. Defaults to the notebook-level ``vector_movies``.

    Returns
    -------
    list
        ``[title, score]`` pairs, best match first, scores rounded to 4
        decimals. Ties keep the original movie order.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if cosine_sim is None:
        cosine_sim = similarity_matrix
    if movies_df is None:
        movies_df = vector_movies
    if top_n < 0:
        raise ValueError("top_n must not be negative")

    # Similarity of every movie to the requested user, paired with its row position.
    user_scores = cosine_sim[:, User_index]
    ranked = sorted(enumerate(user_scores), key=lambda pair: pair[1], reverse=True)

    # Matrix rows are positional, so titles are looked up by position too,
    # without copying the frame or filtering it once per movie.
    titles = movies_df['Title'].to_numpy()

    return [[titles[movie_pos], round(score, 4)] for movie_pos, score in ranked[:top_n]]

In [100]:
# The argument is a column position in similarity_matrix, i.e. row 3 of
# df_users_3, not UserID 3.
# NOTE(review): confirm that position 3 is the user you intend to inspect.
my_movie = get_recommendations(3)
my_movie

[['Mulan ', 0.7808],
 ['Enchanted ', 0.7783],
 ['Who Framed Roger Rabbit? ', 0.7569],
 ['The Lego Movie ', 0.7366],
 ['Ice Age: Dawn of the Dinosaurs ', 0.7355],
 ['Inside Out ', 0.7335],
 ['Shrek 2 ', 0.7307],
 ['Shrek ', 0.7298],
 ['All Dogs Go to Heaven 2 ', 0.7254],
 ['Tangled ', 0.7247]]

In [79]:
# Independent copy of all rating rows that belong to UserID 3.
test_user = ratings.loc[ratings["UserID"].eq(3)].copy()