In [14]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.neighbors import NearestNeighbors
import re
import warnings
import time

In [2]:
movie_dataset = pd.read_csv('rotten_tomatoes_movies.csv')
movie_dataset = movie_dataset[movie_dataset['title'].notna()]
movie_dataset.head()

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,space-zombie-bingo,Space Zombie Bingo!,50.0,,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,George Ormrod,"George Ormrod,John Sabotta",,,
1,the_green_grass,The Green Grass,,,,,,2020-02-11,114.0,Drama,English,Tiffany Edwards,Tiffany Edwards,,,
2,love_lies,"Love, Lies",43.0,,,,,,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",,,
3,the_sore_losers_1997,Sore Losers,60.0,,,,,2020-10-23,90.0,"Action, Mystery & thriller",English,John Michael McCarthy,John Michael McCarthy,,,
4,dinosaur_island_2002,Dinosaur Island,70.0,,,,,2017-03-27,80.0,"Fantasy, Adventure, Animation",English,Will Meugniot,John Loy,,,


In [3]:
rating_dataset = pd.read_csv('rotten_tomatoes_movie_reviews.csv')
rating_dataset = rating_dataset[rating_dataset['originalScore'].notna()]
rating_dataset.head()

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
5,dangerous_men_2015,2299284,2015-12-13,Eric Melin,False,4/5,fresh,Lawrence.com,"With every new minute, there's another head-sc...",POSITIVE,http://www.lawrence.com/weblogs/scenestealers/...
6,dangerous_men_2015,2295858,2015-11-22,Matt Donato,False,7/10,fresh,We Got This Covered,"Emotionless reaction shots, zero characterizat...",POSITIVE,http://wegotthiscovered.com/movies/dangerous-m...


In [5]:
# Convert originalScore to normalizedScore due to multiple scoring format
def convert_to_numeric(score):
    if isinstance(score, str) and score.strip():  # Check if score is a non-empty string
        try:
            # Check if score is in the format 'x/y'
            if re.match(r'\d+\s*/\s*\d+', score):
                parts = score.split('/')
                if float(parts[1].split()[0]) == 0:  # Check if the denominator is zero
                    return np.nan
                return float(parts[0]) / float(parts[1].split()[0]) * 10  # Normalize to 0-10 scale
            # Check if score contains a numerical part followed by some text like "stars"
            elif re.match(r'(\d+(\.\d+)?)\s*(?:/|\s*out\s*of)\s*-?\d+(\.\d+)?', score):
                # Extract the numerical part
                numeric_part = re.findall(r'-?\d+(\.\d+)?', score)
                if numeric_part:
                    return float(numeric_part[0]) * 2  # Convert to 0-10 scale (assuming 5-star system)
                else:
                    return np.nan
            # Check if score is a number or in the format 'x out of -4..+4'
            elif re.match(r'-?\d+(\.\d+)?(\s?out of -?\d+(\.\d+)?)?', score):
                # Extract numeric part
                numeric_part = re.findall(r'-?\d+(\.\d+)?', score)
                if numeric_part and numeric_part[0] != '':
                    return float(numeric_part[0])  # Normalize to 0-10 scale
                else:
                    return np.nan
            # Check if score is a letter grade
            elif re.match(r'[A-F][\+\-]?', score):
                # Map letter grades to numerical values
                letter_grades = {'A+': 10, 'A': 9, 'A-': 8, 'B+': 7, 'B': 6, 'B-': 5,
                                 'C+': 4, 'C': 3, 'C-': 2, 'D+': 1, 'D': 1, 'D-': 1, 'F': 0}
                return letter_grades.get(score)
            # Check if score is in the format 'high x out of -4..+4' or 'high x out of -4..+4'
            elif re.match(r'high\s*(-?\d+(\.\d+)?)\s*out\s*of\s*-?\d+(\.\d+)?', score):
                # Extract numeric part
                numeric_part = re.findall(r'-?\d+(\.\d+)?', score)
                if numeric_part and numeric_part[0] != '':
                    return float(numeric_part[0])  # Normalize to 0-10 scale
                else:
                    return np.nan
        except:
            return np.nan
    return np.nan  # Return NaN for empty or non-numeric values

In [6]:
# Apply the conversion function to the 'originalScore' column
rating_dataset['normalizedScore'] = rating_dataset['originalScore'].apply(convert_to_numeric)

In [9]:
# Removing the na values and mergeing the movie to rating dataset on the basis of column "id"
rating_dataset = rating_dataset[rating_dataset['normalizedScore'].notna()]
merged_movie_dataset = rating_dataset.merge(movie_dataset, on='id')
merged_movie_dataset.head()

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,...,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,...,,2011-06-21,30.0,Documentary,English,Stephen Low,,,,
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,...,,,,,,Unknown Director,,,,
2,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,...,,,,,,Kenji Kodama,,,,
3,dangerous_men_2015,2299284,2015-12-13,Eric Melin,False,4/5,fresh,Lawrence.com,"With every new minute, there's another head-sc...",POSITIVE,...,,,,,,Unknown Director,,,,
4,dangerous_men_2015,2295858,2015-11-22,Matt Donato,False,7/10,fresh,We Got This Covered,"Emotionless reaction shots, zero characterizat...",POSITIVE,...,,,,,,Unknown Director,,,,


In [10]:
# Delete the duplicate entry of same user and same movie_id as "id"
merged_movie_dataset = merged_movie_dataset.drop_duplicates(subset=['criticName', 'id'], inplace=False)

In [12]:
# Review matrix with rows as user ie "criticName" and column as "id" ie movie_id and value as normalizedScore
review_matrix = merged_movie_dataset.pivot(index='criticName', columns='id', values='normalizedScore').fillna(0)
review_matrix.head(5)

id,$5_a_day,009_re_cyborg,00_mhz,1,1-day,10,10-violent-women,1000013_12_angry_men,10000292-rat,10000390-mickey,...,zu_warriors,zubaan,zulu,zulu_dawn,zus_and_zo_2003,zusje_1995,zvenigora,zwei_mutter_2013,zycie_jako_smiertelna_choroba_przenoszona_droga_plciowa_2000,zz_top_that_little_ol_band_from_texas
criticName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Beatriz Ladrón de Guevara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Julio Plaza Torres,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kelly-Anne Taylor,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. Scott Walton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A.A. Dowd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# A KNN model on cosine similarity
cf_knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

# Fitting the model on our matrix
cf_knn_model.fit(np.transpose(review_matrix))

In [15]:
warnings.filterwarnings('ignore') 

def movie_recommender_engine(movie_name, movie_dataset, review_matrix, cf_model, n_recs):
    # Extract input movie ID
    movie_list = movie_dataset[movie_dataset['title'].str.contains(movie_name, case=False)]

    if not movie_list.empty:
        # List to store recommendations
        cf_recs = []

        # Get all movie IDs from the filtered movie list
        movie_ids = movie_list['id'].tolist()
        
        # Filter out movie IDs that are present in the review_matrix index
        valid_movie_ids = [movie_id for movie_id in movie_ids if movie_id in review_matrix.columns]
        
        if valid_movie_ids:
            # Get review matrices for valid movies
            
            # review_inputs = review_matrix.loc[valid_movie_ids].to_numpy()
            review_inputs = review_matrix[valid_movie_ids].T
            
            # Calculate distances for all movies at once
            distances, indices = cf_model.kneighbors(review_inputs, n_neighbors=3)

            # Iterate over each movie's recommendations
            for i in range(len(valid_movie_ids)):
                # Extract recommendations for the current movie
                movie_rec_ids = sorted(list(zip(indices[i], distances[i])), key=lambda x: x[1])[:0:-1]

                # Add recommendations to the list
                for rec_id, distance in movie_rec_ids:
                    cf_recs.append({'Title': movie_dataset.iloc[rec_id]['title'], 'Distance': distance})

            # Sort recommendations by distance
            sorted_recs = sorted(cf_recs, key=lambda d: d['Distance'], reverse=True)

            # Select top recommendations
            df = pd.DataFrame(sorted_recs[:n_recs])

            return df
        else:
            return "No valid movies found in review_matrix"
    else:
        return "No movie found"

movie_recommender_engine("Twilight", movie_dataset, review_matrix, cf_knn_model, 10)

Unnamed: 0,Title,Distance
0,Opie Gets Laid,0.593083
1,Center Stage,0.584586
2,House of Z,0.566329
3,Gangster of Love,0.557823
4,Les Boys III,0.556155
5,Kalaignan,0.547295
6,143 I Miss You,0.526821
7,Unplanned,0.52629
8,Once Upon a Time in Ukraine,0.519741
9,The Speed Lovers,0.516959
