In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from ast import literal_eval


In [2]:

# Step 1: Load and Merge Data
def load_data(credits_path=r'C:\Users\jones\OneDrive\Desktop\Recom_Sys\dataset\tmdb_5000_credits.csv',
              movies_path=r'C:\Users\jones\OneDrive\Desktop\Recom_Sys\dataset\tmdb_5000_movies.csv'):
    """Read the credits and movies CSV files and join them on the movie id.

    Parameters
    ----------
    credits_path : str or path-like, optional
        Location of the credits CSV. The file must have exactly four columns;
        they are renamed positionally to ``id``, ``tittle``, ``cast``, ``crew``.
    movies_path : str or path-like, optional
        Location of the movies CSV. It must contain an ``id`` column.

    The defaults are the original author's local paths, so calling
    ``load_data()`` with no arguments behaves as before; pass explicit paths
    to run the notebook on another machine.

    Returns
    -------
    pandas.DataFrame or None
        The movies table merged with the credits table on ``id``, or ``None``
        if reading or merging failed (the error is printed).
    """
    try:
        credits_df = pd.read_csv(credits_path)
        movies_df = pd.read_csv(movies_path)
        # Positional rename. The misspelt 'tittle' keeps this column from
        # colliding with a 'title' column in the movies table during the merge
        # (a collision would suffix both names) - presumably intentional, do
        # not "correct" it without checking the later df2['title'] lookups.
        credits_df.columns = ['id', 'tittle', 'cast', 'crew']
        return movies_df.merge(credits_df, on='id')
    except Exception as e:
        # Best-effort contract kept from the original: report and return None
        # so the caller can decide how to stop.
        print(f"Error loading data: {e}")
        return None

# load_data() reports failure by printing the error and returning None, so
# stop the run here rather than failing later on a None frame.
df2 = load_data()
if df2 is None:
    raise SystemExit("Data loading failed. Exiting.")


In [3]:

# Step 2: Calculate Weighted Rating for Popular Movies
# C: mean rating over every film in df2.
C = df2['vote_average'].mean()
# m: vote-count threshold, set at the 90th percentile of vote_count.
m = df2['vote_count'].quantile(0.9)
# Keep only films at or above the threshold. The explicit .copy() makes
# q_movies an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
q_movies = df2[df2['vote_count'] >= m].copy()

def weighted_rating(x, m=m, C=C):
    """Blend a film's own average rating with the overall mean rating.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is ``x['vote_count']``
    and ``R`` is ``x['vote_average']``.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide ``'vote_count'`` and ``'vote_average'``.
    m : number, optional
        Vote-count weight for the mean; defaults to the module-level ``m`` as
        it was when this function was defined.
    C : number, optional
        Mean rating to blend towards; defaults to the module-level ``C`` as it
        was when this function was defined.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    own_weight = votes / (votes + m)
    mean_weight = m / (m + votes)
    return (own_weight * rating) + (mean_weight * C)

# Score every qualifying film row by row, then rank from best to worst.
row_scores = q_movies.apply(weighted_rating, axis=1)
q_movies['score'] = row_scores
q_movies = q_movies.sort_values(by='score', ascending=False)

top_columns = ['title', 'vote_count', 'vote_average', 'score']
print("Top 10 Movies based on weighted rating:")
print(q_movies[top_columns].head(10))


Top 10 Movies based on weighted rating:
                                              title  vote_count  vote_average  \
1881                       The Shawshank Redemption        8205           8.5   
662                                      Fight Club        9413           8.3   
65                                  The Dark Knight       12002           8.2   
3232                                   Pulp Fiction        8428           8.3   
96                                        Inception       13752           8.1   
3337                                  The Godfather        5893           8.4   
95                                     Interstellar       10867           8.1   
809                                    Forrest Gump        7927           8.2   
329   The Lord of the Rings: The Return of the King        8064           8.1   
1990                        The Empire Strikes Back        5879           8.2   

         score  
1881  8.059258  
662   7.939256  
65    7.920020  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q_movies['score'] = q_movies.apply(weighted_rating, axis=1)


In [4]:

# Step 3: Content-Based Filtering with TF-IDF
# Missing overviews become empty strings so the vectorizer receives text only.
df2['overview'] = df2['overview'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df2['overview'])
# Pairwise similarity between every pair of overviews (rows of tfidf_matrix).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Reverse lookup: title -> row position in df2.
# Series.drop_duplicates() compares the *values* (row positions, which are all
# distinct), so it never removed repeated titles and indices[title] could come
# back as a Series. De-duplicate on the index instead, keeping the first film
# for each title, so every lookup yields a single position.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the ten films most similar to ``title``.

    Parameters
    ----------
    title : str
        Film title; looked up in the module-level ``indices`` Series as it
        exists at call time.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows/columns follow ``df2``'s row
        order. Defaults to the module-level ``cosine_sim`` as it was when this
        function was defined.

    Returns
    -------
    pandas.Series or str
        The recommended titles (indexed by their ``df2`` row label), ordered
        from most to least similar. If the title is unknown, or an unexpected
        error occurs, a human-readable message string is returned instead.
    """
    if title not in indices:
        return f"Movie '{title}' not found in the dataset."
    try:
        idx = indices[title]
        # When several films share a title the lookup yields a Series of
        # positions; use the first so the row selection below stays 1-D.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]
        idx = int(idx)

        scores = np.asarray(cosine_sim[idx]).ravel()
        # Stable descending sort: ties keep their original row order, exactly
        # as sorted(..., reverse=True) did.
        ranked = np.argsort(-scores, kind='stable')
        # Drop the queried film explicitly rather than assuming it sorts
        # first; with tied scores (e.g. an all-zero row) it may not.
        ranked = ranked[ranked != idx][:10]
        return df2['title'].iloc[ranked]
    except Exception as e:
        # Error-as-string contract kept from the original for callers that
        # simply print the result.
        return f"An error occurred: {str(e)}"

# Show the overview-based recommendations for two sample titles.
demo_queries = [
    ("\nRecommendations for 'The Dark Knight Rises':", 'The Dark Knight Rises'),
    ("\nRecommendations for 'The Avengers':", 'The Avengers'),
]
for header, movie in demo_queries:
    print(header)
    print(get_recommendations(movie))



Recommendations for 'The Dark Knight Rises':
65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object

Recommendations for 'The Avengers':
7               Avengers: Age of Ultron
3144                            Plastic
1715                            Timecop
4124                 This Thing of Ours
3311              Thank You for Smoking
3033                      The Corruptor
588     Wall Street: Money Never Sleeps
2136         Team America: World Police
1468                       The Fountain
1286                        Snowpiercer
Name: titl

In [5]:

# Step 4: Parse and Clean Feature Data
# These columns hold Python-literal text; evaluate each cell with
# ast.literal_eval to obtain the actual Python objects.
features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    parsed_column = df2[feature].apply(literal_eval)
    df2[feature] = parsed_column

def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries, each with a ``'name'`` key and normally a ``'job'`` key.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list (e.g. a
        missing value) or no entry has the job 'Director'.
    """
    # Same type guard as get_list: a missing/unparsed crew cell yields "no
    # director" instead of raising while iterating a non-list.
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # .get tolerates entries that carry no 'job' key.
        if member.get('job') == 'Director':
            return member['name']
    return np.nan

def get_list(x):
    """Return up to the first three ``'name'`` values from a list of dicts.

    Anything that is not a list produces an empty list.
    """
    if not isinstance(x, list):
        return []
    # Slicing already copes with lists shorter than three entries.
    return [entry['name'] for entry in x][:3]

# The director must be read from 'crew' first: the loop below overwrites
# 'crew' with at most three names.
director_column = df2['crew'].apply(get_director)
df2['director'] = director_column
for feature in features:
    df2[feature] = df2[feature].apply(get_list)

preview_columns = ['title', 'cast', 'director', 'keywords', 'genres']
print("\nNew features for the first 3 films:")
print(df2[preview_columns].head(3))



New features for the first 3 films:
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   

                                               cast        director  \
0  [Sam Worthington, Zoe Saldana, Sigourney Weaver]   James Cameron   
1     [Johnny Depp, Orlando Bloom, Keira Knightley]  Gore Verbinski   
2      [Daniel Craig, Christoph Waltz, Léa Seydoux]      Sam Mendes   

                              keywords                        genres  
0   [culture clash, future, space war]  [Action, Adventure, Fantasy]  
1   [ocean, drug abuse, exotic island]  [Adventure, Fantasy, Action]  
2  [spy, based on novel, secret agent]    [Action, Adventure, Crime]  


In [6]:

# Step 5: Clean Data
def clean_data(x):
    """Lower-case ``x`` and strip every space from it.

    A list is processed element by element and returned as a new list, a
    string is returned as a cleaned string, and any other value (such as a
    missing director) becomes the empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

# Normalise every list-valued feature column plus the director column.
columns_to_clean = [*features, 'director']
for feature in columns_to_clean:
    cleaned_column = df2[feature].apply(clean_data)
    df2[feature] = cleaned_column


In [7]:

# Step 6: Create a "soup" of features
def create_soup(x):
    """Join keywords, cast, director and genres into one space-separated string.

    ``x`` must provide list-of-string values under ``'keywords'``, ``'cast'``
    and ``'genres'`` and a string under ``'director'``.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)

# Build the combined feature string for every film, one row at a time.
df2['soup'] = df2.apply(create_soup, axis='columns')


In [8]:

# Step 7: Content-Based Recommendations using CountVectorizer
# Token counts of the combined feature string, then pairwise cosine similarity.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

# drop=True discards the old index instead of inserting it as an extra column,
# so re-running this cell no longer piles up stray 'index'/'level_0' columns.
df2 = df2.reset_index(drop=True)
# Title -> row position. Keep one entry per title (the first) so that
# indices[title] always yields a single position, not a Series.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

print("\nRecommendations for 'The Dark Knight Rises' (CountVectorizer):")
print(get_recommendations('The Dark Knight Rises', cosine_sim2))

print("\nRecommendations for 'The Godfather' (CountVectorizer):")
print(get_recommendations('The Godfather', cosine_sim2))



Recommendations for 'The Dark Knight Rises' (CountVectorizer):
65               The Dark Knight
119                Batman Begins
4638    Amidst the Devil's Wings
1196                The Prestige
3073           Romeo Is Bleeding
3326              Black November
1503                      Takers
1986                      Faster
303                     Catwoman
747               Gangster Squad
Name: title, dtype: object

Recommendations for 'The Godfather' (CountVectorizer):
867      The Godfather: Part III
2731      The Godfather: Part II
4638    Amidst the Devil's Wings
2649           The Son of No One
1525              Apocalypse Now
1018             The Cotton Club
1170     The Talented Mr. Ripley
1209               The Rainmaker
1394               Donnie Brasco
1850                    Scarface
Name: title, dtype: object


In [9]:

# Step 8: Collaborative Filtering with Surprise (SVD)
# Seed NumPy's global RNG so the cross-validation folds and the SVD
# initialisation - and therefore the numbers printed below - repeat between runs.
np.random.seed(42)

ratings = pd.read_csv(r'C:\Users\jones\OneDrive\Desktop\Recom_Sys\dataset\ratings_small.csv')

# Take the rating scale from the data instead of assuming 1-5: Surprise clips
# its estimates to this range, so a scale narrower than the real ratings would
# distort predictions at the low or high end.
rating_bounds = (float(ratings['rating'].min()), float(ratings['rating'].max()))
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()

# 5-fold cross-validation to estimate out-of-sample error.
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("\nSVD Cross-Validation Results:")
print(results)

# Refit on every available rating before making individual predictions.
trainset = data.build_full_trainset()
svd.fit(trainset)

# Third argument is the known true rating (r_ui); it is only echoed in the
# printed Prediction and does not influence the estimate.
prediction = svd.predict(1, 302, 3)
print("\nPrediction for user 1 and movie 302 with rating 3:")
print(prediction)

user_ratings = ratings[ratings['userId'] == 1]
print("\nRatings for user 1:")
print(user_ratings)

print(df2[['title']].head(20))


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8927  0.8917  0.9003  0.8973  0.9027  0.8969  0.0043  
MAE (testset)     0.6893  0.6885  0.6915  0.6925  0.6929  0.6909  0.0018  
Fit time          1.51    1.33    1.50    1.84    1.39    1.52    0.18    
Test time         0.13    0.10    0.10    0.19    0.10    0.12    0.04    

SVD Cross-Validation Results:
{'test_rmse': array([0.89266793, 0.89165179, 0.90034986, 0.89729021, 0.9027006 ]), 'test_mae': array([0.68927251, 0.68851027, 0.69150398, 0.6925375 , 0.69291361]), 'fit_time': (1.5138866901397705, 1.3321514129638672, 1.5031940937042236, 1.8383228778839111, 1.3877174854278564), 'test_time': (0.1272132396697998, 0.0992743968963623, 0.10210204124450684, 0.1924445629119873, 0.10004115104675293)}

Prediction for user 1 and movie 302 with rating 3:
user: 1          item: 302        r_ui = 3.00   est = 2.77   {'was_impossible': False}

Ratin

In [19]:
# No similarity matrix is passed, so get_recommendations falls back to the
# default bound in its signature (the `cosine_sim` that existed when the
# function was defined), while the title lookup uses the current `indices`.
print(get_recommendations('Spectre'))

1343    Never Say Never Again
4071    From Russia with Love
3162              Thunderball
1717               Safe Haven
11          Quantum of Solace
4339                   Dr. No
29                    Skyfall
1880              Dance Flick
3336     Diamonds Are Forever
1743                Octopussy
Name: title, dtype: object
