## Content-Based Filtering
### content_based_api_retrieval.py

In [1]:
import requests
import pandas as pd
import time

api_key = "d3e8d7fcb94be031986259192b4fdfb0"

# Base URLs
url = "https://api.themoviedb.org/3/movie/popular"
credits_url_template = "https://api.themoviedb.org/3/movie/{}/credits"
providers_url_template = "https://api.themoviedb.org/3/movie/{}/watch/providers"

total_pages = 200
all_movies = []

for page in range(1, total_pages + 1):
    parameters = {"api_key": api_key, "page": page}
    response = requests.get(url, params=parameters)
    
    if response.status_code == 200:
        data = response.json()
        movies = data["results"]
        
        for movie in movies:
            movie["movie_id"] = movie.pop("id")  # Rename 'id' to 'movie_id'
            movie["rating_average"] = movie.pop("vote_average")  # Rename 'vote_average' to 'rating_average'
            
            # Get cast names
            credits_url = credits_url_template.format(movie["movie_id"])
            credits_response = requests.get(credits_url, params={"api_key": api_key})
            if credits_response.status_code == 200:
                credits_data = credits_response.json()
                cast_names = {cast_member["name"] for cast_member in credits_data.get("cast", [])}  # Use a set for uniqueness
                movie["cast_names"] = ", ".join(cast_names)  # Convert to comma-separated string
            else:
                movie["cast_names"] = None
            
            # Get watch providers
            providers_url = providers_url_template.format(movie["movie_id"])
            providers_response = requests.get(providers_url, params={"api_key": api_key})
            if providers_response.status_code == 200:
                providers_data = providers_response.json()
                provider_names = set()  # Store unique provider names
                
                for region, provider_info in providers_data.get("results", {}).items():
                    for category, providers_list in provider_info.items():
                        if isinstance(providers_list, list):
                            provider_names.update(provider["provider_name"] for provider in providers_list)
                
                movie["watch_providers"] = ", ".join(provider_names)
            else:
                movie["watch_providers"] = None
            
            time.sleep(0.2)  # Short delay to avoid rate limiting
        
        all_movies.extend(movies)
    else:
        print("Error:", response.status_code)
    
    time.sleep(0.5)
    print(page)
    
# Convert to DataFrame
movie_content_df = pd.DataFrame(all_movies)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200


In [2]:
movie_content_df.to_csv("movie_content_data.csv", index=False, encoding="utf-8-sig")

### genre_id_dict.py

In [4]:
# Creates the dictionary of genre IDs and names
import requests
import pandas as pd

api_key = "d3e8d7fcb94be031986259192b4fdfb0"

# Base URL for the TMDb popular movies endpoint
url = "https://api.themoviedb.org/3/genre/movie/list"

# Set parameters like the page number and API key
parameters = {
    "api_key": api_key,
    "page":1
}

# Make the GET request to fetch the data
response = requests.get(url, params=parameters)

# Check if the request was successful
if response.status_code == 200:
    genre_data = response.json()  # Convert response to JSON
    genres = genre_data["genres"]  # Extract the list of genres
    genre_dict = {genre["id"]: genre["name"] for genre in genres}  # Create dictionary
    print(genre_dict)  # Print the genre mapping

else:
    print("Error", response.status_code)


{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}


### content_based_proprocessing.py

In [5]:
import pandas as pd

def preprocessing(dataframe):
    # Map genre_ids to genre names
    dataframe['genre_ids'] = dataframe['genre_ids'].apply(lambda x: [genre_dict[genre_id] for genre_id in x])
    
    # Select columns
    dataframe = dataframe[['movie_id', 'title', 'release_date', 'genre_ids', 'original_language', 'cast_names', 'watch_providers', 'rating_average', 'vote_count']]
    
    ## more data preprocessing 
    
    return dataframe


movie_content_df = preprocessing(movie_content_df)


In [6]:
movie_content_df.head()

Unnamed: 0,movie_id,title,release_date,genre_ids,original_language,cast_names,watch_providers,rating_average,vote_count
0,950396,The Gorge,2025-02-13,"[Romance, Science Fiction, Thriller]",en,"Miles Teller, James Marlowe, Oliver Trevena, J...","Canal+, Apple TV+, Apple TV Plus Amazon Channel",7.78,1798
1,1126166,Flight Risk,2025-01-22,"[Action, Thriller, Crime]",en,"Maaz Ali, Georgi S. Georgiev, Atanas Srebrev, ...","Amazon Video, Google Play Movies, Fandango At ...",6.073,424
2,1241982,Moana 2,2024-11-21,"[Animation, Adventure, Family, Comedy]",en,"Nicole Scherzinger, Bryson Chun, Hualālai Chun...","Sky Store, Fandango At Home, blue TV, Pathé Th...",7.166,1809
3,939243,Sonic the Hedgehog 3,2024-12-19,"[Action, Science Fiction, Comedy, Family]",en,"Sofia Pernas, Barry Calvert, Jim Carrey, Adam ...","Sky Store, Paramount Plus Basic with Ads, U-NE...",7.7,2121
4,927342,Amaran,2024-10-31,"[Action, Drama, Adventure, War]",ta,"Hanun Bawra, Sivakarthikeyan, Lallu Prasath, S...","Netflix, Netflix basic with Ads",7.468,185


### content_based_tidying.py

In [None]:
import pandas as pd
import pycountry # for languages convertion
import pandas as pd


# Tidying genre_IDs 
def clean_genre_ids(value):
    if isinstance(value, list):  # If it's already a list, clean and join
        return ', '.join(genre.strip() for genre in value)
    elif isinstance(value, str) and value.startswith('c(') and value.endswith(')'):
        # Handle string cases formatted like R's "c(...)"
        genres = [genre.strip().strip('"') for genre in value[2:-1].split(',')]
        return ', '.join(genres)
    return value  # Return as is if neither case

    # Apply the function to genre_ids column
movie_content_df['genre_ids'] = movie_content_df['genre_ids'].apply(clean_genre_ids)
    # Check genre_ids missing and type
movie_content_df['genre_ids'].isna().sum() # No missing data
movie_content_df['genre_ids'].apply(type).value_counts() # All are string type


# Tidying original language to be full word
def convert_language_code(code):
    try:
        language = pycountry.languages.get(alpha_2=code)
        return language.name
    except:
        return code  # no corresponding language, return original language code
      
movie_content_df['original_language'] = movie_content_df['original_language'].apply(convert_language_code)




# Rating average, vote count
"""
    Rating average: Average of all user ratings, on a scale of 1 to 10.
                  A quantitative assessment of the overall quality of a movie.
    
    Vote count: The total number of people who voted for the movie.
                The more votes there are, the more reliable the average score is.
"""
    
    # Data type
movie_content_df['rating_average'] = pd.to_numeric(movie_content_df['rating_average'], errors='coerce')
movie_content_df['rating_average'] = movie_content_df['rating_average'].round(0).astype(int) # vote_average to round
movie_content_df['vote_count'] = pd.to_numeric(movie_content_df['vote_count'], errors='coerce')


# Creating a `release_year` column
movie_content_df = movie_content_df.copy()  # Ensure movies_df is a separate DataFrame
movie_content_df["release_date"] = movie_content_df["release_date"].astype(str)
movie_content_df = movie_content_df[movie_content_df["release_date"] != '']
movie_content_df["release_year"] = pd.to_numeric(movie_content_df["release_date"].str[:4], errors = "coerce")
#movie_content_df["release_year"] = movie_content_df["release_date"].astype(str).str[:4].astype(int)
movie_content_df = movie_content_df.drop(columns=["release_date"])


# Changing the `title` type
  # Convert to pandas' new string type
movie_content_df["title"] = movie_content_df["title"].astype("string")
  # Check the dtype again
print(movie_content_df["title"].dtype)
    
    
# Editing `cast_names`
movie_content_df["cast_names"] = movie_content_df["cast_names"].replace("", pd.NA)


# Editing `watch_providers`
movie_content_df["watch_providers"] = movie_content_df["watch_providers"].replace("", pd.NA)


# Reordering column names
  # Define the new column order
new_column_order = ['movie_id', 'title', 'release_year', 'genre_ids', 'original_language', 'cast_names', 'watch_providers', 'rating_average', 'vote_count']
# Reorganize columns in the DataFrame
movie_content_df = movie_content_df[new_column_order]


KeyError: 'release_date'

In [9]:
movie_content_df.head()

Unnamed: 0,movie_id,title,release_year,genre_ids,original_language,cast_names,watch_providers,rating_average,vote_count
0,950396,The Gorge,2025,"Romance, Science Fiction, Thriller",English,"Miles Teller, James Marlowe, Oliver Trevena, J...","Canal+, Apple TV+, Apple TV Plus Amazon Channel",8,1798
1,1126166,Flight Risk,2025,"Action, Thriller, Crime",English,"Maaz Ali, Georgi S. Georgiev, Atanas Srebrev, ...","Amazon Video, Google Play Movies, Fandango At ...",6,424
2,1241982,Moana 2,2024,"Animation, Adventure, Family, Comedy",English,"Nicole Scherzinger, Bryson Chun, Hualālai Chun...","Sky Store, Fandango At Home, blue TV, Pathé Th...",7,1809
3,939243,Sonic the Hedgehog 3,2024,"Action, Science Fiction, Comedy, Family",English,"Sofia Pernas, Barry Calvert, Jim Carrey, Adam ...","Sky Store, Paramount Plus Basic with Ads, U-NE...",8,2121
4,927342,Amaran,2024,"Action, Drama, Adventure, War",Tamil,"Hanun Bawra, Sivakarthikeyan, Lallu Prasath, S...","Netflix, Netflix basic with Ads",7,185


#### language detail check

In [12]:
unique_languages = movie_content_df["original_language"].unique()
print(unique_languages)
#movie_content_df[movie_content_df["original_language"] == "xx"][["title", "overview"]]


['English' 'Tamil' 'French' 'Chinese' 'Latvian' 'Japanese' 'Thai'
 'Spanish' 'Hindi' 'Indonesian' 'Telugu' 'Norwegian' 'Korean' 'Finnish'
 'Swedish' 'Portuguese' 'Tagalog' 'Polish' 'Italian' 'Danish'
 'Modern Greek (1453-)' 'Turkish' 'Dutch' 'German' 'Zulu' 'Catalan'
 'Russian' 'Bengali' 'xx' 'Malayalam' 'cn' 'Kurdish' 'Persian' 'Serbian'
 'Arabic' 'Hebrew' 'Vietnamese' 'Bulgarian' 'Nepali (macrolanguage)'
 'Ukrainian' 'Amharic' 'Armenian' 'Kazakh' 'Azerbaijani' 'Croatian'
 'Czech' 'Kannada' 'Lithuanian' 'Hungarian' 'Kirghiz' 'Sinhala'
 'Luxembourgish' 'Estonian' 'Basque']


In [19]:
xx_movies = movie_content_df[movie_content_df["original_language"] == "cn"][["movie_id", "title"]]
xx_movies.head(10)  # 查看前10行


Unnamed: 0,movie_id,title
454,1071646,My First of May
602,11770,Shaolin Soccer
673,298094,Hidden Desire
826,923667,Twilight of the Warriors: Walled In
939,9470,Kung Fu Hustle
940,9470,Kung Fu Hustle
1176,1354736,"True Love, For Once in My Life"
1496,843,In the Mood for Love
1526,213646,Don't Stop My Crazy Love for You
1548,172956,Erotic Dream of the Red Chamber


In [22]:
movie_content_df["original_language"] = movie_content_df["original_language"].replace("cn", "Chinese") # cn to Chinese
movie_content_df["original_language"] = movie_content_df["original_language"].replace("xx", "Unknown") # xx to Unknown

In [23]:
unique_languages = movie_content_df["original_language"].unique()
print(unique_languages)

['English' 'Tamil' 'French' 'Chinese' 'Latvian' 'Japanese' 'Thai'
 'Spanish' 'Hindi' 'Indonesian' 'Telugu' 'Norwegian' 'Korean' 'Finnish'
 'Swedish' 'Portuguese' 'Tagalog' 'Polish' 'Italian' 'Danish'
 'Modern Greek (1453-)' 'Turkish' 'Dutch' 'German' 'Zulu' 'Catalan'
 'Russian' 'Bengali' 'Unknown' 'Malayalam' 'Kurdish' 'Persian' 'Serbian'
 'Arabic' 'Hebrew' 'Vietnamese' 'Bulgarian' 'Nepali (macrolanguage)'
 'Ukrainian' 'Amharic' 'Armenian' 'Kazakh' 'Azerbaijani' 'Croatian'
 'Czech' 'Kannada' 'Lithuanian' 'Hungarian' 'Kirghiz' 'Sinhala'
 'Luxembourgish' 'Estonian' 'Basque']


### Cosine Similarity
#### TF-IDF
Combined text feature
Movie name usually does not affect content similarity and can be used as a unique identifier but not for TF-IDF
We need to combine `genre_ids`, `cast_names`, and `watch_providers` into a single column:

In [40]:
movie_content_df["combined_features"] = (
    movie_content_df["genre_ids"].fillna("").str.replace(",", " ") + " | " +  
    movie_content_df["cast_names"].fillna("").str.replace(",", " ") + " | " +  
    movie_content_df["watch_providers"].fillna("").str.replace(",", " ")
)

#movie_content_df["combined_features"] = (
    #movie_content_df["genre_ids"].fillna("").str.replace(",", " ") + " | " +  
    #movie_content_df["cast_names"].fillna("").str.replace(",", " ")
#)

movie_content_df["combined_features"].head()

0    Romance  Science Fiction  Thriller | Miles Tel...
1    Action  Thriller  Crime | Maaz Ali  Georgi S. ...
2    Animation  Adventure  Family  Comedy | Nicole ...
3    Action  Science Fiction  Comedy  Family | Sofi...
4    Action  Drama  Adventure  War | Hanun Bawra  S...
Name: combined_features, dtype: object

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words="english", min_df=2)

tfidf_matrix = tfidf_vectorizer.fit_transform(movie_content_df["combined_features"])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

TF-IDF Matrix Shape: (3996, 24947)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Cosine Similarity Matrix Shape:", cosine_sim.shape)
print("Sample Cosine Similarity Scores:\n", cosine_sim[:5, :5])

Cosine Similarity Matrix Shape: (3996, 3996)
Sample Cosine Similarity Scores:
 [[1.         0.01850778 0.01994011 0.0563586  0.        ]
 [0.01850778 1.         0.02599636 0.04370548 0.01046924]
 [0.01994011 0.02599636 1.         0.08599738 0.00408057]
 [0.0563586  0.04370548 0.08599738 1.         0.00612457]
 [0.         0.01046924 0.00408057 0.00612457 1.        ]]


In [72]:
from difflib import get_close_matches

def recommend_movies(movie_title, movie_content_df, cosine_sim, top_n=11):
    # prepprocess input: remove space and lowercase
    clean_title = movie_title.strip().lower()

    # preprocess movie title
    movie_content_df["clean_title"] = movie_content_df["title"].str.strip().str.lower()

    # find the most similar movie
    possible_matches = get_close_matches(clean_title, movie_content_df["clean_title"], n=1, cutoff=0.7)

    
    if possible_matches:
        clean_title = possible_matches[0]

    
    movie_idx = movie_content_df[movie_content_df["clean_title"] == clean_title].index

    if movie_idx.empty:
        return f"Movie '{movie_title.strip()}' not found. Please check the title."


    movie_idx = movie_idx[0]

   
    similarity_scores = list(enumerate(cosine_sim[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]

   
    top_movies = similarity_scores[1:top_n+1]

    
    recommendations = movie_content_df.iloc[[i[0] for i in top_movies]][["title", "genre_ids", "rating_average"]]

    print(f"\n🎬 Using matched movie: {movie_content_df.iloc[movie_idx]['title']}")
    print(f"\n📌 Top 10 movies similar to '{movie_title.strip()}':")
    for i, row in enumerate(recommendations.itertuples(), start=1):
        print(f"{i}. {row.title} (Genre: {row.genre_ids}, Rating: {row.rating_average})")

    return recommendations


In [73]:
#Test
recommendations = recommend_movies("  THE GORGE  ", movie_content_df, cosine_sim)
recommendations = recommend_movies(" the gorge", movie_content_df, cosine_sim)
recommendations = recommend_movies("The Gorgee", movie_content_df, cosine_sim)


🎬 Using matched movie: The Gorge

📌 Top 10 movies similar to 'THE GORGE':
1. Alien (Genre: Horror, Science Fiction, Rating: 8)
2. Paradox Effect (Genre: Action, Thriller, Rating: 6)
3. The New Mutants (Genre: Science Fiction, Horror, Action, Rating: 6)
4. Sherlock Holmes (Genre: Action, Adventure, Crime, Mystery, Rating: 7)
5. Azureus Rising (Genre: Science Fiction, Animation, Rating: 6)
6. Split (Genre: Horror, Thriller, Rating: 7)
7. The Huntsman: Winter's War (Genre: Action, Adventure, Drama, Rating: 6)
8. A Monster Calls (Genre: Fantasy, Adventure, Family, Rating: 7)
9. The Pursuit of Lust (Genre: Romance, Rating: 0)
10. Paddington 2 (Genre: Adventure, Comedy, Family, Rating: 7)

🎬 Using matched movie: The Gorge

📌 Top 10 movies similar to 'the gorge':
1. Alien (Genre: Horror, Science Fiction, Rating: 8)
2. Paradox Effect (Genre: Action, Thriller, Rating: 6)
3. The New Mutants (Genre: Science Fiction, Horror, Action, Rating: 6)
4. Sherlock Holmes (Genre: Action, Adventure, Crime, 

### Sigmoid Kernel
### Linear Kernel

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# TF-IDF
movie_content_df["combined_features"] = (
    movie_content_df["genre_ids"].fillna("").str.replace(",", " ") + " | " +
    movie_content_df["cast_names"].fillna("").str.replace(",", " ") + " | " +
    movie_content_df["watch_providers"].fillna("").str.replace(",", " ")
)

tfidf_vectorizer = TfidfVectorizer(stop_words="english", min_df=2)
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_content_df["combined_features"])

# add numeric features(need standardization)
scaler = MinMaxScaler()
numeric_features = movie_content_df[["rating_average", "vote_count"]].fillna(0)
numeric_matrix = scaler.fit_transform(numeric_features)

# integrate TFIDF and numeric features
final_matrix = np.hstack((tfidf_matrix.toarray(), numeric_matrix))

# Linear Kernel similarity
from sklearn.metrics.pairwise import linear_kernel
linear_sim = linear_kernel(final_matrix, final_matrix)

print("Linear Kernel Similarity Matrix Shape:", linear_sim.shape)
print("Sample Linear Kernel Similarity Scores:\n", linear_sim[:5, :5])


Linear Kernel Similarity Matrix Shape: (3996, 3996)
Sample Linear Kernel Similarity Scores:
 [[1.64234228 0.49906013 0.58229671 0.69912166 0.560241  ]
 [0.49906013 1.36013025 0.44655209 0.52435706 0.43052607]
 [0.58229671 0.44655209 1.49237102 0.64877734 0.49432304]
 [0.69912166 0.52435706 0.64877734 1.64325942 0.56640887]
 [0.560241   0.43052607 0.49432304 0.56640887 1.4900248 ]]


In [None]:
from difflib import get_close_matches

def recommend_movies_linear(movie_title, movie_content_df, linear_sim, top_n=11):
    # prepprocess input: remove space and lowercase
    clean_title = movie_title.strip().lower()

    # preprocess movie title
    movie_content_df["clean_title"] = movie_content_df["title"].str.strip().str.lower()

    # find the most similar movie
    possible_matches = get_close_matches(clean_title, movie_content_df["clean_title"], n=1, cutoff=0.7)


    if possible_matches:
        clean_title = possible_matches[0]
    else:
        return f"Movie '{movie_title.strip()}' not found. Please check the title."


    movie_idx = movie_content_df[movie_content_df["clean_title"] == clean_title].index
    if movie_idx.empty:
        return f"Movie '{movie_title.strip()}' not found. Please check the title."

    movie_idx = movie_idx[0]


    similarity_scores = list(enumerate(linear_sim[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Top 10
    top_movies = similarity_scores[1:top_n+1]

  
    recommendations = movie_content_df.iloc[[i[0] for i in top_movies]][["title", "genre_ids", "rating_average"]]

    print(f"\n🎬 Using matched movie: {movie_content_df.iloc[movie_idx]['title']}")
    print(f"\n📌 Top 10 movies similar to '{movie_title.strip()}':")
    for i, row in enumerate(recommendations.itertuples(), start=1):
        print(f"{i}. {row.title} (Genre: {row.genre_ids}, Rating: {row.rating_average})")

    return recommendations


In [74]:
#Test
movie_name = "the gorge"
recommended_movies_linear = recommend_movies_linear(movie_name, movie_content_df, linear_sim)



🎬 Using matched movie: The Gorge

📌 Top 10 movies similar to 'the gorge':
1. Nude (Genre: Documentary, Rating: 10)
2. Close To Me (Genre: Romance, Thriller, Rating: 10)
3. The Shawshank Redemption (Genre: Drama, Crime, Rating: 9)
4. Another Simple Favor (Genre: Comedy, Crime, Thriller, Rating: 10)
5. Alien (Genre: Horror, Science Fiction, Rating: 8)
6. The Green Mile (Genre: Fantasy, Drama, Crime, Rating: 9)
7. The Dark Knight (Genre: Drama, Action, Crime, Thriller, Rating: 9)
8. Quando Elas Se Movimentam (Genre: Documentary, Rating: 10)
9. Summer Illusion (Genre: Drama, Rating: 10)
10. The Hungry Wolf (Genre: Animation, Family, Rating: 10)
11. El Apocalipsis de san Juan (Genre: Documentary, History, Rating: 10)
