# Movie Recommendation Model


In [39]:
import pandas as pd
import ast # For converting string representation of lists/dictionaries
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import nltk
from nltk.stem.porter import PorterStemmer
import re

In [40]:
df = pd.read_csv("tmdb_5000_movies.csv")

df.head()

df.info()

movies = df[["title", "overview", "genres", "keywords", "production_companies", "vote_average", "vote_count"]]
movies = movies.dropna()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [41]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4800 entries, 0 to 4802
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 4800 non-null   object 
 1   overview              4800 non-null   object 
 2   genres                4800 non-null   object 
 3   keywords              4800 non-null   object 
 4   production_companies  4800 non-null   object 
 5   vote_average          4800 non-null   float64
 6   vote_count            4800 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 300.0+ KB


## Processing Genre + Keyword (json-like)


In [42]:
def convert(text):
    try:
        # Convert string representation of list/dictionary to actual list/dictionary
        return " ".join(i["name"].replace(" ", "") for i in ast.literal_eval(text))
    except:
        return ""

In [43]:
convert(movies["genres"][0])

'Action Adventure Fantasy ScienceFiction'

In [44]:
movies["tags"] = (
    movies["overview"] + " " +
    movies["keywords"].apply(convert) + " " +
    movies["genres"].apply(convert) + " " +
    movies["production_companies"].apply(convert)
)

final_data = movies[["title", "tags", "vote_average", "vote_count"]]
final_data

Unnamed: 0,title,tags,vote_average,vote_count
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",7.2,11800
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",6.9,4500
2,Spectre,A cryptic message from Bond’s past sends him o...,6.3,4466
3,The Dark Knight Rises,Following the death of District Attorney Harve...,7.6,9106
4,John Carter,"John Carter is a war-weary, former military ca...",6.1,2124
...,...,...,...,...
4798,El Mariachi,El Mariachi just wants to play his guitar and ...,6.6,238
4799,Newlyweds,A newlywed couple's honeymoon is upended by th...,5.9,5
4800,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",7.0,6
4801,Shanghai Calling,When ambitious New York attorney Sam is sent t...,5.7,7


## Cleaning Text using PorterStemmer


In [45]:
ps = PorterStemmer()

def clean_text(text):
    text = re.sub(r'^a-zA-Z', ' ', text) # Remove special characters
    text = text.lower()
    text = text.split()
    text = [ps.stem(word) for word in text]  # Stem each word (remove suffixes)
    return ' '.join(text)

final_data["tags"] = final_data["tags"].apply(clean_text)
final_data["tags"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data["tags"] = final_data["tags"].apply(clean_text)


0       in the 22nd century, a parapleg marin is dispa...
1       captain barbossa, long believ to be dead, ha c...
2       a cryptic messag from bond’ past send him on a...
3       follow the death of district attorney harvey d...
4       john carter is a war-weary, former militari ca...
                              ...                        
4798    el mariachi just want to play hi guitar and ca...
4799    a newlyw couple' honeymoon is upend by the arr...
4800    "signed, sealed, delivered" introduc a dedic q...
4801    when ambiti new york attorney sam is sent to s...
4802    ever sinc the second grade when he first saw h...
Name: tags, Length: 4800, dtype: object

In [46]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
])

X = pipeline.fit_transform(final_data["tags"])

In [48]:
similarity = cosine_similarity(X)

In [49]:
def recommend(movie):
    movie_index = final_data[final_data["title"] == movie].index[0]
    print(movie_index)
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    recommended_movies = []
    for i in movie_list:
        recommended_movies.append(final_data.iloc[i[0]].title)

    return recommended_movies

In [50]:
recommend("Avatar")

0


['Aliens',
 'Falcon Rising',
 'Meet Dave',
 'Aliens vs Predator: Requiem',
 'Battle: Los Angeles']