# Movie Recommendation Using NLP

### Import necessary libraries

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from skimage import io
from IPython.display import clear_output
pd.set_option('display.max_colwidth', None)

## Data Preprocessing

In [19]:
# read the dataset https://www.kaggle.com/datasets/akshaypawar7/millions-of-movies
df = pd.read_csv('movies.csv')

In [20]:
# drop movies that have a short description
df['word_count'] = df['overview'].apply(lambda x: len(str(x).split()))
df = df[df['word_count'] >= 50]
df.drop('word_count', axis=1, inplace=True)

In [21]:
# drop all duplicates
df.drop_duplicates(subset=['title', 'release_date'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [22]:
# Drop rows with missing poster_path
df.dropna(subset=['poster_path'], inplace=True)

In [23]:
# fill empty cells
df.fillna(value={i: '' for i in ['overview', 'genres', 'keywords', 'credits']}, inplace=True)

In [24]:
# lambda function to preprocess string data
strOp= lambda x: ' '.join(x.split('-'))

In [25]:
# set overview to lowercase and remove punctuation
df.overview = df.overview.str.lower()
df.overview = df.overview.str.replace(r'[^\w\s]+', '')

  df.overview = df.overview.str.replace(r'[^\w\s]+', '')


In [26]:
# add keywords, genres and credits to overview for full information
df.overview = df.overview + df.keywords.apply(strOp) + df.genres.apply(strOp) + df.credits.apply(lambda x: ' '.join(x.replace(' ', '').split('-')[:3]))
# example
df.overview[0]

'set more than a decade after the events of the first film learn the story of the sully family jake neytiri and their kids the trouble that follows them the lengths they go to keep each other safe the battles they fight to stay alive and the tragedies they endureloss of loved one dying and death alien life form resurrection sequel dysfunctional family alien planet distant future adopted child rebirth family dynamics adopted son stronger villain warScience Fiction Adventure ActionSamWorthington ZoeSaldaña SigourneyWeaver'

### Initialize the TF-IDF Vectorizer to transform text data into vectors

#### only run these lines once!

In [27]:
# TF-IDF Vectorizer to transform words into numbers and remove common english words like 'the'
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# transform overview data with TF-IDF vectorizer to create matrix
tfidf_matrix = tfidf.fit_transform(df['overview'])

In [None]:
#display some columns with vectorized words
display(pd.DataFrame(
    tfidf_matrix[:5, 10000:10005].toarray(),
    columns= tfidf.get_feature_names_out()[10000:10005],
    index = df.title[:5]).round())

print(tfidf_matrix.shape)
# over 970000 different words used to describe all movies

## Movie Recommendation

### Initialize the recommendation function

### Ask for User Input

In [None]:
def get_genres():
    genres = input("What Movie Genre are you interested in (if multiple, please separate them with a comma)? [Type 'skip' to skip this question] ")
    genres = " ".join(["".join(n.split()) for n in genres.lower().split(',')])
    return genres

def get_overview():
    overview = input("What should the movie be about [Type 'skip' to skip this question]")
    overview = overview.lower()
    overview = overview.replace(r'[^\w\s]+', '')
    return overview
    
def get_searchTerms():
    searchTerms = [] 
    genres = get_genres()
    if genres != 'skip':
        searchTerms.append(genres)

    overview = get_overview()
    if overview != 'skip':
        searchTerms.append(overview)
    
    searchTerms = " ".join(searchTerms)

    return searchTerms

In [None]:
user_input = get_searchTerms()

### Show Recommendations

In [None]:
get_recommendation(user_input)