In [None]:
import pandas as pd
import numpy as np

In [None]:
pip install nltk

In [None]:
# Load the movie dataset; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Preview only the first rows instead of rendering the whole frame.
data.head()

In [None]:
# List the available column names; 'Movie Name' and 'Description' are the ones used below.
data.columns


In [None]:
# Start the working frame from the movie-name column only.
df = data['Movie Name'].to_frame()

In [None]:
# Missing descriptions are read as NaN (a float), which would break the string
# tokenization further down; store an empty string for them instead.
df['description'] = data['Description'].fillna("")

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# Task: pre-process the description text and build a recommendation function.

df['description']

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Importing natural language toolkit functions to process text

In [None]:
# word_tokenize depends on the Punkt tokenizer data, which is a separate NLTK download;
# fetch it so a fresh environment does not fail with LookupError.
# 'punkt_tab' is the resource newer NLTK releases look up, 'punkt' the older one.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# One token list per movie description.
tokens = [word_tokenize(text) for text in df['description']]

In [None]:
# Show the token lists of the first few descriptions only; the full list is one entry per movie.
tokens[:3]

In [None]:
# Normalizing: lower-case every token of every description.
normalized_tokens = [
    [word.lower() for word in doc_tokens]
    for doc_tokens in tokens
]

In [None]:
# Now that the tokens are normalized, we can remove the English stop words from the text.
# The stop-word corpus is a separate NLTK download; fetch it so a fresh environment
# does not fail with LookupError.
nltk.download("stopwords", quiet=True)

stops = stopwords.words('english')

In [None]:
# Filter the normalized tokens to remove the stop words listed in `stops`.

In [None]:
# Drop English stop words from each lower-cased token list.
# The membership test must be applied per word (inner comprehension) and to the
# normalized tokens, because the NLTK stop-word list is lower-case.
stop_set = set(stops)  # set lookup is O(1) per word
filtered_tokens = [
    [word for word in text if word not in stop_set]
    for text in normalized_tokens
]

In [None]:
# Inspect the result for the first few descriptions only.
filtered_tokens[:3]

In [None]:
# The stop words have been filtered out: each list in filtered_tokens should now contain only non-stop-word tokens.


# Punctuation should also be removed from the text.

import string

In [None]:
def remove_punctuation(words):
    """Keep only the purely alphabetic tokens of ``words``.

    ``str.isalpha`` already rejects punctuation marks and empty strings, so no
    extra lookup in ``string.punctuation`` is needed (that lookup was a
    substring test that could never match an alphabetic token).

    Note that tokens containing digits or symbols (e.g. "1990s", "sci-fi",
    "n't") are dropped as well, not just pure punctuation.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The alphabetic tokens, in their original order.
    """
    return [word for word in words if word.isalpha()]

In [None]:
# Apply the alphabetic-token filter to every description's token list.
filtered = list(map(remove_punctuation, filtered_tokens))

In [None]:
# Inspect the cleaned tokens of the first few descriptions only.
filtered[:3]

In [None]:
# Now the data is tokenized and vectorization can be done. 


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity   # for similarity index and recommendation

In [None]:
# Re-join each cleaned token list into one space-separated string per movie.
df['filtered_description'] = [" ".join(words) for words in filtered]

In [None]:
# Preview the frame with the new 'filtered_description' column.
df.head()

In [None]:
# Vectorization: fit a TF-IDF model on the cleaned descriptions and encode them in one pass.

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=df["filtered_description"])

In [None]:
# Pairwise similarity between all movie descriptions.
# recommend() below does not read this matrix; it computes its own scores for the query.
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [None]:
# Next, we need a recommendation function that takes a description string from the user
# and recommends similar movies to the user.



# 1. Take input -> pre-process and convert -> calculate similarity -> Recommend top 5.



In [None]:
def recommend(input_string, df=df, vectorizer=vectorizer, tfidf_matrix=tfidf_matrix, top_n=5):
    """Return the names of the movies whose descriptions best match a text.

    The query is tokenized, lower-cased, stripped of stop words and of
    non-alphabetic tokens, transformed with the fitted ``vectorizer`` and
    compared to every row of ``tfidf_matrix`` by cosine similarity.

    Parameters
    ----------
    input_string : str
        Free-text description to match against.
    df : pandas.DataFrame
        Frame with a "Movie Name" column. Rows are looked up by position, so
        row ``i`` must correspond to row ``i`` of ``tfidf_matrix``.
    vectorizer
        Fitted vectorizer used to transform the query text.
    tfidf_matrix
        Matrix of document vectors the query is compared against.
    top_n : int, default 5
        Maximum number of movie names to return.

    Returns
    -------
    list
        Up to ``top_n`` movie names, most similar first. Movies with equal
        scores keep their dataset order.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.

    Notes
    -----
    The default ``df``, ``vectorizer`` and ``tfidf_matrix`` are bound when this
    cell runs; re-run the cell after rebuilding any of them.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Pre-process the input text: tokenize, lower-case, drop stop words, keep alphabetic tokens.
    query_tokens = [token.lower() for token in word_tokenize(input_string)]
    query_tokens = [token for token in query_tokens if token not in stops]
    query_text = " ".join(remove_punctuation(query_tokens))

    # Vectorize the query in the already-fitted TF-IDF space.
    query_vector = vectorizer.transform([query_text])

    # Similarity between the query and every document vector.
    scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Retrieve the best matches: a stable sort on the negated scores gives
    # descending order while keeping ties in dataset order.
    ranked_positions = (-scores).argsort(kind="stable")[:top_n]

    return [df["Movie Name"].iloc[position] for position in ranked_positions]
    

In [None]:
# Example usage of the function and similarity model.
# The row is picked by position (.iloc) so the lookup does not depend on the index labels.
# The query comes from the dataset itself, so expect its own movie among the results.

movie_description = df['description'].iloc[np.random.randint(0, len(df))]
print(movie_description)
print()
print("As per the description, the top 5 recommended picks are ")
recommend(movie_description)