In [1]:
'''

used Imports

'''
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from rake_nltk import Rake
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

In [2]:
'''
Lemmatization function to return words to root value 
It take @param String and return each word in this
string to root value the return new string 

'''
def lemmatize_sentence(sentence):
    wordnet_lemmatizer = WordNetLemmatizer()
    token_words=word_tokenize(sentence)

    lemm_sentence=[]
    for word, tag in pos_tag(token_words):

        if tag.startswith("NN"):
            lemm_sentence.append(wordnet_lemmatizer.lemmatize(word, pos='n'))
            lemm_sentence.append(" ")

        elif tag.startswith("VB"):
            lemm_sentence.append(wordnet_lemmatizer.lemmatize(word, pos='v'))
            lemm_sentence.append(" ")
      
        elif tag.startswith("JJ"):
            lemm_sentence.append(wordnet_lemmatizer.lemmatize(word, pos='a'))
            lemm_sentence.append(" ")

        else:
            lemm_sentence.append(wordnet_lemmatizer.lemmatize(word))
            lemm_sentence.append(" ")
      

    return "".join(lemm_sentence)

In [3]:
'''

Function to remove stop words from string 
@param String and return string without
stop words

'''
def clean_sentence(sentence):
    stop_words = set(stopwords.words('english')) 
    stop_words.update((',', '.', '!', '?', '&', "-", ";", ":"))
  
    word_tokens = word_tokenize(sentence) 
  
    filtered_sentence = [word for word in word_tokens if not word in stop_words] 
  
    filtered_sentence = [] 
    for word in word_tokens: 
        if word not in stop_words: 
            filtered_sentence.append(word) 
            filtered_sentence.append(" ")
  
    return "".join(filtered_sentence)

In [4]:
'''

Function to read csv_file

'''
def read_csv(csv_file, columns):
    df = pd.read_csv(csv_file)
    df = df[columns]

    return df

In [5]:
def organize_data(df):
    
  # discarding the commas between the actors' full names and getting only the first three names
    df['Actors'] = df['Actors'].map(lambda x: x.split(',')[:3])

  # putting the genres in a list of words
    df['Genre'] = df['Genre'].map(lambda x: x.lower().split(','))

  # putting the Director in a list 
    df['Director'] = df['Director'].map(lambda x: x.split(' '))

  # merging together first and last name for each actor and director, so it's considered as one word 
  # and there is no mix up between people sharing a first name
    for index, row in df.iterrows():
        row['Actors'] = [x.lower().replace(' ','') for x in row['Actors']]
        row['Director'] = ''.join(row['Director']).lower()
      
    return df

In [6]:
'''

Function take description and return it
into list of word

'''
def description_to_list(description):
    description_list = list(description.split())

    return description_list

In [7]:
def clean_key_words(df):
    # initializing the new column
    df['Key_words'] = ""
    for index, row in df.iterrows():
        plot = row['Plot']
      
        lemmatize_words = lemmatize_sentence(plot)
  
        words_without_stop_words = clean_sentence(lemmatize_words)
  
        # assigning the key words to the new column
        row['Key_words'] = description_to_list(words_without_stop_words)

    # dropping the Plot column
    df.drop(columns = ['Plot'], inplace = True)

    df.set_index('Title', inplace = True)

    return df


In [8]:
def create_bag_of_words(df):
    df['bag_of_words'] = ''
    columns = df.columns
    for index, row in df.iterrows():
        words = ''
        for col in columns:
            if col != 'Director':
                words = words + ' '.join(row[col])+ ' '
            else:
                words = words + row[col]+ ' '
        row['bag_of_words'] = words
    
    df.drop(columns = [col for col in df.columns if col!= 'bag_of_words'], inplace = True)
  
    return df

In [9]:
def Tfidf_Vectorization(df):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_vectorizer_matrix = vectorizer.fit_transform(df['bag_of_words'])
    
    indices = pd.Series(df.index)
      
    return tfidf_vectorizer_matrix, indices

In [10]:
def calc_cosine_similarity(tfidf_vectorizer_matrix):
    cosine_similarity_matrix = cosine_similarity(tfidf_vectorizer_matrix, tfidf_vectorizer_matrix)
  
    return cosine_similarity_matrix


In [11]:
# function that takes in movie title as input and returns the top 10 recommended movies
def recommendations(title, cosine_similarity_matrix, indices):
    
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_similarity_matrix[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:10].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
        
    return recommended_movies

In [12]:
if __name__ =='__main__' :
    df = read_csv('/home/mohamedessam/Movies-Recommendation/movies.csv', ['Title','Genre','Director','Actors','Plot'])
    organize_data(df)
    clean_key_words(df)
    create_bag_of_words(df)
    tfidf_vectorizer_matrix, indices = Tfidf_Vectorization(df)
    cosine_similarity_matrix = calc_cosine_similarity(tfidf_vectorizer_matrix)
    recommended_movies = recommendations("The Godfather", cosine_similarity_matrix, indices) 
    print(recommended_movies)

['The Godfather: Part II', 'Apocalypse Now', 'Scarface', 'On the Waterfront', 'The Night of the Hunter', 'Casino', 'Guardians of the Galaxy', 'Heat', 'A Streetcar Named Desire']
