# **PREREQUISITES**

In [77]:
# Importing necessary libraries

# For numerical operations and data manipulation
import numpy as np
import pandas as pd

# For text processing and feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# For measuring similarity between feature vectors
from sklearn.metrics.pairwise import cosine_similarity

# For stemming words in text data
from nltk.stem.porter import PorterStemmer

In [78]:
# Load the raw movie catalogue from 'data.csv' (path is relative to the working directory)
movies = pd.read_csv('data.csv')

In [79]:
# Renaming columns for better readability.
# The result is rebound instead of using inplace=True, which is the form the
# pandas documentation recommends and which keeps the step chainable.
movies = movies.rename(columns={'Unnamed: 0': 'id', 'Movie Name': 'title'})

In [80]:
# Selecting relevant columns.
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not trigger pandas' SettingWithCopyWarning.
movies = movies[['id', 'title', 'Genre', 'Director', 'Stars', 'Description']].copy()

In [81]:
# Missing-value check: count the NaN entries in each column (nothing is dropped here)
movies.isna().sum()

id             0
title          0
Genre          0
Director       0
Stars          0
Description    0
dtype: int64

In [82]:
# Count fully duplicated rows (this only reports them; nothing is dropped here)
movies.duplicated().sum()

0

In [83]:
# Cleaning text data in columns
def clean_text(x):
    """Strip list punctuation from a string and split it into tokens.

    Square brackets and single quotes are removed, commas become spaces,
    and the result is split on whitespace.

    Parameters
    ----------
    x : str
        Raw cell text, e.g. "['Tim Robbins', 'Morgan Freeman']".

    Returns
    -------
    list of str
        The whitespace-separated tokens (empty list for an empty string).
    """
    # One translation pass: None deletes the character, " " replaces it.
    table = str.maketrans({"[": None, "]": None, "'": None, ",": " "})
    return x.translate(table).split()

In [84]:
# Turn each raw text column into a list of whitespace-separated tokens
columns_to_clean = ['Genre', 'Director', 'Stars', 'Description']
for col in columns_to_clean:
    movies[col] = movies[col].map(clean_text)

In [85]:
# Preview the first row to confirm the text columns are now token lists
movies.head(1)

Unnamed: 0,id,title,Genre,Director,Stars,Description
0,0,The Shawshank Redemption,[Drama],"[Frank, Darabont]","[Tim, Robbins, Morgan, Freeman, Bob, Gunton, W...","[Over, the, course, of, several, years, two, c..."


In [86]:
# Build 'tags' by concatenating each row's four token lists into one list
movies['tags'] = (
    movies['Genre']
    + movies['Director']
    + movies['Stars']
    + movies['Description']
)

In [87]:
# Preview the first row to check the new 'tags' column
movies.head(1)

Unnamed: 0,id,title,Genre,Director,Stars,Description,tags
0,0,The Shawshank Redemption,[Drama],"[Frank, Darabont]","[Tim, Robbins, Morgan, Freeman, Bob, Gunton, W...","[Over, the, course, of, several, years, two, c...","[Drama, Frank, Darabont, Tim, Robbins, Morgan,..."


In [88]:
# Lowercase and stem every token, then join each row's tokens into one
# space-separated string
ps = PorterStemmer()
movies['tags'] = movies['tags'].apply(
    lambda tokens: " ".join(ps.stem(token.lower()) for token in tokens)
)

In [89]:
# Preview the first row to check that 'tags' is now a single lowercased, stemmed string
movies.head(1)

Unnamed: 0,id,title,Genre,Director,Stars,Description,tags
0,0,The Shawshank Redemption,[Drama],"[Frank, Darabont]","[Tim, Robbins, Morgan, Freeman, Bob, Gunton, W...","[Over, the, course, of, several, years, two, c...",drama frank darabont tim robbin morgan freeman...


In [90]:
# Bag-of-words features from 'tags': English stop words are dropped and the
# vocabulary is capped at 5000 terms; the sparse result is converted to a dense array
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = cv.fit_transform(movies['tags']).toarray()

In [91]:
# Inspect the dense count matrix built from the 'tags' column
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [92]:
# Calculating the pairwise cosine similarity between all rows of `vectors`
similarity = cosine_similarity(vectors)

In [93]:
# Inspect the similarity matrix
similarity

array([[1.        , 0.03615508, 0.        , ..., 0.        , 0.06726728,
        0.04756515],
       [0.03615508, 1.        , 0.        , ..., 0.22951012, 0.04134491,
        0.02923527],
       [0.        , 0.        , 1.        , ..., 0.04415108, 0.        ,
        0.        ],
       ...,
       [0.        , 0.22951012, 0.04415108, ..., 1.        , 0.05337605,
        0.        ],
       [0.06726728, 0.04134491, 0.        , ..., 0.05337605, 1.        ,
        0.10878566],
       [0.04756515, 0.02923527, 0.        , ..., 0.        , 0.10878566,
        1.        ]])

In [97]:
# Recommender function based on cosine similarity
def recommend(movie, top_n=4):
    """Print the titles of the movies most similar to `movie`.

    Similarity scores are read from the module-level `similarity` matrix and
    titles from the module-level `movies` frame.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If the title occurs
        more than once, the first matching row is used.
    top_n : int, default 4
        Maximum number of recommendations to print. Fewer are printed when
        the dataset does not contain that many other movies.

    Raises
    ------
    IndexError
        If no row of `movies` has the given title.
    """
    # Look the row up by position: `similarity` and `.iloc` are positional,
    # so an index label would only be correct for a default RangeIndex.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError("Movie '{}' was not found in the dataset".format(movie))
    movie_index = int(matches[0])

    distances = similarity[movie_index]
    # Exclude the query movie explicitly instead of assuming it sorts first:
    # another movie with identical tags also scores 1.0 and, because the sort
    # is stable, would otherwise push the query movie into its own results.
    movie_list = sorted(
        ((idx, score) for idx, score in enumerate(distances) if idx != movie_index),
        reverse=True,
        key=lambda x: x[1],
    )[:top_n]

    print("Top {} recommended movies similar to '{}':".format(top_n, movie))
    for rank, (idx, _score) in enumerate(movie_list, start=1):
        print("{}. {}".format(rank, movies.iloc[idx]['title']))

In [98]:
# Example usage: print the recommendations for one title
recommend('The Avengers')

Top 4 recommended movies similar to 'The Avengers':
1. Avengers: Age of Ultron
2. Captain America: Civil War
3. Avengers: Infinity War
4. Captain America: The Winter Soldier


# Storing the recommendations as a new, smaller dataset to reduce the file size

In [100]:
def recommend1(movie):
    """Return the five titles with the highest similarity to `movie`.

    The ranking is taken from the module-level `similarity` matrix and the
    titles from the module-level `movies` frame. The five highest-scoring
    rows are returned as ranked, without removing the row of `movie` itself.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``.

    Returns
    -------
    list of str
        Up to five titles, ordered from most to least similar.

    Raises
    ------
    IndexError
        If no row of `movies` has the given title.
    """
    row_position = movies[movies['title'] == movie].index[0]
    ranked = sorted(
        enumerate(similarity[row_position]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [movies.iloc[position].title for position, _score in ranked[:5]]

In [102]:
# Precompute the recommendation list for every title
movies['recommend'] = movies['title'].map(recommend1)

In [103]:
# Preview the first row to check the new 'recommend' column
movies.head(1)

Unnamed: 0,id,title,Genre,Director,Stars,Description,tags,recommend
0,0,The Shawshank Redemption,[Drama],"[Frank, Darabont]","[Tim, Robbins, Morgan, Freeman, Bob, Gunton, W...","[Over, the, course, of, several, years, two, c...",drama frank darabont tim robbin morgan freeman...,"[The Shawshank Redemption, The Secret Life of ..."


In [104]:
# Keep only the title and its precomputed recommendations
movies = movies.loc[:, ['title', 'recommend']]

In [105]:
# Preview the reduced table (title and its stored recommendations)
movies.head()

Unnamed: 0,title,recommend
0,The Shawshank Redemption,"[The Shawshank Redemption, The Secret Life of ..."
1,The Godfather,"[The Godfather, The Godfather Part III, The Go..."
2,Ramayana: The Legend of Prince Rama,"[Ramayana: The Legend of Prince Rama, Attack o..."
3,The Chaos Class,"[The Chaos Class, Goodbye, Children, Exam, Hic..."
4,The Dark Knight,"[The Dark Knight, Batman Begins, The Dark Knig..."


In [112]:
# Title to look up in the precomputed recommendations
name='Batman Begins'

In [113]:
# Fetch the stored recommendation list of the first row whose title equals `name`
movies.loc[movies['title'] == name, 'recommend'].iloc[0]

['Batman Begins',
 'The Dark Knight',
 'Game of Death',
 'Cold Pursuit',
 'The Dark Knight Rises']

## **Now, with this small workaround (jugaad), we can ship the dataset without using Git LFS**

In [115]:
# Library to change file into pickle
import pickle

In [116]:
# Storing as recommend.pkl file.
# The context manager closes the file handle even if pickling fails, so the
# data is flushed to disk and no open handle is leaked in the kernel.
with open('recommend.pkl', 'wb') as pkl_file:
    pickle.dump(movies, pkl_file)