<h2>Data Exploration & Preprocessing</h2>

In [2]:
import pandas as pd
import difflib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from tabulate import tabulate




In [3]:

# Read the MovieLens movie catalogue. The ratings file is left disabled:
# ratings = pd.read_csv('../dataset/raw/rating.csv')
# The row label of every movie is copied into an explicit 'index' column.
moviesData = (
    pd.read_csv('../dataset/raw/movie.csv')
    .assign(index=lambda frame: frame.index)
)
print(moviesData.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  index  
0  Adventure|Animation|Children|Comedy|Fantasy      0  
1                   Adventure|Children|Fantasy      1  
2                               Comedy|Romance      2  
3                         Comedy|Drama|Romance      3  
4                                       Comedy      4  


In [4]:
# Count the missing values in each column (duplicate rows are not examined here).
nullValues = moviesData.isna().sum()
nullValues



movieId    0
title      0
genres     0
index      0
dtype: int64

In [5]:
# Subset of the catalogue holding only the id, title and genres columns.
selected_columns = moviesData.loc[:, ['movieId', 'title', 'genres']]
selected_columns

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [7]:
# Replace any missing value in the selected columns with a single space,
# writing the result back into moviesData one column at a time.
for column_name in selected_columns.columns:
    moviesData[column_name] = moviesData[column_name].fillna(' ')
    

In [8]:
# Build one text string per movie in the form "<movieId> <title> <genres>".
id_text = moviesData['movieId'].astype(str)
title_text = moviesData['title']
genres_text = moviesData['genres']
merged_column = id_text + " " + title_text + " " + genres_text
merged_column.head()

0    1 Toy Story (1995) Adventure|Animation|Childre...
1          2 Jumanji (1995) Adventure|Children|Fantasy
2             3 Grumpier Old Men (1995) Comedy|Romance
3      4 Waiting to Exhale (1995) Comedy|Drama|Romance
4          5 Father of the Bride Part II (1995) Comedy
dtype: object

In [13]:
# Token counts over merged_column, i.e. the combined movieId + title + genres
# text of each movie (not the genres alone).
vectorizer=CountVectorizer()
feature_extraction=vectorizer.fit_transform(merged_column)

# Cosine similarity of the count matrix against itself: one score per pair of movies.
# NOTE(review): the result has one row and one column per movie, so its size
# grows quadratically with the catalogue — confirm it fits in memory.
similarity=cosine_similarity(feature_extraction)


In [32]:
#recommendation movies function
def reccomendMovie(movie_name,similarityMatrix,dataset, top_n=10):
    """Print a ranked table of the movies most similar to ``movie_name``.

    The requested name is fuzzy-matched against ``dataset['title']`` with
    ``difflib.get_close_matches``; the best match becomes the query movie.

    Parameters
    ----------
    movie_name : str
        Title (or an approximate title) to look up.
    similarityMatrix : 2-D indexable
        Pairwise similarity scores. Row ``i`` is read for the movie whose
        ``dataset['index']`` value is ``i``; position ``j`` within that row is
        resolved to a title through the row labelled ``j`` in ``dataset.index``.
    dataset : pandas.DataFrame
        Must provide the ``title`` and ``index`` columns.
    top_n : int, default 10
        Maximum number of recommendations to list.

    Returns
    -------
    str or None
        A "no match" message when no title resembles ``movie_name``;
        otherwise ``None`` once the table has been printed.
    """
    titles = dataset['title'].tolist()
    findCloseMatch = difflib.get_close_matches(movie_name, titles)

    if not findCloseMatch:
        return f"no Matches found of the movie: {movie_name}"

    closeMatch = findCloseMatch[0]
    indexOfMovie = dataset.loc[dataset['title'] == closeMatch, 'index'].values[0]

    sortedSimilarMovies = sorted(
        enumerate(similarityMatrix[indexOfMovie]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query movie by its index rather than assuming it sorts first:
    # when another movie ties with it, the stable sort may place that movie ahead,
    # and skipping "the first entry" would then drop a real recommendation and
    # list the query movie as similar to itself.
    candidates = [
        (index, score)
        for index, score in sortedSimilarMovies
        if index != indexOfMovie
    ][:max(top_n, 0)]

    recommendations = []
    for rank, (index, score) in enumerate(candidates, start=1):
        # Resolve titles from the frame that was passed in, not a notebook global.
        title = dataset.loc[dataset.index == index, 'title'].values[0]
        recommendations.append([rank, title, round(score, 2)])

    print(tabulate( recommendations,headers=["Rank","Title","Similarity Score"],tablefmt="fancy_grid"))
# Ask for a title, echo it back, then print the recommendation table for it.
movieName = input("Enter name of Movie: ")
print(f"Similar  movie to {movieName}")
reccomendMovie(
    movie_name=movieName,
    similarityMatrix=similarity,
    dataset=moviesData,
)

Similar  movie to Toy story (1995)
╒════════╤═══════════════════════════════════════════╤════════════════════╕
│   Rank │ Title                                     │   Similarity Score │
╞════════╪═══════════════════════════════════════════╪════════════════════╡
│      1 │ Toy Story 2 (1999)                        │               0.82 │
├────────┼───────────────────────────────────────────┼────────────────────┤
│      2 │ Toy Story 3 (2010)                        │               0.78 │
├────────┼───────────────────────────────────────────┼────────────────────┤
│      3 │ Toy Story Toons: Hawaiian Vacation (2011) │               0.71 │
├────────┼───────────────────────────────────────────┼────────────────────┤
│      4 │ Toy Story Toons: Small Fry (2011)         │               0.71 │
├────────┼───────────────────────────────────────────┼────────────────────┤
│      5 │ Jumanji (1995)                            │               0.63 │
├────────┼───────────────────────────────────────────

In [43]:

# Split movieId (x) and genres (y) into 70% training / 30% testing rows with a fixed seed.
x, y = moviesData['movieId'], moviesData['genres']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)
# Report the size of each partition.
for label, subset in (("training data set", x_train), ("testing data set", x_test)):
    print(label, len(subset))

training data set 19094
testing data set 8184
