<h2>Data Exploration & Preprocessing</h2>

In [1]:
import pandas as pd  # For data manipulation and analysis
import difflib  # For finding close matches of strings
from sklearn.feature_extraction.text import CountVectorizer  # For converting text data into feature vectors
from sklearn.metrics.pairwise import cosine_similarity  # For measuring similarity between feature vectors
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.metrics import  accuracy_score  # For performance evaluation
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes algorithm for classification
from tabulate import tabulate  # For displaying tabular data in a nicely formatted way

# Load the MovieLens dataset
moviesData = pd.read_csv('../dataset/raw/movie.csv')  # Load movie data
moviesData['index'] = moviesData.index  # Add an index column for reference
print(moviesData.head())  # Display the first few rows of the dataset


   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  index  
0  Adventure|Animation|Children|Comedy|Fantasy      0  
1                   Adventure|Children|Fantasy      1  
2                               Comedy|Romance      2  
3                         Comedy|Drama|Romance      3  
4                                       Comedy      4  


<h2>Data Preprocessing <h2>

In [None]:
# checking for missing values and duplicates
nullValues=moviesData.isnull().sum()
print(nullValues)

# Combining multiple columns into a single string for feature extraction
merged_column=moviesData['movieId'].astype(str)+" "+moviesData['title']+" "+moviesData['genres']
merged_column.head()     # Display the first few merged rows



movieId    0
title      0
genres     0
index      0
dtype: int64


0    1 Toy Story (1995) Adventure|Animation|Childre...
1          2 Jumanji (1995) Adventure|Children|Fantasy
2             3 Grumpier Old Men (1995) Comedy|Romance
3      4 Waiting to Exhale (1995) Comedy|Drama|Romance
4          5 Father of the Bride Part II (1995) Comedy
dtype: object

In [7]:
# Vectorizing the merged column to extract features
vectorizer=CountVectorizer()  # Initialize CountVectorizer
feature_extraction=vectorizer.fit_transform(merged_column.head())  # Convert text data into numerical vectors

# Calculating cosine similarity between feature vectors
similarity=cosine_similarity(feature_extraction)
similarity

array([[1.        , 0.63245553, 0.28867513, 0.26726124, 0.25      ],
       [0.63245553, 1.        , 0.18257419, 0.16903085, 0.15811388],
       [0.28867513, 0.18257419, 1.        , 0.46291005, 0.28867513],
       [0.26726124, 0.16903085, 0.46291005, 1.        , 0.26726124],
       [0.25      , 0.15811388, 0.28867513, 0.26726124, 1.        ]])

In [16]:
# Function to recommend movies based on a given movie name
def reccomendMovie(movie_name,similarityMatrix,dataset, top_n=10):
    titles=dataset['title'].tolist()
    findCloseMatch=difflib.get_close_matches(movie_name,titles)

    if not findCloseMatch:
        return f"no Matches found of the movie: {movie_name}"
    
    closeMatch=findCloseMatch[0]

    # Find the index of the matched movie
    indexOfMovie=dataset[dataset.title==closeMatch]['index'].values[0]
    
    # Get similarity scores for the matched movie
    similarityScore=list(enumerate(similarityMatrix[indexOfMovie]))
    # Sort the movies by similarity scores 
    sortedSimilarMovies=sorted(similarityScore,key=lambda x:x[1])
    
    # Generate a list of recommendations
    recommendations=[]
    for i, (index, score) in enumerate(sortedSimilarMovies[1:top_n + 1], start=1):
        title = moviesData[moviesData.index == index]['title'].values[0]
        recommendations.append([i, title, round(score, 2)])
            # return recommendations
    
    print(tabulate( recommendations,headers=["Rank","Title","Similarity Score"],tablefmt="fancy_grid"))
movieName=input("Enter name of Movie: ")
print(f"Similar  movie to {movieName}")
reccomendMovie(movieName,similarity,moviesData)   

Similar  movie to Jumanji 
╒════════╤══════════════════════════╤════════════════════╕
│   Rank │ Title                    │   Similarity Score │
╞════════╪══════════════════════════╪════════════════════╡
│      1 │ Waiting to Exhale (1995) │               0.17 │
├────────┼──────────────────────────┼────────────────────┤
│      2 │ Grumpier Old Men (1995)  │               0.18 │
├────────┼──────────────────────────┼────────────────────┤
│      3 │ Toy Story (1995)         │               0.63 │
├────────┼──────────────────────────┼────────────────────┤
│      4 │ Jumanji (1995)           │               1    │
╘════════╧══════════════════════════╧════════════════════╛


In [14]:

x=moviesData['title']
y=moviesData['genres']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.5,random_state=42)
print("training data set",len(x_train))
print("testing data set",len(x_test))

training data set 13639
testing data set 13639


In [21]:
x_train_transformed=vectorizer.fit_transform(x_train)
x_test_transformed=vectorizer.transform(x_test)

In [None]:
model=MultinomialNB()
model.fit(x_train_transformed,y_train)

In [None]:
y_pred=model.predict(x_train_transformed)
accuracy=accuracy_score(y_test,y_pred)
print(f"Accuracy :", accuracy)