In [1]:
# Importing necessary libraries for data manipulation and analysis
import pandas as pd 
import numpy as np
# Importing libraries for text processing and similarity calculations
from sklearn.feature_extraction.text import CountVectorizer
# Importing NLTK library for natural language processing tasks
import nltk
# Importing PorterStemmer for stemming words in text data
from nltk.stem.porter import PorterStemmer as ps

In [2]:
# Load the movie dataset from 'movie_df.csv' into a pandas DataFrame.
# The path is relative to the notebook's working directory; pandas raises
# FileNotFoundError when the file is not present there.
df = pd.read_csv('movie_df.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'movie_df.csv'

In [None]:
# Display 5 randomly chosen rows to get a feel for the raw data.
# No random_state is passed, so a different set of rows is shown on every run.
df.sample(5)

In [None]:
# Keep only the columns that are used to build the recommender.
# .copy() makes `df` an independent DataFrame rather than a selection of the
# loaded frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
df = df[["id","title","genre_ids","overview","popularity","keywords","cast","crew"]].copy()

In [None]:
# Display 5 randomly chosen rows of the reduced DataFrame
# (unseeded, so the rows differ between runs).
df.sample(5)

In [None]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple
df.shape

In [None]:
# Total number of cells in the DataFrame (number of rows * number of columns)
df.size

In [None]:
# Print a summary of the DataFrame: column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value.
# `df` is rebound to the result instead of using inplace=True: this keeps the
# step chainable and avoids pandas' chained-assignment warning when `df` was
# itself derived from a selection of another frame.
df = df.dropna()
# Confirm that no missing values remain (every count should now be 0)
df.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows and renumber the index 0..n-1.
# Rows have been removed by now, so the original index labels have gaps;
# resetting keeps labels equal to row positions, which later cells rely on when
# they index by 0 or use an index label as a row number of the similarity matrix.
df = df.drop_duplicates().reset_index(drop=True)
# Confirm that no duplicated rows remain (should be 0)
df.duplicated().sum()

In [None]:
# Compute the 33rd and 66th percentiles of the 'popularity' column.
# The result is a Series keyed by 0.33 and 0.66; categorize() reads it as cut points.
percentiles = df['popularity'].quantile([0.33, 0.66])

# Map a popularity score to one of three band labels
def categorize(value, low=None, high=None):
    """Return the popularity band label for ``value``.

    Parameters
    ----------
    value : number
        The popularity score to classify.
    low : number, optional
        Upper bound (inclusive) of the 'lessPopular' band. Defaults to the
        notebook-level ``percentiles[0.33]``.
    high : number, optional
        Upper bound (inclusive) of the 'mediumPopular' band. Defaults to the
        notebook-level ``percentiles[0.66]``.

    Returns
    -------
    str
        'lessPopular' if ``value <= low``, 'mediumPopular' if
        ``low < value <= high``, otherwise 'Popular'.
    """
    # Fall back to the globally computed cut points so that the one-argument
    # call used with Series.apply keeps working unchanged.
    if low is None:
        low = percentiles[0.33]
    if high is None:
        high = percentiles[0.66]

    if value <= low:
        return 'lessPopular'
    if value <= high:
        return 'mediumPopular'
    # Anything above the upper cut point (a NaN also ends up here, because
    # both comparisons above are False for NaN).
    return 'Popular'

# Replace every popularity score with its band label from categorize().
# NOTE(review): this overwrites the column the percentiles were computed from, so
# this cell presumably cannot be re-run without reloading `df` — confirm.
df['popularity'] = df['popularity'].apply(categorize)


In [None]:
# Inspect 5 random rows to check the new 'popularity' labels (unseeded sample)
df.sample(5)

In [None]:
# Split a string into a list of words
def string_to_list(col):
    """Split the string ``col`` into a list of its words.

    Splitting is done on any run of whitespace (spaces, tabs, newlines), so
    repeated or leading/trailing spaces never produce empty-string tokens and
    an empty string yields an empty list.

    Parameters
    ----------
    col : str
        The text to split.

    Returns
    -------
    list of str
        The words of ``col`` in their original order.
    """
    return col.split()

# Turn the plain-text columns into lists of words: first 'overview', then
# 'popularity'. Each string is replaced by the list string_to_list() returns.
for text_column in ('overview', 'popularity'):
    df[text_column] = df[text_column].apply(string_to_list)


In [None]:
# Check the result: show the first 5 rows
df.head(5)

In [None]:
# Remove all whitespace from every value of the 'keywords', 'cast' and 'crew'
# columns (e.g. 'a b' becomes 'ab'); the columns are updated one after another.
for name_column in ('keywords', 'cast', 'crew'):
    df[name_column] = df[name_column].apply(lambda raw: ''.join(raw.split()))


In [None]:
# Check the result: show the first row
df.head(1)

In [None]:
import ast

# 'genre_ids', 'keywords', 'cast' and 'crew' hold the string representation of a
# Python literal. ast.literal_eval (from the Abstract Syntax Trees module)
# evaluates such a string safely, accepting only literals, and returns the
# actual Python object.
for literal_column in ('genre_ids', 'keywords', 'cast', 'crew'):
    df[literal_column] = df[literal_column].apply(ast.literal_eval)

# These four columns now contain the parsed objects instead of their string form

In [None]:
# Check the parsed columns: show the first 5 rows
df.head()

In [None]:
# Create a new 'tags' column by adding the 'genre_ids', 'overview', 'popularity',
# 'keywords', 'cast' and 'crew' columns together row-wise with `+`.
# NOTE(review): assumes every one of these columns holds a Python list at this
# point, so `+` concatenates them into a single list per row — confirm against the data.
df['tags'] = df['genre_ids'] + df['overview'] + df['popularity'] + df['keywords'] + df['cast'] + df['crew'] 

In [None]:
# Check the new 'tags' column: show the first 5 rows
df.head()

In [None]:
# Create 'movies_df' from the 'id', 'title' and 'tags' columns of 'df'.
# .copy() makes it an independent DataFrame, so the assignments to
# movies_df['tags'] in the following cells do not raise SettingWithCopyWarning.
movies_df = df[['id','title','tags']].copy()

In [None]:
# Check the new DataFrame: show the first 5 rows
movies_df.head()

In [None]:
# Join each row's list of tags into one space-separated string.
# Every item goes through str() first: str.join raises TypeError on non-string
# items, which would happen if e.g. 'genre_ids' was parsed into integers.
movies_df['tags'] = movies_df['tags'].apply(lambda tokens: " ".join(map(str, tokens)))

In [None]:
# Lower-case every tag string in the 'tags' column
movies_df['tags'] = movies_df['tags'].str.lower()

In [None]:
# Check the result: show the tag string of the first row.
# .iloc[0] selects by position; plain [0] is a label lookup and raises KeyError
# when the row labelled 0 was removed by an earlier dropna/drop_duplicates.
movies_df['tags'].iloc[0]

In [None]:
# Inspect 2 random rows of movies_df (unseeded sample)
movies_df.sample(2)

In [None]:
from nltk.stem import PorterStemmer

# Create the PorterStemmer instance that stem() uses.
# This rebinds `ps`: the first cell imported the PorterStemmer class itself under
# that name, and from here on `ps` refers to an instance instead.
ps = PorterStemmer()

# Stem every word of a string
def stem(text):
    """Return ``text`` with each whitespace-separated word replaced by ``ps.stem(word)``.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


In [None]:
# Replace every tag string with its stemmed version
movies_df['tags'] = movies_df['tags'].map(stem)

In [None]:
# Create a CountVectorizer limited to 8000 features that uses scikit-learn's
# built-in English stop-word list.
# NOTE(review): the tags were stemmed in an earlier cell, so stemmed forms of
# stop words may no longer match that list — confirm whether this matters.
cv = CountVectorizer(max_features=8000, stop_words='english')

# Learn the vocabulary from the tag strings and encode every movie as a vector of term counts
vectorized_data = cv.fit_transform(movies_df['tags'])

# vectorized_data now holds the Bag-of-Words (BoW) representation of the tags

In [None]:
# Convert the sparse matrix into a dense array
vectorized_array = vectorized_data.toarray()

In [None]:
# Display the dense array
vectorized_array

In [None]:
# Dimensions of the array as (number of rows, number of columns)
vectorized_array.shape

In [None]:
# Show the feature names (vocabulary terms) learned by the CountVectorizer
cv.get_feature_names_out()

In [None]:
# Import the cosine_similarity function from scikit-learn.
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between every pair of rows of vectorized_array;
# recommend() reads row i of the result as the scores of movie i against all movies.
cosine_sim = cosine_similarity(vectorized_array)


In [None]:
# Check the shape of the similarity matrix
cosine_sim.shape

In [None]:
# Print the titles of the movies most similar to a given movie
def recommend(movie, top_n=5, movies=None, similarity=None):
    """Print the titles of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the 'title' column.
    top_n : int, optional
        Number of recommendations to print (default 5).
    movies : pandas.DataFrame, optional
        Frame with a 'title' column whose row order matches ``similarity``.
        Defaults to the notebook-level ``movies_df``.
    similarity : 2-D array-like, optional
        Square matrix in which row ``i`` holds the similarity of movie ``i`` to
        every movie. Defaults to the notebook-level ``cosine_sim``.

    Returns
    -------
    None
        The titles are printed, one per line. If the title is not found, a
        message is printed instead of raising IndexError.
    """
    # Fall back to the notebook-level objects so recommend('Title') keeps working.
    if movies is None:
        movies = movies_df
    if similarity is None:
        similarity = cosine_sim

    # The similarity matrix is addressed by row *position*. Index labels only
    # equal positions while no rows have been dropped, so the position is
    # looked up explicitly instead of using the label from `.index`.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        print(f"Movie '{movie}' not found in the dataset.")
        return
    position = int(matches[0])

    # Rank all movies by similarity to the query, highest first
    # (the sort is stable, so ties keep their original row order).
    scores = similarity[position]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie by position rather than assuming it sorts first:
    # another movie with identical tags also scores 1.0.
    neighbours = [row for row, _ in ranked if row != position][:top_n]

    for row in neighbours:
        print(movies['title'].iloc[row])


In [None]:
# Print the movies recommended for 'Dilwale Dulhania Le Jayenge'
recommend('Dilwale Dulhania Le Jayenge')

In [None]:
# Show the row(s) of movies_df whose 'id' equals 19404
movies_df[movies_df['id'] == 19404]