In [None]:
import pandas as pd
import numpy as np
import ast
import json
import matplotlib.pyplot as plt
%matplotlib inline
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
df1 = pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('tmdb_5000_movies.csv')

ParserError: ignored

### DATA CLEANING

In [None]:
# observe Dataset_1
df1.shape

In [None]:
# observe Dataset_2
df2.shape

In [None]:
# For every df1 column, report how many cells are NaN and how many are empty strings
for column_name, column_values in df1.items():
    missing_total = column_values.isna().sum()
    blank_total = column_values.eq('').sum()
    print(f"Column {column_name}: NaN count = {missing_total}, Empty count = {blank_total}")

In [None]:
# Parse the stringified 'cast' column into Python objects
# (values that are not strings are passed through unchanged)
def _parse_cast(raw):
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw

df1["cast"] = df1["cast"].apply(_parse_cast)

# New df1 column 'cast_names': the "name" field of every cast entry, in order
df1["cast_names"] = df1["cast"].apply(lambda members: [member["name"] for member in members])

In [None]:
# Decode the crew JSON of the row labelled 1 and print each entry's job title.
# NOTE: this reads the raw JSON string, so it must run before 'crew' is parsed in place.
crew_list = json.loads(df1.loc[1, 'crew'])
for entry in crew_list:
    print(entry['job'])

In [None]:
# extracting the top 3 fields
#Director: The director has a significant impact on the overall vision and style of a movie, including the tone, pacing, camera work, and performance direction. Movies directed by the same director may have similar themes, visual styles, or narrative techniques, which can be used for recommendation purposes.

#Screenplay writer: The screenplay is the foundation of a movie, providing the story, characters, dialogue, and structure. Similarities between movies based on the same source material or with similar themes, genres, or narrative structures can be identified and used for recommendation.

#Producer: The producer oversees the financial and logistical aspects of a movie, including casting, hiring, scheduling, and marketing. The production company or studio associated with a movie may have a specific brand or target audience, which can be used for recommendation purposes. Additionally, producers may have a track record of successful movies or collaborations with specific directors or actors, which can also be used as a recommendation feature.

# Parse the stringified 'crew' column into Python objects (non-strings pass through unchanged)
df1["crew"] = df1["crew"].apply(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)


def _names_with_job(crew, job):
    """Return the "name" of every crew entry whose "job" equals `job`, in order."""
    return [member["name"] for member in crew if member["job"] == job]


# Director: the first matching name, or None when no crew entry is a director
df1["director"] = df1["crew"].apply(lambda crew: next(iter(_names_with_job(crew, "Director")), None))
# Producer and screenplay writer: every matching name, kept as a list
df1["producer"] = df1["crew"].apply(lambda crew: _names_with_job(crew, "Producer"))
df1["screenplay_writer"] = df1["crew"].apply(lambda crew: _names_with_job(crew, "Screenplay"))

In [None]:
df1.head()

In [None]:
# For every df2 column, report how many cells are NaN and how many are empty strings
for column_name, column_values in df2.items():
    missing_total = column_values.isna().sum()
    blank_total = column_values.eq('').sum()
    print(f"Column {column_name}: NaN count = {missing_total}, Empty count = {blank_total}")

In [None]:
# convert the 'genres' column to dtype object
df2['genres'] = df2['genres'].astype(object)

# specify the key to extract
key = 'name'


def _extract_values(raw, field=key):
    """Return the truthy `field` values from a (possibly stringified) list of dicts.

    Strings are parsed with ast.literal_eval. Anything that is not a list after
    parsing (for example NaN) yields an empty list instead of raising.
    `field` is bound when the function is defined, so later rebinding of the
    global `key` does not change what is extracted.
    """
    records = ast.literal_eval(raw) if isinstance(raw, str) else raw
    if not isinstance(records, list):
        return []
    return [record[field] for record in records if record.get(field)]


# create a new column called 'genre_list': one list of values per row.
# apply() keeps each result aligned with its own row; the previous loop wrote with
# df2.at[i, ...] using the enumerate counter as an index label, which only hits
# the right rows when the index happens to be 0..n-1.
df2['genre_list'] = df2['genres'].apply(_extract_values)

In [None]:
#Unpacking Keywords Column

# convert the 'keywords' column to dtype object
df2['keywords'] = df2['keywords'].astype(object)

# specify the key to extract
key = 'name'


def _extract_values(raw, field=key):
    """Return the truthy `field` values from a (possibly stringified) list of dicts.

    Strings are parsed with ast.literal_eval. Anything that is not a list after
    parsing (for example NaN) yields an empty list instead of raising.
    `field` is bound when the function is defined, so later rebinding of the
    global `key` does not change what is extracted.
    """
    records = ast.literal_eval(raw) if isinstance(raw, str) else raw
    if not isinstance(records, list):
        return []
    return [record[field] for record in records if record.get(field)]


# create a new column called 'keywords_unpacked': one list of values per row.
# apply() keeps each result aligned with its own row; the previous loop wrote with
# df2.at[i, ...] using the enumerate counter as an index label, which only hits
# the right rows when the index happens to be 0..n-1.
df2['keywords_unpacked'] = df2['keywords'].apply(_extract_values)

In [None]:
# Movies with no overview: the 'original_title' of every row whose 'overview' is NaN
nan_titles = df2.loc[df2['overview'].isna(), 'original_title'].tolist()

In [None]:
# Select the rows whose overview is NaN and keep their title and homepage as records
missing_overview = df2['overview'].isna()
data = df2.loc[missing_overview, ['original_title', 'homepage']].to_dict('records')

# Build the summary dataframe from those records and display it
df_sub = pd.DataFrame(data)
df_sub

In [None]:
# Manually written overviews for the movies whose overview is missing, keyed by movie title.
dict1 = {
'The Dark Knight': 'A superhero action-thriller where Batman battles the Joker to save Gotham City. The movie showcases intense fight scenes and explores the dark psychological struggles of the characters.',
'Inside Out': 'An animated adventure movie that explores the emotions and memories of a young girl. The film provides a creative portrayal of complex emotions and has a heartwarming message about growing up.',
'Guardians of the Galaxy': 'A superhero space opera where a group of misfits team up to save the galaxy from a powerful villain. The movie features an eclectic soundtrack and a mix of action, humor, and heart.',
'Interstellar': 'A science fiction movie where a group of astronauts travel through a wormhole in search of a new home for humanity. The film combines stunning visuals with complex theories of space and time.',
'Inception': 'A mind-bending heist movie where a thief steals information by entering people’s dreams. The movie is known for its intricate plot and stunning visual effects that keep the audience on the edge of their seats.',
'The Lord of the Rings: The Fellowship of the Ring': 'An epic fantasy movie where a young hobbit must destroy a powerful ring to save Middle-earth from evil. The movie features breathtaking landscapes, epic battles, and memorable characters.',
'Django Unchained': 'A western drama where a freed slave teams up with a bounty hunter to rescue his wife from a brutal plantation owner. The film is known for its gritty realism and powerful performances.',
'The Wolf of Wall Street': 'A biographical black comedy movie that follows the rise and fall of a corrupt stockbroker. The movie showcases the excess and corruption of Wall Street in the 1990s and features a dynamic performance from Leonardo DiCaprio.',
'The Lord of the Rings: The Return of the King': 'The final installment of the epic fantasy trilogy where the fate of Middle-earth is decided in a climactic battle. The movie provides a satisfying conclusion to the story with stunning action sequences and emotional moments.',
'The Lord of the Rings: The Two Towers': 'The second installment of the epic fantasy trilogy where the fellowship is scattered and faces new challenges. The movie features epic battles and a deeper exploration of the characters and their motivations.',
'The Lion King': 'An animated musical movie where a young lion prince must reclaim his throne from his treacherous uncle. The movie features memorable songs and breathtaking animation that brings the African savannah to life.',
'The Matrix': 'A science fiction action movie where a hacker discovers the truth about reality and leads a rebellion against intelligent machines. The movie features groundbreaking special effects and a thought-provoking exploration of reality and identity.',
'Fight Club': 'A psychological drama movie where an insomniac office worker forms a secret club that evolves into a violent anarchist movement. The movie features an unreliable narrator and a subversive critique of consumer culture.',
'The Green Mile': 'A supernatural drama movie where a prison guard discovers that an inmate on death row has miraculous healing powers. The movie explores themes of justice, morality, and redemption with powerful performances from the cast.',
'Forrest Gump': 'A comedy-drama movie that follows the life of a simple man who unwittingly becomes part of some of the defining moments of the 20th century. The movie features a heartwarming message about the power of kindness and perseverance.',
'Se7en': 'A crime thriller movie where two detectives track down a serial killer who uses the seven deadly sins as his inspiration. The movie is known for its gritty atmosphere and suspenseful storytelling.',
 "Schindler's List": 'A poignant historical drama that depicts the heroic acts of a German businessman who risks everything to save the lives of Jewish refugees during the Holocaust.',
"The Shawshank Redemption": 'A gripping prison drama that portrays the unbreakable bond between two inmates, as they navigate the harsh realities of life behind bars.',
"The Empire Strikes Back": 'A thrilling space epic that continues the Star Wars saga, as the rebels face new challenges and the Force is further explored.',
"The Silence of the Lambs": 'A chilling psychological thriller that delves into the mind of a cannibalistic serial killer, as an FBI agent races against time to catch another killer on the loose.',
"Back to the Future": 'A time-traveling adventure-comedy that is full of action, humor, and heart, as a teenager tries to fix the past and secure his future.',
"千と千尋の神隠し": 'A captivating and visually stunning animated film that takes the audience on a magical journey through a mystical world filled with strange creatures and enigmatic spirits.',
"The Imitation Game": 'A gripping historical drama that pays tribute to a brilliant mathematician and codebreaker who played a pivotal role in saving countless lives during World War II.',
"Chiamatemi Francesco - Il Papa della gente": 'An inspiring biographical film that tells the story of a man who dedicated his life to serving others and became a beacon of hope and compassion for millions around the world.',
"The Godfather: Part II": 'A complex and riveting crime drama that explores the rise of a powerful mafia family and the struggles of its heirs to maintain control and protect their loved ones.',
"Star Wars": 'An iconic and imaginative sci-fi adventure that takes the audience on a thrilling ride through a galaxy far, far away, as a group of rebels fight to overthrow an oppressive regime.',
"Pulp Fiction": 'A groundbreaking crime film that blends humor, violence, and pop culture in a way that defies expectations and challenges conventions.',
"The Godfather": 'A masterful crime drama that explores the dark side of power and loyalty, as a patriarch of a powerful mafia family struggles to maintain his authority and protect his loved ones.',
"Whiplash": 'A powerful and intense drama that delves into the obsessive world of music, as a young drummer is pushed to his limits by a demanding and abusive instructor.',
"To Be Frank, Sinatra at 100": 'A fascinating documentary that celebrates the life and legacy of one of the greatest entertainers of all time, as friends, family, and fellow musicians reflect on his impact and influence.',
"Food Chains": 'A thought-provoking documentary that sheds light on the harsh realities of farm labor in America, as workers fight for their rights and fair treatment in a system that often exploits them.'
}

In [None]:
#Filling in empty overview values using corresponding values from dict1 into df2:

# The titles of movies without an overview were collected from 'original_title',
# and dict1 contains original-language titles, so look each movie up by
# 'original_title' first and fall back to 'title'. Matching on 'title' alone
# never finds the non-English entries.
manual_overviews = df2['original_title'].map(dict1).fillna(df2['title'].map(dict1))

# fillna only touches rows whose overview is missing; overviews that already
# exist are left untouched.
df2['overview'] = df2['overview'].fillna(manual_overviews)

In [None]:
# Remove df2's 'title' column before df1 and df2 are combined
df2 = df2.drop(columns='title')

In [None]:
# Combine the two dataframes by matching movie ids (df1.movie_id == df2.id).
# A positional side-by-side concat pairs row N of df1 with row N of df2, which
# silently attaches the wrong credits if the two files are not in identical order.
# The merge result carries a fresh 0..n-1 index.
df = df1.merge(df2, left_on='movie_id', right_on='id')

In [None]:
df.head()

In [None]:
# Get a list of all columns in the dataframe
columns = df.columns.tolist()
columns

In [None]:
# Dropping unnecessary columns from the dataframe
unneeded_columns = [
    'movie_id', 'cast', 'crew', 'genres', 'popularity', 'tagline', 'runtime',
    'release_date', 'production_companies', 'production_countries', 'homepage',
    'id', 'keywords', 'original_language', 'original_title', 'revenue',
    'spoken_languages', 'status',
]
df = df.drop(columns=unneeded_columns)

In [None]:
df.head()

In [None]:
# Per column: number of NaN cells, and number of cells that are '' once NaN is
# replaced by '' (so the second figure counts NaN cells as well)
nan_counts = df.isna().sum()
empty_counts = df.fillna('').eq('').sum()

# Print the results
print("NaN counts:\n", nan_counts)
print("\nEmpty counts:\n", empty_counts)

In [None]:
print(df.dtypes)

In [None]:
# define a function to convert list values to string
def list_to_string(val):
    """Convert a cell value to text for the combined 'text' column.

    - list: items joined with ', ' (each item converted with str()).
    - None or float NaN: '' -- otherwise str() would turn missing values into
      the literal words "None" / "nan", which then act as shared tokens between
      unrelated movies.
    - anything else: str(val).
    """
    if isinstance(val, list):
        return ', '.join(str(item) for item in val)
    # val != val is true only for NaN
    if val is None or (isinstance(val, float) and val != val):
        return ''
    return str(val)

# Build the "text" column: each listed field is converted with list_to_string
# and the pieces are joined with single spaces, in this order
text_fields = [
    'director',
    'title',
    'cast_names',
    'overview',
    'producer',
    'screenplay_writer',
    'genre_list',
    'keywords_unpacked',
]
df['text'] = df.apply(
    lambda row: ' '.join(list_to_string(row[field]) for field in text_fields),
    axis=1,
)

In [None]:
df['text'][1]

In [None]:
# Weighted rating: blend each movie's own average vote with the mean over all movies
voters = df['vote_count']
avg_votes = df['vote_average']
C = avg_votes.mean()           # mean of vote_average over all rows
m = voters.quantile(0.90)      # 90th percentile of vote_count

own_share = voters / (voters + m)       # weight given to the movie's own average
global_share = m / (m + voters)         # weight given to the overall mean C
weighted_rating = (own_share * avg_votes) + (global_share * C)

# Store the result as the 'weighted_rating' column of df
df['weighted_rating'] = weighted_rating

### DATA PREPROCESSING

In [None]:
# Create function to clean text
def clean_text(text):
    """Strip everything except ASCII letters, digits and whitespace from `text`.

    Runs of whitespace are collapsed to one space and the ends are trimmed.
    Any value that is not a str yields "".
    """
    if not isinstance(text, str):
        return ""
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Clean the combined 'text' column and store the result in 'text_clean'
df['text_clean'] = df['text'].apply(clean_text)
df.head()

In [None]:
# Create function to tokenize data (lowercasing is done by the caller)
def tokenize(text):
    """Split `text` into word tokens and return them as a list.

    Empty strings produced by leading/trailing separators (or by an empty
    input) are dropped, so the result never contains '' tokens.
    """
    # \W+ matches one or more NON-word characters, i.e. anything other than
    # letters, digits and underscore; the text is split on those runs.
    # The raw string keeps the backslash from being read as a string escape.
    return [token for token in re.split(r'\W+', text) if token]

# Lowercase each cleaned text, tokenize it, and keep the tokens in 'text_tokenized'
df['text_tokenized'] = df['text_clean'].apply(lambda cleaned: tokenize(cleaned.lower()))
df.head()

In [None]:
# Define stop words list
stopwords = nltk.corpus.stopwords.words('english')     # All English Stopwords

# Private set copy used for the actual filtering:
#  - membership tests on a set do not scan the whole list for every word;
#  - the filter keeps working even if the name `stopwords` is rebound later
#    in the notebook (e.g. by `from nltk.corpus import stopwords`).
_stopword_set = set(stopwords)

# Create function to remove stopwords
def remove_stopwords(tokenized_list):
    """Return the tokens of `tokenized_list` that are not English stop words."""
    return [word for word in tokenized_list if word not in _stopword_set]

# Remove stop words from data
df['text_nostop'] = df['text_tokenized'].apply(remove_stopwords)
df.head()

In [None]:
wn = nltk.WordNetLemmatizer()

# Create function to apply lemmatizer
def lemmatizing(tokenized_text):
    """Return a list holding wn.lemmatize(item) for each item of `tokenized_text`."""
    return list(map(wn.lemmatize, tokenized_text))

# Lemmatize the stop-word-free tokens into 'text_lemmatized'
df['text_lemmatized'] = df['text_nostop'].apply(lemmatizing)
df.head(10)

In [None]:
#Vectorization
# Every document in 'text_lemmatized' is already a list of tokens, and a callable
# analyzer must return the sequence of features for one document. Returning the
# token list unchanged makes each word a feature; joining the tokens into one
# string would make the vectorizer iterate that string character by character
# and build a vocabulary of single characters.
tfidf_vect = TfidfVectorizer(analyzer=lambda tokens: tokens)
tfidf_counts = tfidf_vect.fit_transform(df['text_lemmatized'])

In [None]:
# Pairwise cosine similarity of every tf-idf row against every tf-idf row
cosine_sim = cosine_similarity(X=tfidf_counts, Y=tfidf_counts)

# Show the resulting similarity matrix
print(cosine_sim)

#Create a dataframe that contains only the title and text_lemmatized columns from the original dataframe

In [None]:
df_movies = df[['title', 'text_lemmatized']]

#Define a function that takes a movie title as input, and returns the top 5 most similar movies based on cosine similarity.

In [None]:
def get_movie_recomendation(title):
    """Return the titles of the 5 movies most similar to `title`.

    Similarity is the cosine similarity between the tf-idf row of the first
    movie in `df_movies` whose 'title' equals `title` and every row of
    `tfidf_counts`. The result is ordered from most to least similar and
    never contains the query movie itself.

    Raises:
        IndexError: if no movie in `df_movies` has that title.
    """
    # Row *positions* (not index labels): tfidf_counts is addressed by position.
    matches = np.flatnonzero((df_movies['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    position = matches[0]

    # Cosine similarity between the input movie and all movies
    scores = cosine_similarity(tfidf_counts[position], tfidf_counts).ravel()

    # Rank by descending similarity and drop the query explicitly, rather than
    # assuming it is always the single best match.
    ranked = scores.argsort()[::-1]
    top_positions = [candidate for candidate in ranked if candidate != position][:5]

    return df_movies.iloc[top_positions]['title'].tolist()

In [None]:
get_movie_recomendation ('The Dark Knight')

In [None]:
get_movie_recomendation ('The Shawshank Redemption')

In [None]:
get_movie_recomendation ('Frozen')

In [None]:
## Trying a different method

In [None]:
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = pd.read_csv('tmdb_5000_movies.csv')

In [None]:
credits.columns

In [None]:
# Function to extract the 'name' values from a stringified list of dicts
def get_names(lst):
    """Parse `lst` with ast.literal_eval and return each entry's 'name' value, in order."""
    return [entry['name'] for entry in ast.literal_eval(lst)]

In [None]:
# Extract needed values: new column -> source column it is parsed from
name_columns = {
    'genres_names': 'genres',
    'keywords_names': 'keywords',
    'prod_companies_names': 'production_companies',
    'prod_countries_names': 'production_countries',
    'spoken_lang_names': 'spoken_languages',
}

for target_column, source_column in name_columns.items():
    movies[target_column] = movies[source_column].apply(get_names)

In [None]:
credits.head(2)

In [None]:
# Turn stringified cast entries into Python objects; non-string values pass through unchanged
def _parse_cast_cell(raw):
    return ast.literal_eval(raw) if isinstance(raw, str) else raw

credits["cast"] = credits["cast"].apply(_parse_cast_cell)

# 'cast_names': the "name" of every cast entry, in order
credits["cast_names"] = credits["cast"].apply(lambda members: [member["name"] for member in members])

In [None]:
# Parse the stringified 'crew' column into Python objects (non-strings pass through unchanged)
credits["crew"] = credits["crew"].apply(lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw)


def _crew_names(crew, job):
    """Return the "name" of every crew entry whose "job" equals `job`, in order."""
    return [member["name"] for member in crew if member["job"] == job]


# Director: first matching name, or None when there is none; producer: all matching names
credits["director"] = credits["crew"].apply(lambda crew: next(iter(_crew_names(crew, "Director")), None))
credits["producer"] = credits["crew"].apply(lambda crew: _crew_names(crew, "Producer"))

In [None]:
# Drop the raw columns whose names have already been extracted (plus 'homepage' and 'original_title')

credits1 = credits.drop(columns=['cast', 'crew'])
movies1 = movies.drop(columns=[
    'genres',
    'keywords',
    'production_companies',
    'production_countries',
    'spoken_languages',
    'homepage',
    'original_title',
])

In [None]:
# Merge the two dataframes on the movie id (credits1.movie_id == movies1.id)
new_df = pd.merge(credits1, movies1, left_on='movie_id', right_on='id')

# After the merge 'id' duplicates 'movie_id', so drop it from the merged frame.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
new_df = new_df.drop(columns='id')

In [None]:
# Define a function to remove repetitive words
def remove_repetitive_words(name):
    """Drop generic company words from `name` and return the rest joined by single spaces.

    Matching is exact and case-sensitive, on whitespace-separated words.
    """
    # Words to strip out
    words_to_remove = ['Pictures', 'Films', 'Entertainment', 'Productions', 'Studios', 'Company', 'Media', 'films', 'entertainment', 'film']

    kept = []
    for piece in name.split():
        if piece in words_to_remove:
            continue
        kept.append(piece)

    return ' '.join(kept)

# Clean every company name in each row's list
new_df['prod_companies_names'] = new_df['prod_companies_names'].apply(
    lambda names: list(map(remove_repetitive_words, names))
)

In [None]:
# Flatten the per-movie language lists into one list
lang = []
for lans in new_df['spoken_lang_names']:
    lang.extend(lans)

# Reduce to the distinct values and show them
unique_lang = set(lang)
print(unique_lang)

In [None]:
# Remove spaces inside every name so that each name becomes a single token.
# List-valued columns are handled item by item; 'director' holds plain strings.
list_columns = [
    'cast_names',
    'producer',
    'prod_countries_names',
    'prod_companies_names',
    'spoken_lang_names',
]
for list_column in list_columns:
    new_df[list_column] = new_df[list_column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

new_df['director'] = new_df['director'].str.replace(" ", "")

In [None]:
new_df.head(2)

In [None]:
# define a function to convert list values to string
def list_to_string(val):
    """Convert a cell value to text for the combined 'text' column.

    - list: items joined with ', ' (each item converted with str()).
    - None or float NaN: '' -- otherwise str() would turn missing values into
      the literal words "None" / "nan", which then act as shared tokens between
      unrelated movies.
    - anything else: str(val).
    """
    if isinstance(val, list):
        return ', '.join(str(item) for item in val)
    # val != val is true only for NaN
    if val is None or (isinstance(val, float) and val != val):
        return ''
    return str(val)

# Build the "text" column: each listed field is converted with list_to_string
# and the pieces are joined with single spaces, in this order
text_fields = [
    'director',
    'title_x',
    'cast_names',
    'overview',
    'producer',
    'genres_names',
    'keywords_names',
]
new_df['text'] = new_df.apply(
    lambda row: ' '.join(list_to_string(row[field]) for field in text_fields),
    axis=1,
)

In [None]:
new_df.head(2)

In [None]:
# Remove the columns that are no longer needed
redundant_columns = [
    'cast_names', 'director', 'producer', 'status', 'title_y', 'movie_id',
    'prod_countries_names', 'prod_companies_names', 'spoken_lang_names',
]
new_df = new_df.drop(columns=redundant_columns)

new_df.head(2)

In [None]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# define lemmatizer
lemmatizer = WordNetLemmatizer()

# Load the English stop words once. Calling stopwords.words('english') inside
# the per-word filter rebuilds the whole list for every single word.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

# function to tokenize, remove stop words and lemmatize text
def preprocess_text(text):
    """Lowercase, tokenize, drop English stop words and lemmatize `text`.

    Returns the processed words joined by single spaces. Any value that is
    not a str (None, NaN, numbers) yields "".
    """
    if not isinstance(text, str):
        return ""
    # tokenize the lowercased text into words
    words = word_tokenize(text.lower())
    # drop stop words, lemmatize what is left, and join back into one string
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in _ENGLISH_STOP_WORDS
    )

# Run preprocess_text over 'tagline' and 'overview' (overwriting them) and store
# the preprocessed 'title_x' in 'title_y'
preprocess_plan = [
    ('tagline', 'tagline'),
    ('overview', 'overview'),
    ('title_y', 'title_x'),
]
for target_column, source_column in preprocess_plan:
    new_df[target_column] = new_df[source_column].apply(preprocess_text)

In [None]:
# Drop the columns that are not used from here on
no_longer_needed = [
    'genres_names',
    'keywords_names',
    'overview',
    'tagline',
    'original_language',
    'title_y',
]
new_df.drop(columns=no_longer_needed, inplace=True)

In [None]:
new_df.head(2)

In [None]:
# Combine vote_average and vote_count into a single 'rating' score (their product)
new_df['rating'] = new_df['vote_average'].mul(new_df['vote_count'])

# Split 'rating' into four quantile bins and one-hot encode the bin labels
quartile_labels = ['bad', 'okay', 'good', 'great']
bins = pd.qcut(new_df['rating'], q=4, labels=quartile_labels)
dummies = pd.get_dummies(bins, prefix='rating')

# Append the indicator columns to new_df
new_df = pd.concat([new_df, dummies], axis=1)

# The two source columns are no longer needed
new_df = new_df.drop(columns=['vote_count', 'vote_average'])

In [None]:
#categorizing runtime
import matplotlib.pyplot as plt
plt.hist(new_df['runtime'], bins=10)
plt.show()

In [None]:
import numpy as np

# Create a function to categorize the runtime
def categorize_runtime(runtime):
    """Map a runtime to 'Short' (< 90), 'Medium' (90-150) or 'Long' (> 150).

    A missing runtime (None or NaN) returns 'Unknown'. Without this check every
    comparison against NaN is False, so missing values would fall through to
    the final branch and be labelled 'Long'.
    """
    # runtime != runtime is true only for NaN
    if runtime is None or runtime != runtime:
        return 'Unknown'
    if runtime < 90:
        return 'Short'
    elif runtime <= 150:
        return 'Medium'
    else:
        return 'Long'

# Apply the categorize_runtime function to create a new column
new_df['runtime_cat'] = np.vectorize(categorize_runtime)(new_df['runtime'])

In [None]:
new_df['popularity'].describe()

In [None]:
new_df['budget'].describe()

In [None]:
# categorize 'popularity' into 3 bins using pandas qcut function
new_df['popularity_bins'] = pd.qcut(new_df['popularity'], q=3, labels=['low', 'medium', 'high'])

In [None]:
# Define the bin edges
bin_edges = [0, 50000000, 150000000, float('inf')]

# Define the bin labels
bin_labels = ['Low', 'Medium', 'High']

# Create a new column with the budget category.
# pd.cut intervals are open on the left by default, so a budget of exactly 0
# (the lowest edge) would fall outside every bin and become NaN.
# include_lowest=True closes the first interval on the left so that 0 is 'Low'.
new_df['budget_bins'] = pd.cut(
    new_df['budget'], bins=bin_edges, labels=bin_labels, include_lowest=True
)

# Drop the original 'budget', 'popularity' and 'runtime' columns
new_df = new_df.drop(columns=['budget', 'popularity', 'runtime'])

In [None]:
# One-hot encode the two binned columns
popularity_dummies = pd.get_dummies(new_df['popularity_bins'], prefix='popularity')
budget_dummies = pd.get_dummies(new_df['budget_bins'], prefix='budget')

# Append both sets of indicator columns to new_df
frames_to_join = [new_df, popularity_dummies, budget_dummies]
new_df = pd.concat(frames_to_join, axis='columns')

# drop the original categorical columns
#new_df.drop(['popularity_bins', 'budget_bins'], axis=1, inplace=True)

In [None]:
# Parse the release_date values into datetimes
new_df['release_date'] = pd.to_datetime(new_df['release_date'])

# Cut the dates into 3 bins, labelled from oldest to newest
period_labels = ['early', 'mid', 'late']
new_df['release_date_cat'] = pd.cut(new_df['release_date'], bins=3, labels=period_labels)

# One-hot encode the period labels
release_date_cat_encoded = pd.get_dummies(new_df['release_date_cat'], prefix='release_date_cat')

# Append the indicator columns to new_df
new_df = pd.concat([new_df, release_date_cat_encoded], axis='columns')

# Drop original release_date and release_date_cat columns
#new_df = new_df.drop(['release_date', 'release_date_cat'], axis=1)

In [None]:
# 25th and 75th percentile values of revenue
q1 = new_df['revenue'].quantile(0.25)
q3 = new_df['revenue'].quantile(0.75)

In [None]:
# Categorize revenue into 3 bins split at the q1 and q3 values
revenue_edges = [-np.inf, q1, q3, np.inf]
revenue_labels = ['Low', 'Medium', 'High']
new_df['revenue_cat'] = pd.cut(new_df['revenue'], bins=revenue_edges, labels=revenue_labels)

# One-hot encode the revenue categories and append them to new_df
revenue_dummies = pd.get_dummies(new_df['revenue_cat'], prefix='revenue')
new_df = pd.concat([new_df, revenue_dummies], axis=1)

# Drop the original 'revenue' and 'revenue_cat' columns
#new_df.drop(['revenue', 'revenue_cat'], axis=1, inplace=True)

In [None]:
new_df.head(2)

In [None]:
wn = nltk.WordNetLemmatizer()

# Create function to apply lematizer
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

# Apply lemmatizer
new_df['text'] = new_df['text'].apply(lambda x: lemmatizing(x))

In [None]:
#Vectorization
tfidf_vect = TfidfVectorizer(analyzer=lambda x: ' '.join([word for word in x]))
tfidf_counts = tfidf_vect.fit_transform(new_df['text'])

In [None]:
# Pairwise cosine similarity of every tf-idf row against every tf-idf row
cosine_sim = cosine_similarity(X=tfidf_counts, Y=tfidf_counts)

# Show the resulting similarity matrix
print(cosine_sim)

In [None]:
# Create a dataframe that contains only the 'title_x' and 'text' columns of new_df.
# .copy() makes df_movies an independent frame, so adding a column below does not
# write into a slice of new_df (which triggers pandas' SettingWithCopyWarning).
df_movies = new_df[['title_x', 'text']].copy()
# Expose the title under the plain name 'title' as well
df_movies['title'] = df_movies['title_x']

In [None]:
def get_movie_recomendation(title):
    """Return the titles of the 5 movies most similar to `title`.

    Similarity is the cosine similarity between the tf-idf row of the first
    movie in `df_movies` whose 'title' equals `title` and every row of
    `tfidf_counts`. The result is ordered from most to least similar and
    never contains the query movie itself.

    Raises:
        IndexError: if no movie in `df_movies` has that title.
    """
    # Row *positions* (not index labels): tfidf_counts is addressed by position.
    matches = np.flatnonzero((df_movies['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    position = matches[0]

    # Cosine similarity between the input movie and all movies
    scores = cosine_similarity(tfidf_counts[position], tfidf_counts).ravel()

    # Rank by descending similarity and drop the query explicitly, rather than
    # assuming it is always the single best match.
    ranked = scores.argsort()[::-1]
    top_positions = [candidate for candidate in ranked if candidate != position][:5]

    return df_movies.iloc[top_positions]['title'].tolist()

In [None]:
new_df_knn = new_df[['budget_bins', 'popularity_bins', 'revenue_cat', 'release_date_cat', 'text']]
print(new_df_knn)

In [None]:
get_movie_recomendation ('The Dark Knight')

In [None]:
get_movie_recomendation ('The Shawshank Redemption')

In [None]:
get_movie_recomendation ('Frozen')