In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_excel("E:\\CSD201\\movies.xlsx")

In [3]:
df.head()

Unnamed: 0,NAME,YEAR,GENRE,RATING,VOTES,RunTime,Gross
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,21062.0,121.0,401834
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,17870.0,25.0,$17 million
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,885805.0,44.0,6014340
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,414849.0,23.0,
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,,,20000000


In [4]:
df.columns = df.columns.str.upper()

In [5]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [6]:
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

Missing values:
 NAME          0
YEAR        435
GENRE        70
RATING      897
VOTES       897
RUNTIME    1333
GROSS      5947
dtype: int64


In [7]:
df.drop('GROSS',axis=1,inplace=True)
df.head()


Unnamed: 0,NAME,YEAR,GENRE,RATING,VOTES,RUNTIME
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,21062.0,121.0
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,17870.0,25.0
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,885805.0,44.0
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,414849.0,23.0
4,Army of Thieves,(2021),"\nAction, Crime, Horror",,,


In [8]:
numerical_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(exclude='number').columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())
for col in categorical_cols:
    df[col] = df[col].astype(str)
    df[col].fillna(df[col].mode().iloc[0], inplace=True)

In [9]:
df.head()

Unnamed: 0,NAME,YEAR,GENRE,RATING,VOTES,RUNTIME
0,Blood Red Sky,(2021),"\nAction, Horror, Thriller",6.1,21062.0,121.0
1,Masters of the Universe: Revelation,(2021– ),"\nAnimation, Action, Adventure",5.0,17870.0,25.0
2,The Walking Dead,(2010–2022),"\nDrama, Horror, Thriller",8.2,885805.0,44.0
3,Rick and Morty,(2013– ),"\nAnimation, Adventure, Comedy",9.2,414849.0,23.0
4,Army of Thieves,(2021),"\nAction, Crime, Horror",6.60362,21737.63011,79.905022


In [10]:
print(df.columns)


Index(['NAME', 'YEAR', 'GENRE', 'RATING', 'VOTES', 'RUNTIME'], dtype='object')


In [11]:
df['GENRE'] = df['GENRE'].str.strip().str.split(',')


In [12]:
print(df['GENRE'])

0                 [Action,  Horror,  Thriller]
1             [Animation,  Action,  Adventure]
2                  [Drama,  Horror,  Thriller]
3             [Animation,  Adventure,  Comedy]
4                    [Action,  Crime,  Horror]
5                     [Action,  Crime,  Drama]
6                            [Drama,  Romance]
7                    [Crime,  Drama,  Mystery]
8                                     [Comedy]
9                            [Drama,  Romance]
10             [Action,  Adventure,  Thriller]
11                   [Crime,  Drama,  Fantasy]
12                  [Drama,  Horror,  Mystery]
13                  [Comedy,  Drama,  Romance]
14                 [Drama,  Horror,  Thriller]
15                           [Drama,  Romance]
16                  [Crime,  Drama,  Thriller]
17                                     [Drama]
18                            [Comedy,  Drama]
19                  [Drama,  Fantasy,  Horror]
20                  [Drama,  Horror,  Mystery]
21           

In [13]:
unique_genres = set()
for genres in df['GENRE']:
    unique_genres.update(genres)
for genre in unique_genres:
    df[genre.strip()] = False
for index, row in df.iterrows():
    for genre in row['GENRE']:
        df.at[index, genre.strip()] = True
df.drop(columns=['GENRE'], inplace=True)
print(df.columns)

Index(['NAME', 'YEAR', 'RATING', 'VOTES', 'RUNTIME', 'Crime', 'Thriller',
       'Drama', 'Sport', 'Mystery', 'Family', 'News', 'History', 'Biography',
       'Comedy', 'Game-Show', 'Documentary', 'Action', 'Reality-TV', 'War',
       'Animation', 'Fantasy', 'Western', 'Horror', 'Music', 'Adventure',
       'Film-Noir', 'Romance', 'Talk-Show', 'Short', 'Musical', 'Sci-Fi',
       'nan'],
      dtype='object')


In [14]:
df.head()

Unnamed: 0,NAME,YEAR,RATING,VOTES,RUNTIME,Crime,Thriller,Drama,Sport,Mystery,Family,News,History,Biography,Comedy,Game-Show,Documentary,Action,Reality-TV,War,Animation,Fantasy,Western,Horror,Music,Adventure,Film-Noir,Romance,Talk-Show,Short,Musical,Sci-Fi,nan
0,Blood Red Sky,(2021),6.1,21062.0,121.0,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,Masters of the Universe: Revelation,(2021– ),5.0,17870.0,25.0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False
2,The Walking Dead,(2010–2022),8.2,885805.0,44.0,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,Rick and Morty,(2013– ),9.2,414849.0,23.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False
4,Army of Thieves,(2021),6.60362,21737.63011,79.905022,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [21]:
def recommend_movies(df, input_genre):
  genres_df = df.copy()
  genres_df['GENRE'] = genres_df.apply(lambda row: '|'.join(row[row == True].index.tolist()), axis=1)
  vectorizer = TfidfVectorizer(stop_words='english') 
  genres_tfidf = vectorizer.fit_transform(genres_df['GENRE'])
  similarity_scores = cosine_similarity(genres_tfidf, genres_tfidf)
  input_genre_index = vectorizer.vocabulary_.get(input_genre.lower())
  if input_genre_index is None:
      input_genre_index = vectorizer.vocabulary_.get(input_genre.upper())
  if input_genre_index is None:
      print(f"Genre '{input_genre}' not found in data.")
      return []  
  similar_movies_indices = similarity_scores[:, input_genre_index].argsort()[::-1][1:6]
  recommended_movies = df.iloc[similar_movies_indices]['NAME'].tolist()
  return recommended_movies
input_genre = input("Enter a genre: ")
recommendations = recommend_movies(df.copy(), input_genre)
print("Recommended movies:")
for i, movie in enumerate(recommendations):
  print(f"{i+1}. {movie}")


Enter a genre:  Thriller


Recommended movies:
1. Hotel Del Luna
2. Ghost Wars
3. Tiempos de guerra
4. Salem
5. Thelma


In [33]:
df.head()

Unnamed: 0,NAME,YEAR,RATING,VOTES,RUNTIME,Crime,Thriller,Drama,Sport,Mystery,Family,News,History,Biography,Comedy,Game-Show,Documentary,Action,Reality-TV,War,Animation,Fantasy,Western,Horror,Music,Adventure,Film-Noir,Romance,Talk-Show,Short,Musical,Sci-Fi,nan
0,Blood Red Sky,(2021),6.1,21062.0,121.0,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,Masters of the Universe: Revelation,(2021– ),5.0,17870.0,25.0,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False
2,The Walking Dead,(2010–2022),8.2,885805.0,44.0,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,Rick and Morty,(2013– ),9.2,414849.0,23.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False
4,Army of Thieves,(2021),6.60362,21737.63011,79.905022,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False


In [41]:

def recommend_similar_movies(df, input_movie):

  input_movie_lower = input_movie.lower()
  if not df['NAME'].str.lower().isin([input_movie_lower]).any():
      print(f"Movie '{input_movie}' not found in data.")
      return []
  input_movie_data = df[df['NAME'].str.lower() == input_movie_lower]
  input_movie_rating = input_movie_data.iloc[0]['RATING']
  genre_columns = [col for col in df.columns if col in ['Crime', 'Thriller', 'Drama', 'etc.']] 
  input_movie_genre = input_movie_data[genre_columns].values.ravel() 
  genres_df = df[genre_columns].astype(float)  
  similarity_scores = cosine_similarity([input_movie_genre], genres_df)
  movie_indices = similarity_scores.argsort()[0][::-1][1:6]  
  recommended_movies = df.iloc[movie_indices]['NAME'].tolist()
  return recommended_movies
input_movie = input("Enter a movie name: ")
recommendations = recommend_similar_movies(df.copy(), input_movie)
print("Recommended similar movies:")
for i, movie in enumerate(recommendations):
  print(f"{i+1}. {movie}")


Enter a movie name:  The walking dead


Recommended similar movies:
1. Revenge
2. Bir Baskadir
3. Live
4. The Invitation
5. Lobo Feroz
