## Popularity Based Recommendation System

### Top 5 movies in each genre

In [2]:
import itertools
import warnings

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [33]:
# Column layout of 'u.item': five descriptive fields followed by the genre indicator columns
i_cols = [
    "movie id", "movie title", "release date", "video release date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Load the movie catalogue from the 'ml-100k' directory.
# The file is pipe-delimited and latin-1 encoded; because 'names' is supplied,
# every line is read as data and the columns are labelled from i_cols.
items = pd.read_csv("ml-100k/u.item", sep="|", names=i_cols, encoding="latin-1")


In [34]:
# Build 'movies_df' from 'items' with 'movie id' renamed to 'movie_id', so the key
# column has no space and matches the 'movie_id' column of the ratings data.
# rename() returns a new DataFrame, so 'items' keeps its original column names
# (a plain `movies_df = items` would only alias the same object).
movies_df = items.rename(columns={'movie id': 'movie_id'})

# Display the renamed DataFrame
movies_df


Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Column labels for 'u.data' (supplied here because the read passes 'names')
r_cols = ["user_id", "movie_id", "rating", "unix_timestamp"]

# Load the ratings from the 'ml-100k' directory:
# tab-delimited, latin-1 encoded, columns labelled from r_cols
ratings_df = pd.read_csv("ml-100k/u.data", sep="\t", names=r_cols, encoding="latin-1")

In [36]:
# Preview the ratings table (one row per user/movie rating)
ratings_df

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


## Combine both datasets on the common 'movie_id' column

In [37]:
# Attach the movie details to every rating by joining the two tables on 'movie_id'.
# This is the default inner join: a rating is kept only if its movie_id exists in movies_df.
merged_df = ratings_df.merge(movies_df, on='movie_id')

In [11]:
# List the columns of the merged ratings + movie-details table
merged_df.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp', 'movie title',
       'release date', 'video release date', 'IMDb URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [12]:
# Preview the merged table: each rating row now carries the movie's details
merged_df

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,880,476,3,880175444,"First Wives Club, The (1996)",14-Sep-1996,,http://us.imdb.com/M/title-exact?First%20Wives...,0,0,...,0,0,0,0,0,0,0,0,0,0
99996,716,204,5,879795543,Back to the Future (1985),01-Jan-1985,,http://us.imdb.com/M/title-exact?Back%20to%20t...,0,0,...,0,0,0,0,0,0,1,0,0,0
99997,276,1090,1,874795795,Sliver (1993),01-Jan-1993,,http://us.imdb.com/M/title-exact?Sliver%20(1993),0,0,...,0,0,0,0,0,0,0,1,0,0
99998,13,225,2,882399156,101 Dalmatians (1996),27-Nov-1996,,http://us.imdb.com/M/title-exact?101%20Dalmati...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Get the dimensions of the 'merged_df' DataFrame as a (rows, columns) tuple.
# With the inner join above, the row count equals the number of ratings only if
# every rating matched exactly one movie (assumes movie_id is unique in movies_df).
merged_df.shape

(100000, 27)

## Average rating for each movie

In [39]:
# Mean rating per movie: one row per 'movie_id', with the mean of its 'rating'
# values stored under the column name 'Average Rating'
movies_average_rating = (
    merged_df
    .groupby('movie_id', as_index=False)['rating']
    .mean()
    .rename(columns={'rating': 'Average Rating'})
)

# Show the per-movie averages
movies_average_rating

Unnamed: 0,movie_id,Average Rating
0,1,3.878319
1,2,3.206107
2,3,3.033333
3,4,3.550239
4,5,3.302326
...,...,...
1677,1678,1.000000
1678,1679,3.000000
1679,1680,2.000000
1680,1681,3.000000


## Add the average ratings to the merged data

In [40]:
# Join the per-movie 'Average Rating' onto every row of 'merged_df' via 'movie_id'
movies_with_average_rating = merged_df.merge(movies_average_rating, on='movie_id')

In [41]:
# Display the column names of the 'movies_with_average_rating' DataFrame
# ('Average Rating' should now appear after the columns inherited from 'merged_df')
movies_with_average_rating.columns

Index(['user_id', 'movie_id', 'rating', 'unix_timestamp', 'movie title',
       'release date', 'video release date', 'IMDb URL', 'unknown', 'Action',
       'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
       'Average Rating'],
      dtype='object')

In [42]:
# Get the dimensions (rows, columns) of the 'movies_with_average_rating' DataFrame.
# At this point there is still one row per rating, not one row per movie.
movies_with_average_rating.shape

(100000, 28)

## Keep one row per movie by dropping rows with a duplicate 'movie_id'

In [43]:
# Keep only the first row seen for each 'movie_id', leaving one row per movie
movies_with_average_rating = movies_with_average_rating[
    ~movies_with_average_rating.duplicated(subset='movie_id')
]

In [19]:
# Check the shape after de-duplication: the row count is now the number of distinct movies
movies_with_average_rating.shape

(1682, 28)

## Genres

In [44]:
# Genre column names, wrapped in an outer list (a list containing one list of names).
# None of the names contains a space, so they can be split from a single string.
movie_genre = [
    (
        "Action Adventure Animation Children's Comedy Crime "
        "Documentary Drama Fantasy Film-Noir Horror Musical "
        "Mystery Romance Sci-Fi Thriller War Western"
    ).split()
]
movie_genre


[['Action',
  'Adventure',
  'Animation',
  "Children's",
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Fantasy',
  'Film-Noir',
  'Horror',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Thriller',
  'War',
  'Western']]

## Flatten the List

In [46]:
# Flatten the nested genre list into a plain list of genre names.
# chain.from_iterable() is required: itertools.chain(movie_genre) would treat the
# outer list as its only iterable and hand the inner list back unflattened.
# Entries that are already plain strings are wrapped so they are kept whole
# instead of being split into characters.
# The result gets its own name: 'movie_genre' itself is left untouched, so cells
# further down that flatten it again keep working and this cell can be re-run safely.
movie_genre_flat = list(itertools.chain.from_iterable(
    [entry] if isinstance(entry, str) else entry
    for entry in movie_genre
))
movie_genre_flat

['Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

## Check a single genre first: Action

In [47]:
# Keep only the movies whose 'Action' indicator equals 1
genre_movies = movies_with_average_rating.loc[
    movies_with_average_rating['Action'].eq(1)
]

In [48]:
# Preview the Action subset (every row should show Action == 1)
genre_movies

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,movie title,release date,video release date,IMDb URL,unknown,Action,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Average Rating
6,115,265,2,881171488,"Hunt for Red October, The (1990)",01-Jan-1990,,http://us.imdb.com/M/title-exact?Hunt+for+Red+...,0,1,...,0,0,0,0,0,0,1,0,0,3.863436
10,62,257,2,879372434,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,0,0,0,0,0,1,0,0,0,3.745875
12,200,222,5,876042340,Star Trek: First Contact (1996),22-Nov-1996,,http://us.imdb.com/M/title-exact?Star%20Trek:%...,0,1,...,0,0,0,0,0,1,0,0,0,3.660274
14,224,29,3,888104457,Batman Forever (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Batman%20Fore...,0,1,...,0,0,0,0,0,0,0,0,0,2.666667
22,299,144,4,877881320,Die Hard (1988),01-Jan-1988,,http://us.imdb.com/M/title-exact?Die%20Hard%20...,0,1,...,0,0,0,0,0,0,1,0,0,3.872428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55436,528,1618,1,888521905,King of New York (1990),01-Jan-1990,,http://us.imdb.com/Title?King+of+New+York+(1990),0,1,...,0,0,0,0,0,0,0,0,0,1.000000
59476,489,1613,4,891449466,Tokyo Fist (1995),11-Feb-1998,,http://us.imdb.com/M/title-exact?Tokyo+Fist+(1...,0,1,...,0,0,0,0,0,0,0,0,0,4.000000
66358,835,1673,3,891034023,Mirage (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Mirage%20(1995),0,1,...,0,0,0,0,0,0,1,0,0,3.000000
69884,787,1433,3,888979181,Men of Means (1998),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,1,...,0,0,0,0,0,0,0,0,0,2.000000


In [49]:
# Rank the Action movies from highest to lowest 'Average Rating' and keep the first five
Top_5_action_movies = genre_movies.sort_values('Average Rating', ascending=False).iloc[:5]

In [50]:
# Trim the top-5 Action table down to the id, title and average-rating columns
Top_5_action_movies = Top_5_action_movies.loc[:, ['movie_id', 'movie title', 'Average Rating']]

## Top_5_action_movies

In [121]:
# Show the five Action movies with the highest average rating
Top_5_action_movies

Unnamed: 0,movie_id,movie title,Average Rating
499,50,Star Wars (1977),4.358491
230,127,"Godfather, The (1972)",4.283293
108,174,Raiders of the Lost Ark (1981),4.252381
1698,313,Titanic (1997),4.245714
191,172,"Empire Strikes Back, The (1980)",4.20436


### Repeat for all genres using a for loop

In [29]:
def top_movies_for_genre(movies, genre, n=5):
    """Return the `n` best-rated movies flagged with `genre`.

    Parameters
    ----------
    movies : pd.DataFrame
        Table with a column named after each genre (a movie belongs to the genre
        when that column equals 1) plus 'movie_id', 'movie title' and
        'Average Rating'.
    genre : str
        Name of the genre column to filter on.
    n : int, default 5
        Number of movies to keep.

    Returns
    -------
    pd.DataFrame
        'movie_id', 'movie title' and 'Average Rating' of the top `n` movies,
        ordered from highest to lowest average rating.
    """
    in_genre = movies[movies[genre] == 1]
    ranked = in_genre.sort_values(by='Average Rating', ascending=False)
    return ranked.head(n)[['movie_id', 'movie title', 'Average Rating']]


# Flatten movie_genre only where it is still nested. Plain strings are kept whole,
# so re-running this cell (or running it when the list is already flat) does not
# split the genre names into single characters.
movie_genre = [
    name
    for entry in movie_genre
    for name in ([entry] if isinstance(entry, str) else entry)
]

# Dictionary holding the top 5 movies for each genre, keyed by genre name.
# NOTE(review): the ranking uses the mean rating only, so a movie with a single
# 5-star rating can outrank widely rated ones — consider a minimum rating count.
Top_movies_by_genre = {
    genre: top_movies_for_genre(movies_with_average_rating, genre)
    for genre in movie_genre
}


In [30]:
# Show the dictionary of per-genre top-5 tables (genre name -> DataFrame)
Top_movies_by_genre

{'Action':       movie_id                      movie title  Average Rating
 499         50                 Star Wars (1977)        4.358491
 230        127            Godfather, The (1972)        4.283293
 108        174   Raiders of the Lost Ark (1981)        4.252381
 1698       313                   Titanic (1997)        4.245714
 191        172  Empire Strikes Back, The (1980)        4.204360,
 'Adventure':        movie_id                      movie title  Average Rating
 23044      1293                  Star Kid (1997)        5.000000
 499          50                 Star Wars (1977)        4.358491
 108         174   Raiders of the Lost Ark (1981)        4.252381
 970         511        Lawrence of Arabia (1962)        4.231214
 191         172  Empire Strikes Back, The (1980)        4.204360,
 'Animation':       movie_id                                        movie title  \
 218        408                              Close Shave, A (1995)   
 372        169                     

## Convert the dictionary to a DataFrame for saving

In [51]:
# Stack the per-genre tables on top of each other into one DataFrame.
# Passing the dict itself makes its keys (the genre names) the outer level of the row index.
Top_movies_by_genre_df = pd.concat(objs=Top_movies_by_genre, axis='index')

In [52]:
# Preview the combined table; the genre name is the outer level of the row index
Top_movies_by_genre_df

Unnamed: 0,Unnamed: 1,movie_id,movie title,Average Rating
Action,499,50,Star Wars (1977),4.358491
Action,230,127,"Godfather, The (1972)",4.283293
Action,108,174,Raiders of the Lost Ark (1981),4.252381
Action,1698,313,Titanic (1997),4.245714
Action,191,172,"Empire Strikes Back, The (1980)",4.204360
...,...,...,...,...
Western,668,661,High Noon (1952),4.102273
Western,855,589,"Wild Bunch, The (1969)",4.023256
Western,1266,435,Butch Cassidy and the Sundance Kid (1969),3.949074
Western,1247,510,"Magnificent Seven, The (1954)",3.942149


## Save the top movies by category to a CSV file

In [53]:
# Save the top movies to a CSV file and print a confirmation message.
# The genre name lives only in the outer level of the row index, so writing with
# index=False alone would drop it and the file would not say which category each
# row belongs to. Promote that level to a regular 'genre' column first; the inner
# level (row labels carried over from the source table) is still left out of the file.
top_movies_to_save = (
    Top_movies_by_genre_df
    .rename_axis(index=['genre', 'source_row'])
    .reset_index(level='genre')
)
top_movies_to_save.to_csv('top_5_movies_by_category.csv', index=False)
print("Top 5 movies by category saved to top_5_movies_by_category.csv")

Top 5 movies by category saved to top_5_movies_by_category.csv
