In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
import re

In [2]:
movies = pd.read_csv("movies.csv", encoding='ISO-8859-1')
ratings = pd.read_csv("rating.csv", encoding='ISO-8859-1')
tags = pd.read_csv("tags.csv", encoding='ISO-8859-1')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
def extract_year(title):
    match = re.search(r'\((\d{4})\)', title)
    if match:
        return int(match.group(1))  # Extract and return the year as an integer
    return None  # Return None if no year is found

def remove_year_from_title(title):
    # Regular expression to remove the year in parentheses
    return re.sub(r'\s?\(\d{4}\)', '', title)

# Apply the functions to extract year and remove year from the title
movies['year'] = movies['title'].apply(extract_year)
movies['title'] = movies['title'].apply(remove_year_from_title)

# Preview the data to check the result
print(movies[['title', 'year']].head())

                         title  year
0                    Toy Story  1995
1                      Jumanji  1995
2             Grumpier Old Men  1995
3            Waiting to Exhale  1995
4  Father of the Bride Part II  1995


In [5]:
tag_counts = tags.groupby(['movieId', 'tag']).size().reset_index(name='tag_count')

tag_frequency = tags['tag'].value_counts().reset_index()
tag_frequency.columns = ['tag', 'total_count']

tag_counts_with_frequency = pd.merge(tag_counts, tag_frequency, on='tag', how='left')

# Calculate the "uniqueness" score (tags that are less frequent across movies should have a lower score)
tag_counts_with_frequency['uniqueness_score'] = tag_counts_with_frequency['total_count'].rank(method='min')

top_unique_tags_per_movie = tag_counts_with_frequency.groupby('movieId').apply(
    lambda x: x.nsmallest(5, 'uniqueness_score')).reset_index(drop=True)

top_unique_tags_aggregated = top_unique_tags_per_movie.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()

top_unique_tags_with_titles = pd.merge(top_unique_tags_aggregated, movies[['movieId', 'title', 'year', 'genres']], on='movieId', how='left')

top_unique_tags_with_titles = top_unique_tags_with_titles[['movieId', 'title', 'year','genres', 'tag']]

print(top_unique_tags_with_titles[['title', 'tag']].head())

# Save the result to a CSV file
#top_unique_tags_with_titles.to_csv('top_5_unique_tags_per_movie.csv', index=False)


                         title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                                 tag  
0  TÃ©a Leoni does not star in this movie, toy, v...  
1  see also:Zathura, Chris Van Allsburg, childhoo...  
2  comedinha de velhinhos engraÃÂ§ada, comedinha...  
3              characters, chick flick, revenge, CLV  
4  steve martin, Diane Keaton, Steve Martin, Fant...  


  top_unique_tags_per_movie = tag_counts_with_frequency.groupby('movieId').apply(


In [6]:
movies=top_unique_tags_with_titles

In [7]:
movies.head(20)

Unnamed: 0,movieId,title,year,genres,tag
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,"TÃ©a Leoni does not star in this movie, toy, v..."
1,2,Jumanji,1995,Adventure|Children|Fantasy,"see also:Zathura, Chris Van Allsburg, childhoo..."
2,3,Grumpier Old Men,1995,Comedy|Romance,"comedinha de velhinhos engraÃÂ§ada, comedinha..."
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,"characters, chick flick, revenge, CLV"
4,5,Father of the Bride Part II,1995,Comedy,"steve martin, Diane Keaton, Steve Martin, Fant..."
5,6,Heat,1995,Action|Crime|Thriller,"career criminal, individualism, tough guy, ONE..."
6,7,Sabrina,1995,Comedy|Romance,"remadeAs:Sabrina(1995), remakeOf:Sabrina(1954)..."
7,9,Sudden Death,1995,Action,"Peter Hyams, Jean-Claude Van Damme, Can't reme..."
8,10,GoldenEye,1995,Action|Adventure|Thriller,"007 movies are bad., tank chase scene, Puerto ..."
9,11,"American President, The",1995,Comedy|Drama|Romance,"23.03.06, Amazing kung fu, clever dialogue, de..."


In [8]:
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()

movies = pd.merge(movies, avg_ratings, on='movieId', how='left')

movies = movies.rename(columns={'rating': 'average_rating'})

movies.head(10)

Unnamed: 0,movieId,title,year,genres,tag,average_rating
0,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy,"TÃ©a Leoni does not star in this movie, toy, v...",3.793347
1,2,Jumanji,1995,Adventure|Children|Fantasy,"see also:Zathura, Chris Van Allsburg, childhoo...",3.069892
2,3,Grumpier Old Men,1995,Comedy|Romance,"comedinha de velhinhos engraÃÂ§ada, comedinha...",2.923077
3,4,Waiting to Exhale,1995,Comedy|Drama|Romance,"characters, chick flick, revenge, CLV",2.576923
4,5,Father of the Bride Part II,1995,Comedy,"steve martin, Diane Keaton, Steve Martin, Fant...",2.848684
5,6,Heat,1995,Action|Crime|Thriller,"career criminal, individualism, tough guy, ONE...",3.818182
6,7,Sabrina,1995,Comedy|Romance,"remadeAs:Sabrina(1995), remakeOf:Sabrina(1954)...",3.25
7,9,Sudden Death,1995,Action,"Peter Hyams, Jean-Claude Van Damme, Can't reme...",2.634615
8,10,GoldenEye,1995,Action|Adventure|Thriller,"007 movies are bad., tank chase scene, Puerto ...",3.335968
9,11,"American President, The",1995,Comedy|Drama|Romance,"23.03.06, Amazing kung fu, clever dialogue, de...",3.380734


In [9]:
movies.shape

(2495, 6)

In [10]:
movies.to_csv('movies_streamlit.csv', index=False)