In [2]:
import dask.dataframe as dd
import pandas as pd

# CSV locations for the four input tables
movies_path, tags_path = '/content/movies.csv', '/content/tags.csv'
genome_scores_path, genome_tags_path = '/content/genome-scores.csv', '/content/genome-tags.csv'

# Read each CSV into its own Dask DataFrame, in the same order as the paths above
movies_df, tags_df, genome_scores_df, genome_tags_df = (
    dd.read_csv(csv_path)
    for csv_path in (movies_path, tags_path, genome_scores_path, genome_tags_path)
)

In [3]:
# Preview the movies table: a heading, then the leading rows followed by the column dtypes
print("Movies Dataset:")
print(movies_df.head(), movies_df.dtypes, sep="\n")

Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
movieId              int64
title      string[pyarrow]
genres     string[pyarrow]
dtype: object


In [4]:
# Preview the tags table: a heading, then the leading rows followed by the column dtypes
print("Tags Dataset:")
print(tags_df.head(), tags_df.dtypes, sep="\n")

Tags Dataset:
   userId  movieId               tag   timestamp
0       3      260           classic  1439472355
1       3      260            sci-fi  1439472256
2       4     1732       dark comedy  1573943598
3       4     1732    great dialogue  1573943604
4       4     7569  so bad it's good  1573943455
userId                 int64
movieId                int64
tag          string[pyarrow]
timestamp              int64
dtype: object


In [5]:
# Preview the genome scores table: a heading, then the leading rows followed by the column dtypes
print("Genome Scores Dataset:")
print(genome_scores_df.head(), genome_scores_df.dtypes, sep="\n")

Genome Scores Dataset:
   movieId  tagId  relevance
0        1      1    0.02875
1        1      2    0.02375
2        1      3    0.06250
3        1      4    0.07575
4        1      5    0.14075
movieId        int64
tagId          int64
relevance    float64
dtype: object


In [6]:
# Preview the genome tags table: a heading, then the leading rows followed by the column dtypes
print("Genome Tags Dataset:")
print(genome_tags_df.head(), genome_tags_df.dtypes, sep="\n")

Genome Tags Dataset:
   tagId           tag
0      1           007
1      2  007 (series)
2      3  18th century
3      4         1920s
4      5         1930s
tagId              int64
tag      string[pyarrow]
dtype: object


In [7]:
# Inner-join the movie metadata with the per-user tags through their shared 'movieId' key
combined_df = movies_df.merge(tags_df, how='inner', on='movieId')

# Sanity-check the join: show the leading rows, then the dtype of every resulting column
print(combined_df.head(), combined_df.dtypes, sep='\n')

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId           tag   timestamp  
0     791         Owned  1515175493  
1    1048  imdb top 250  1172144394  
2    1361         Pixar  1216146311  
3    3164         Pixar  1223304727  
4    3164   time travel  1223304729  
movieId                int64
title        string[pyarrow]
genres       string[pyarrow]
userId                 int64
tag          string[pyarrow]
timestamp              int64
dtype: object


In [8]:
# Drop the 'timestamp' column from the movies+tags frame.
# errors='ignore' keeps this cell re-runnable: combined_df is reassigned to its own result,
# so a second run would otherwise raise because 'timestamp' has already been removed.
combined_df = combined_df.drop('timestamp', axis=1, errors='ignore')

# Attach each genome tag's text to its relevance rows via 'tagId'
genome_combined_df = genome_scores_df.merge(genome_tags_df, on='tagId', how='inner')

# Join the genome data onto the movies+tags frame on 'movieId'.
# Both inputs carry a 'tag' column, so the merge suffixes them: 'tag_x' is the user tag
# coming from combined_df and 'tag_y' is the genome tag text from genome_combined_df.
# NOTE(review): every user-tag row of a movie is paired with every genome row of that
# movie, so the row count multiplies — confirm this expansion is intended.
final_combined_df = combined_df.merge(genome_combined_df, on='movieId', how='inner')

# Display the first few rows to check the final combined DataFrame
print(final_combined_df.head())

   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   userId  tag_x  tagId  relevance         tag_y  
0     791  Owned      1    0.02875           007  
1     791  Owned      2    0.02375  007 (series)  
2     791  Owned      3    0.06250  18th century  
3     791  Owned      4    0.07575         1920s  
4     791  Owned      5    0.14075         1930s  


In [9]:
# List every column of the fully merged frame together with its dtype
print(str(final_combined_df.dtypes))

movieId                int64
title        string[pyarrow]
genres       string[pyarrow]
userId                 int64
tag_x        string[pyarrow]
tagId                  int64
relevance            float64
tag_y        string[pyarrow]
dtype: object


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Weight each genome tag by its relevance: the tag text is repeated
# floor(relevance * scale_factor) times (scale by 100 or 1000 depending on the range of
# relevance), so more relevant tags get a higher term frequency.
scale_factor = 100
# The small epsilon absorbs binary floating-point error before truncation: 0.29 * 100
# evaluates to 28.999999999999996, which a bare int() would cut down to 28 instead of 29.
final_combined_df['tags_processed'] = final_combined_df.apply(
    lambda x: (x['tag_y'] + ' ') * int(x['relevance'] * scale_factor + 1e-9), axis=1
)

# Combine 'genres' and 'tags_processed' into a single column for TF-IDF processing.
# Missing values are replaced by '' for the concatenation only (the source columns are left
# untouched): a missing operand would make the whole concatenated entry missing, and the
# vectorizer needs a string for every row.
final_combined_df['text_data'] = (
    final_combined_df['genres'].fillna('') + ' ' + final_combined_df['tags_processed'].fillna('')
)

# stop_words='english' removes English stop words; lowercasing is the vectorizer's default
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the combined text and encode every row as a TF-IDF vector
tfidf_matrix = tfidf.fit_transform(final_combined_df['text_data'])

# Rows = rows of final_combined_df, columns = vocabulary terms
print("Shape of TF-IDF Matrix:", tfidf_matrix.shape)

Shape of TF-IDF Matrix: (1828684, 1128)


In [13]:
# Replace missing values in 'genres' and 'tags_processed' with empty strings.
# The filled column is assigned back explicitly: fillna(..., inplace=True) on a column
# selected with df[col] acts on an intermediate object, which pandas does not guarantee to
# write through to the parent frame (and never does under Copy-on-Write).
final_combined_df['genres'] = final_combined_df['genres'].fillna('')
final_combined_df['tags_processed'] = final_combined_df['tags_processed'].fillna('')

# Rebuild 'text_data' from the filled columns
final_combined_df['text_data'] = final_combined_df['genres'] + ' ' + final_combined_df['tags_processed']

# Re-fit the existing TfidfVectorizer instance 'tfidf' on the rebuilt text
# (it must already have been created by an earlier cell)
tfidf_matrix = tfidf.fit_transform(final_combined_df['text_data'])

Shape of TF-IDF Matrix: (1828684, 1128)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Row position in the TF-IDF matrix used as the reference; 0 is the first row of the DataFrame
selected_movie_index = 0
selected_movie_vector = tfidf_matrix[selected_movie_index]

# Similarity of the reference row against every row of the matrix (a single-row result)
cosine_similarities = cosine_similarity(selected_movie_vector, tfidf_matrix)

# Label each score with the movieId of its row, then keep only the first score per movieId
similarity_scores = pd.DataFrame(
    data=cosine_similarities.flatten(),
    columns=['similarity_score'],
    index=final_combined_df['movieId'],
)
similarity_scores = similarity_scores.loc[~similarity_scores.index.duplicated(keep='first')]

# Show the per-movieId scores, highest similarity first
print(similarity_scores.sort_values('similarity_score', ascending=False))

         similarity_score
movieId                  
1                1.000000
13               0.964698
60               0.961626
126              0.954981
2                0.946177
...                   ...
32               0.826911
22               0.824858
131              0.824296
123              0.777629
136              0.715566

[128 rows x 1 columns]


In [16]:
def get_recommendations(title, df, tfidf_matrix, top_n=10):
    """Return the ``top_n`` titles whose TF-IDF rows are most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``.
    df : DataFrame
        Frame with a 'title' column. Assumed to be a pandas DataFrame whose i-th row
        corresponds to the i-th row of ``tfidf_matrix`` — TODO confirm against the caller.
    tfidf_matrix : matrix
        Row-indexable feature matrix accepted by ``cosine_similarity``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    DataFrame
        One column, 'similarity_score', indexed by title and sorted from most to least
        similar. The queried title itself is excluded.

    Raises
    ------
    IndexError
        If no row of ``df`` carries the requested title.
    """
    # Locate the first matching row by POSITION. tfidf_matrix is addressed positionally,
    # so using the index label is only correct when df has a default 0..n-1 index.
    is_match = (df['title'] == title).fillna(False).to_numpy(dtype=bool)
    match_positions = is_match.nonzero()[0]
    if len(match_positions) == 0:
        raise IndexError(f"No movie titled {title!r} found in df['title']")
    row_pos = int(match_positions[0])

    # Similarity of the selected row against every row of the matrix
    cosine_sim = cosine_similarity(tfidf_matrix[row_pos], tfidf_matrix)

    # Label each score with the title of its row
    sim_scores = pd.DataFrame(cosine_sim.flatten(), index=df['title'], columns=['similarity_score'])

    # A title can occupy many rows; keep the score of its first row only
    sim_scores = sim_scores[~sim_scores.index.duplicated(keep='first')]

    # Most similar first
    sim_scores = sim_scores.sort_values('similarity_score', ascending=False)

    # Remove the queried title by name instead of assuming it sorted first: another title
    # with an equal top score may be placed ahead of it.
    sim_scores = sim_scores.drop(index=title)

    # max(..., 0) keeps a non-positive top_n returning an empty frame
    return sim_scores.iloc[:max(top_n, 0)]

In [17]:
# Ask for the titles most similar to one movie; with top_n left at its default this yields ten
movie_title = "Jumanji (1995)"
recommendations = get_recommendations(
    title=movie_title,
    df=final_combined_df,
    tfidf_matrix=tfidf_matrix,
)
print(recommendations)

                                     similarity_score
title                                                
NeverEnding Story III, The (1994)            0.998087
Amazing Panda Adventure, The (1995)          0.992320
Mortal Kombat (1995)                         0.990213
Dunston Checks In (1996)                     0.985451
Now and Then (1995)                          0.984720
Indian in the Cupboard, The (1995)           0.983935
Broken Arrow (1996)                          0.982963
Big Green, The (1995)                        0.982629
It Takes Two (1995)                          0.982629
GoldenEye (1995)                             0.981778
