In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import h5py
import joblib
from scipy.sparse import vstack

In [16]:
# Read the already-cleaned review dataset from disk into the working DataFrame
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

# Preview the first five rows (rendered as this cell's output)
df.head(n=5)

Unnamed: 0,title,id,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,...,distributor,reviewId,creationDate,criticName,isTopCritic,reviewState,publicatioName,reviewText,releaseYear,sentiment_score
0,"love, lies",love_lies,43.0,65.76381,Unknown,Unknown,1900-01-01,1900-01-01,120.0,drama,...,Unknown,2739073,2020-10-31,James Mudge,False,fresh,easternkicks.com,though let down by its routine love triangle n...,1900,0.348889
1,dinosaur island,dinosaur_island_2015,23.0,65.76381,Unknown,Unknown,1900-01-01,2015-05-12,79.0,"adventure, fantasy",...,Unknown,2261561,2015-05-12,Renee Schonfeld,False,rotten,common sense media,"uneven time-travel adventure has peril, mild v...",1900,0.066667
2,adrift,adrift_2018,65.0,69.0,PG-13,"['Injury Images', 'Brief Drug Use', 'Thematic ...",2018-06-01,2018-08-21,120.0,"adventure, drama, romance",...,STX Films,102694850,2022-05-29,Josh Parham,False,fresh,next best picture,this is nowhere near the level of other great ...,2018,0.229167
3,born to kill,1035316-born_to_kill,74.0,83.0,Unknown,Unknown,1947-04-30,2016-05-23,92.0,"crime, drama",...,Unknown,2710947,2020-08-05,Mike Massie,False,fresh,gone with the twins,"one of the most acerbic of all films noir, boa...",1947,-0.125
4,the garden murder case,garden_murder_case,58.349096,65.76381,Unknown,Unknown,1900-01-01,2016-10-20,61.0,mystery & thriller,...,Unknown,2132383,2013-03-10,Paul Chambers,False,rotten,movie chambers,another philo vance entry with familiar mgm co...,1900,-0.0625


In [17]:
# Data Preprocessing
# Fill gaps with '' in NON-numeric columns only. A blanket df.fillna('') would
# also write '' into numeric columns, turning them into object dtype and making
# the MinMaxScaler below fail when it tries to convert '' to float.
non_numeric_columns = df.select_dtypes(exclude='number').columns
df[non_numeric_columns] = df[non_numeric_columns].fillna('')

# Rescale the three score columns to the [0, 1] range (MinMaxScaler default).
score_columns = ['audienceScore', 'tomatoMeter', 'sentiment_score']
scaler = MinMaxScaler()
df[score_columns] = scaler.fit_transform(df[score_columns])

In [18]:
# Step 1: Split the 'genre' column into one column per comma-separated entry
# (as many columns as the longest genre list). Surrounding whitespace is
# stripped so that "drama" and " drama" are the same label.
genres_split = df['genre'].str.split(',', expand=True).apply(lambda s: s.str.strip())
genres_split.columns = [f'genre_{i+1}' for i in range(genres_split.shape[1])]

# Step 2: Split the 'ratingContents' column the same way. The values are
# stringified lists (e.g. "['Injury Images', 'Brief Drug Use']"), so brackets
# and quotes are stripped as well; otherwise the same label would be a
# different category depending on whether it is first, middle or last.
rating_contents_split = df['ratingContents'].str.split(',', expand=True).apply(
    lambda s: s.str.strip(" []'\"")
)
rating_contents_split.columns = [f'ratingContent_{i+1}' for i in range(rating_contents_split.shape[1])]

# Step 3: Concatenate the original dataframe with the new genre and rating
# content columns. Columns left over from a previous run of this cell are
# dropped first so that re-running does not create duplicate column names.
split_column_names = list(genres_split.columns) + list(rating_contents_split.columns)
df = pd.concat(
    [df.drop(columns=split_column_names, errors='ignore'), genres_split, rating_contents_split],
    axis=1,
)

# Step 4: Integer-encode every split column (codes are assigned per column).
# `cat.codes` marks missing entries with -1 and numbers real categories from 0.
# Shifting by +1 keeps all values non-negative for matrix computation while
# reserving 0 exclusively for "no value"; replacing -1 with 0 would instead
# merge missing entries with the first real category.
for col in split_column_names:
    df[col] = df[col].astype('category').cat.codes.astype('int32') + 1

# Final DataFrame Overview
print(df.head())

                    title                    id  audienceScore  tomatoMeter  \
0              love, lies             love_lies       0.430000     0.657638   
1         dinosaur island  dinosaur_island_2015       0.230000     0.657638   
2                  adrift           adrift_2018       0.650000     0.690000   
3            born to kill  1035316-born_to_kill       0.740000     0.830000   
4  the garden murder case    garden_murder_case       0.583491     0.657638   

    rating                                     ratingContents  \
0  Unknown                                            Unknown   
1  Unknown                                            Unknown   
2    PG-13  ['Injury Images', 'Brief Drug Use', 'Thematic ...   
3  Unknown                                            Unknown   
4  Unknown                                            Unknown   

  releaseDateTheaters releaseDateStreaming  runtimeMinutes  \
0          1900-01-01           1900-01-01           120.0   
1         

In [19]:
# Combine encoded features for the recommendation system.
# Each code is prefixed with its column name (e.g. "genre_1_3"): the integer
# codes are assigned independently per column, so the bare number "3" would
# otherwise be treated as the same term whichever column it came from.
encoded_columns = [
    col for col in df.columns
    if col.startswith(('genre_', 'ratingContent_'))
]
if not encoded_columns:
    raise ValueError("No 'genre_*' or 'ratingContent_*' columns found to combine.")

# Build the text column-wise (vectorised) instead of with a row-wise apply.
token_columns = [col + '_' + df[col].astype(str) for col in encoded_columns]
combined = token_columns[0]
for tokens in token_columns[1:]:
    combined = combined + ' ' + tokens
df['combined_features'] = combined

In [20]:
# Report, per column, how many missing (NaN) entries are still present
print(df.isnull().sum(axis=0))

title                   0
id                      0
audienceScore           0
tomatoMeter             0
rating                  0
ratingContents          0
releaseDateTheaters     0
releaseDateStreaming    0
runtimeMinutes          0
genre                   0
originalLanguage        0
director                0
writer                  0
distributor             0
reviewId                0
creationDate            0
criticName              0
isTopCritic             0
reviewState             0
publicatioName          0
reviewText              0
releaseYear             0
sentiment_score         0
genre_1                 0
genre_2                 0
genre_3                 0
genre_4                 0
genre_5                 0
genre_6                 0
genre_7                 0
ratingContent_1         0
ratingContent_2         0
ratingContent_3         0
ratingContent_4         0
ratingContent_5         0
ratingContent_6         0
ratingContent_7         0
ratingContent_8         0
combined_fea

In [21]:
# Fit the TF-IDF vectorizer on the entire dataset to establish a consistent vocabulary.
# The token pattern is overridden because the default one only keeps tokens of
# two or more characters, which would silently discard single-digit codes in
# 'combined_features'.
tfidf = TfidfVectorizer(stop_words='english', token_pattern=r'(?u)\b\w+\b')
tfidf.fit(df['combined_features'])

# Process in chunks to manage memory usage
chunk_size = 1000  # Adjust this based on your available memory
tfidf_matrix_list = []

# Transform each chunk using the already fitted TF-IDF vectorizer
for i in range(0, df.shape[0], chunk_size):
    chunk = df['combined_features'].iloc[i:i+chunk_size]
    tfidf_chunk = tfidf.transform(chunk)
    tfidf_matrix_list.append(tfidf_chunk)

# Concatenate the TF-IDF chunks into a single (sparse) matrix
tfidf_matrix = vstack(tfidf_matrix_list)

# Save the TF-IDF matrix.
# The dataset is allocated up front and filled one chunk at a time, so only
# `chunk_size` rows are ever held in dense form. Calling .toarray() on the full
# matrix would allocate rows x vocabulary values at once. The dataset name,
# shape, dtype and values are the same as a single dense write.
with h5py.File('tfidf_matrix.h5', 'w') as hdf:
    dense_out = hdf.create_dataset(
        'tfidf_matrix', shape=tfidf_matrix.shape, dtype=tfidf_matrix.dtype
    )
    row_start = 0
    for sparse_block in tfidf_matrix_list:
        row_end = row_start + sparse_block.shape[0]
        dense_out[row_start:row_end] = sparse_block.toarray()
        row_start = row_end

print("TF-IDF matrix saved.")

TF-IDF matrix saved.
