In [None]:
import pandas as pd
import numpy as np
import os
import re
import tensorflow as tf
from tensorflow.python.keras.layers import Input, Dense
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.engine import data_adapter
from sklearn.metrics.pairwise import cosine_similarity


def read_csv_file(file_name):
    # Get current .py file and 
    # current_dir = os.path.dirname(os.path.abspath(__file__))
    current_dir = os.getcwd()
    # Get data full path
    file_path = os.path.join(current_dir, file_name)
    print(f"Full path: {file_path}")
    try:
        # Check the file
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found in {file_path}. Please check the file path and name.")
        # read the data and store data in DataFrame
        data = pd.read_csv(file_path)
        # print a summary of the data
        print("File read successfully! Data Info:")
        print(data.info())
        #print(data.describe())

    except FileNotFoundError as e:
        print(f"Error: {e}")
        print("File not found.")

    except pd.errors.EmptyDataError:
        print("Error: File is empty. Please check the file path and name.")

    except pd.errors.ParserError:
        print("Error: Failed to parse the CSV file. Please check the file format.")

    except Exception as e:
        print(f"Error: {e}")

    return data

def handle_missing_data(data):
    """Check and remove rows with missing values from the dataset."""
    # Remove rows with missing values
    cleaned_data = data.dropna()

    return cleaned_data

def one_hot_encode(data, column, sep):
    data_dummies = data[column].str.get_dummies(sep)
    features = pd.concat([data, data_dummies], axis=1)

    return features, data_dummies.columns

def convert_obj2num_columns(data, columns):
    for column in columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')
    return data

def _is_distributed_dataset(ds):
    return isinstance(ds, data_adapter.input_lib.DistributedDatasetSpec)

def convert_runtime(runtime):
    # convert 2h22min
    match = re.match(r'(\d+)h (\d+)m', str(runtime))
    if match:
        return int(match.group(1)) * 60 + int(match.group(2))
    # convert 2h
    match = re.match(r'(\d+)h', str(runtime))
    if match:
        return int(match.group(1)) * 60
    # convert 30min
    match = re.match(r'(\d+)m', str(runtime))
    if match:
        return int(match.group(1))
    # Return NaN if convert unsuccessful
    return np.nan

def get_similar_movies(movie_idx, embeddings, top_k=5):
    """Return similar movies index
    movie_idx: input movies index in embeddings
    embeddings: all moviesembeddings
    """
    # Calculates the cosine similarity between a specific movie and all movies in the dataset.
    similarities = cosine_similarity([embeddings[movie_idx]], embeddings)[0]
    # Sorts the similarity scores in descending order and removes the movie itself (cosine similarity of 1) 
    similar_indices = similarities.argsort()[::-1][1:top_k+1]
    return similar_indices


In [7]:
""" parameter """
# the data file name
file_name = 'IMDB Top 250 Movies.csv'
encoding_dim = [8, 16]

"""Data """
# Read CSV File
movies_data = read_csv_file(file_name)
# Remove rows with missing values from the dataset
cleaned_movies_data = handle_missing_data(movies_data)
# One-Hot Encoded
movies_features, genre_dummies = one_hot_encode(cleaned_movies_data, column='genre', sep=',')
# Budget: Not Available to NaN
movies_features['budget'] = movies_features['budget'].replace('Not Available', np.nan)
# Object to number
movies_features = convert_obj2num_columns(movies_features, ['budget'])
# Transfer nan to float32
movies_features['budget'] = movies_features['budget'].fillna(0)  # 用 0 或其他適當數值填補

# Rum time convert to time number
movies_features['run_time'] = movies_features['run_time'].apply(convert_runtime)
movies_features['run_time'] = movies_features['run_time'].fillna(movies_features['run_time'].median())

# Add 'budget', 'run_time', 'rating', 'year' as feature
feature_cols = ['budget', 'run_time', 'rating', 'year'] + list(genre_dummies)
movie_input_features = movies_features[feature_cols]
features_array = movie_input_features.values


"""Autoencoder Model"""
input_dim = movie_input_features.shape[1] # 25
# Set input layer
input_layer = Input(shape=(input_dim,)) # 1-D tuple, if shape=(input_dim) means a number 
# Encoder：Feature Compression
encoded = Dense(encoding_dim[1], activation='relu')(input_layer)
# Decoder：Reconstructing the original features
decoded = Dense(input_dim, activation='linear')(encoded)
# Set Autoencoder model
data_adapter._is_distributed_dataset = _is_distributed_dataset
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

autoencoder.fit(features_array, features_array, epochs=50, batch_size=16, shuffle=True, validation_split=0.2)

encoder = Model(inputs=input_layer, outputs=encoded)
movie_embeddings = encoder.predict(features_array)
print("Movie embeddings shape:", movie_embeddings.shape)

Full path: d:\pythonProject\sideproject\Box Office Prediction\IMDB Top 250 Movies.csv
File read successfully! Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rank         250 non-null    int64  
 1   name         250 non-null    object 
 2   year         250 non-null    int64  
 3   rating       250 non-null    float64
 4   genre        250 non-null    object 
 5   certificate  250 non-null    object 
 6   run_time     250 non-null    object 
 7   tagline      250 non-null    object 
 8   budget       250 non-null    object 
 9   box_office   250 non-null    object 
 10  casts        250 non-null    object 
 11  directors    250 non-null    object 
 12  writers      250 non-null    object 
dtypes: float64(1), int64(2), object(10)
memory usage: 25.5+ KB
None
Model: "model"
____________________________________________________________

In [9]:
movie_embeddings

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 5.47041438e+05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.31360719e+05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.04739975e+06, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.26466705e+02, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.81391906e+05, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.81396250e+05, 0.00000000e+00]], dtype=float32)

In [None]:
# Test: choose the first movie
similar_movies = get_similar_movies(0, movie_embeddings, top_k=5)
print("Indices of similar movies:", similar_movies)
# Get the name of movies from similar index
similar_movie_names = [movies_data.iloc[i]['name'] for i in similar_movies]
print("Names of similar movies:", similar_movie_names)

Indices of similar movies: [134 149  72 214 138]
Names of similar movies: ['Casino', 'Spider-Man: No Way Home', 'Coco', 'The Terminator', 'There Will Be Blood']


In [24]:
movies_data.iloc[0]

rank                                                           1
name                                    The Shawshank Redemption
year                                                        1994
rating                                                       9.3
genre                                                      Drama
certificate                                                    R
run_time                                                  2h 22m
tagline        Fear can hold you prisoner. Hope can set you f...
budget                                                  25000000
box_office                                              28884504
casts          Tim Robbins,Morgan Freeman,Bob Gunton,William ...
directors                                         Frank Darabont
writers                              Stephen King,Frank Darabont
Name: 0, dtype: object

In [16]:
similar_movie_names_df = movies_data.iloc[similar_movies]
similar_movie_names_df

Unnamed: 0,rank,name,year,rating,genre,certificate,run_time,tagline,budget,box_office,casts,directors,writers
134,135,Casino,1995,8.2,"Crime,Drama",R,2h 58m,You don't stay at the top forever,52000000,116112375,"Robert De Niro,Sharon Stone,Joe Pesci,James Wo...",Martin Scorsese,"Nicholas Pileggi,Martin Scorsese"
149,150,Spider-Man: No Way Home,2021,8.2,"Action,Adventure,Fantasy",PG-13,2h 28m,The Multiverse Unleashed.,200000000,1921847111,"Tom Holland,Zendaya,Benedict Cumberbatch,Jacob...",Jon Watts,"Chris McKenna,Erik Sommers,Stan Lee"
72,73,Coco,2017,8.4,"Animation,Adventure,Comedy",PG,1h 45m,The celebration of a lifetime,175000000,814337054,"Anthony Gonzalez,Gael García Bernal,Benjamin B...","Lee Unkrich,Adrian Molina(co-directed by)","Lee Unkrich,Jason Katz,Matthew Aldrich"
214,215,The Terminator,1984,8.1,"Action,Sci-Fi",R,1h 47m,"La sua missione e una sola: distruggere, uccid...",6400000,78371200,"Arnold Schwarzenegger,Linda Hamilton,Michael B...",James Cameron,"James Cameron,Gale Anne Hurd,William Wisher"
138,139,There Will Be Blood,2007,8.2,Drama,R,2h 38m,There Will Be Greed. There Will Be Vengeance.,25000000,76182388,"Daniel Day-Lewis,Paul Dano,Ciarán Hinds,Martin...",Paul Thomas Anderson,"Paul Thomas Anderson,Upton Sinclair"


In [21]:
similar_movies

array([134, 149,  72, 214, 138], dtype=int64)

In [26]:
movies_data

Unnamed: 0,rank,name,year,rating,genre,certificate,run_time,tagline,budget,box_office,casts,directors,writers
0,1,The Shawshank Redemption,1994,9.3,Drama,R,2h 22m,Fear can hold you prisoner. Hope can set you f...,25000000,28884504,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Frank Darabont,"Stephen King,Frank Darabont"
1,2,The Godfather,1972,9.2,"Crime,Drama",R,2h 55m,An offer you can't refuse.,6000000,250341816,"Marlon Brando,Al Pacino,James Caan,Diane Keato...",Francis Ford Coppola,"Mario Puzo,Francis Ford Coppola"
2,3,The Dark Knight,2008,9.0,"Action,Crime,Drama",PG-13,2h 32m,Why So Serious?,185000000,1006234167,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",Christopher Nolan,"Jonathan Nolan,Christopher Nolan,David S. Goyer"
3,4,The Godfather Part II,1974,9.0,"Crime,Drama",R,3h 22m,All the power on earth can't change destiny.,13000000,47961919,"Al Pacino,Robert De Niro,Robert Duvall,Diane K...",Francis Ford Coppola,"Francis Ford Coppola,Mario Puzo"
4,5,12 Angry Men,1957,9.0,"Crime,Drama",Approved,1h 36m,Life Is In Their Hands -- Death Is On Their Mi...,350000,955,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...",Sidney Lumet,Reginald Rose
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,246,The Help,2011,8.1,Drama,PG-13,2h 26m,Change begins with a whisper.,25000000,216639112,"Viola Davis,Emma Stone,Octavia Spencer,Bryce D...",Tate Taylor,"Tate Taylor,Kathryn Stockett"
246,247,Dersu Uzala,1975,8.2,"Adventure,Biography,Drama",G,2h 22m,There is man and beast at nature's mercy. Ther...,4000000,14480,"Maksim Munzuk,Yuriy Solomin,Mikhail Bychkov,Vl...",Akira Kurosawa,"Akira Kurosawa,Yuriy Nagibin,Vladimir Arsenev"
247,248,Aladdin,1992,8.0,"Animation,Adventure,Comedy",G,1h 30m,Wish granted! (DVD re-release),Not Available,Not Available,"Scott Weinger,Robin Williams,Linda Larkin,Jona...","Ron Clements,John Musker","Ron Clements,John Musker,Ted Elliott"
248,249,Gandhi,1982,8.0,"Biography,Drama,History",PG,3h 11m,His Triumph Changed The World Forever.,22000000,52767889,"Ben Kingsley,John Gielgud,Rohini Hattangadi,Ro...",Richard Attenborough,John Briley
