In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import sys

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Normalization, Lambda
from tensorflow.keras.initializers import GlorotUniform
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [3]:
train_data = "data/train_dataset.csv"
tets_data = "data/test_dataset.csv"

df_train = pd.read_csv(train_data)
df_test = pd.read_csv(tets_data)

In [4]:
print(df_train.size)
print(pd.unique(df_train["director_name"]).size)

81108
1460


In [5]:
print(df_train.columns)

Index(['id', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'title_year', 'actor_2_facebook_likes', 'movie_facebook_likes',
       'title_embedding', 'average_degree_centrality', 'imdb_score_binned'],
      dtype='object')


In [6]:
def preprocess_tags(tags):
    # Split tags and apply MultiLabelBinarizer or similar strategy
    tags_splitted = tags.apply(lambda x: x.split('|'))
    all_tags = set(tag for sublist in tags_splitted for tag in sublist)
    tag_list = list(all_tags)
    tag_list.sort()  # Optional: Sort tags for consistent ordering

    # Create a Multi-Hot Encoded DataFrame for tags
    def encode_tags(tag_row):
        return [1 if tag in tag_row else 0 for tag in tag_list]

    tag_encoded = tags_splitted.apply(encode_tags)
    tag_df = pd.DataFrame(tag_encoded.tolist(), columns=tag_list)
    return tag_df

In [7]:
class IMDB_DataFrame(pd.DataFrame):
    @property
    def _constructor(self):
        return IMDB_DataFrame
    
    def hot_code_property(self, field):
        tags_splitted = self[field].apply(lambda x: x.split('|'))
        all_tags = set(tag for sublist in tags_splitted for tag in sublist)
        tag_list = list(all_tags)
        tag_list.sort()  # Optional: Sort tags for consistent ordering

        # Create a Multi-Hot Encoded DataFrame for tags
        def encode_tags(tag_row):
            return [1 if tag in tag_row else 0 for tag in tag_list]

        tag_encoded = tags_splitted.apply(encode_tags)
        tag_df = pd.DataFrame(tag_encoded.tolist(), columns=tag_list)
        
        return pd.concat([self.drop(field,axis=1), tag_df], axis=1)

In [8]:
# Add preprocessed tags to the dataframe
df_train = IMDB_DataFrame(pd.read_csv(train_data))
df_train = df_train.hot_code_property('genres')
df_train = df_train.hot_code_property('plot_keywords')
df_train = df_train.hot_code_property('director_name')
df_train = df_train.hot_code_property('actor_1_name')
df_train = df_train.hot_code_property('actor_2_name')
df_train = df_train.hot_code_property('actor_3_name')
df_train = df_train.hot_code_property('country')
df_train = df_train.hot_code_property('content_rating')
df_train = df_train.drop(columns=['title_embedding', 'movie_title','language'])

In [9]:
features = df_train.drop(columns=['imdb_score_binned']).astype(np.float32)
ratings = df_train['imdb_score_binned'].astype(np.float32)
print(features.shape, ratings.shape)

(3004, 12808) (3004,)


In [10]:
# prepare the training data
X_train, X_test, y_train, y_test = train_test_split(features, ratings, test_size=0.2, random_state=32)

In [32]:
# Model configuration
model = Sequential([
    Dense(128, activation='relu',kernel_initializer=GlorotUniform()),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.1),
    Dense(16, activation='relu'),
    Dense(16, activation='relu'),
    # Dense(1, activation='linear'),
    Dense(1, activation='sigmoid'),
    Lambda(lambda x: x * 4)
])

In [33]:
# Compile the model
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mean_absolute_error'])
# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=16, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1e1bc399b50>

In [34]:
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f'Validation MAE: {mae};Validation loss: {loss}')

Validation MAE: 2.19301176071167;Validation loss: 5.297836780548096


In [38]:
predictions = model.predict(X_test)
#rounded_predictions = np.clip(np.round(predictions).flatten(), 0, 4)
rounded_predictions = np.round(predictions).flatten()
accuracy = np.sum(rounded_predictions == y_test)/len(y_test)
print(accuracy)

0.008319467554076539
