In [27]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate, Dropout, LSTM, TextVectorization

df=pd.read_csv("top10K-TMDB-movies.csv")

#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(df,)


label_encoder = LabelEncoder()

df["genre"] = label_encoder.fit_transform(df['genre'])
df['original_language']= label_encoder.fit_transform(df['original_language'])
df['title'] = label_encoder.fit_transform(df['title'])

scaler = StandardScaler()
df['popularity'] = scaler.fit_transform(df[['popularity']])

df = df.dropna()

#Text Vectorization for Reviews

df['release_date'] = pd.to_datetime(df['release_date'])
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_day'] = df['release_date'].dt.day
df = df.drop(columns=['release_date'])
df=df.dropna()

max_features = 20000
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length
)

#Adapting the vectorizer to the text data overview
vectorize_layer.adapt(df['overview'])

X=df.drop(columns=['id','vote_average','vote_count'])
y=df['vote_average']
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)



#Building the Model

overview_input= Input(shape=(1,),dtype=tf.string, name='overview')
popularity_input= Input(shape=(1,), name='popularity')
year_input= Input(shape=(1,), name='year')
month_input= Input(shape=(1,), name='month')
day_input= Input(shape=(1,), name='day')
genre_input= Input(shape=(1,), name='genre')
language_input= Input(shape=(1,), name='language')
title_input= Input(shape=(1,), name='title')

#Text processing
overview_vector = vectorize_layer(overview_input)
overview_embedding = Embedding(max_features, 64)(overview_vector)
overview_lstm = LSTM(64)(overview_embedding)

#Dense layers for numerical/categorical features
popularity_dense = Dense(32, activation="relu")(popularity_input)
year_dense= Dense(32, activation="relu")(year_input)
month_dense= Dense(32, activation="relu")(month_input)
day_dense= Dense(32, activation="relu")(day_input)
genre_dense=Dense(32, activation="relu")(genre_input)
language_dense= Dense(32, activation="relu")(language_input)
title_dense= Dense(32, activation="relu")(title_input)

#Concatenate all layers
concatenated = Concatenate()([overview_lstm, popularity_dense, year_dense, month_dense, day_dense, genre_dense, language_dense, title_dense])
dense_1 = Dense(64, activation='relu')(concatenated)
dropout = Dropout(0.5)(dense_1)
output = Dense(1)(dropout)

#Model
model = Model(inputs=[overview_input, popularity_input, year_input, month_input, day_input, genre_input, language_input, title_input], outputs=output)
model.compile(optimizer ='adam', loss='mse', metrics=['mae'])

# Prepare input data
train_overview = X_train['overview'].values
train_popularity = X_train['popularity'].values
train_year = X_train['release_year'].values
train_month = X_train['release_month'].values
train_day = X_train['release_day'].values
train_genre = X_train['genre'].values
train_language = X_train['original_language'].values
train_title = X_train['title'].values

# Train the Model
history = model.fit(
    [train_overview, train_popularity, train_year, train_month, train_day, train_genre, train_language, train_title],
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2
)

#prepare test data
test_overview = X_test["overview"].values
test_popularity = X_test['popularity'].values
test_year = X_test['release_year'].values
test_month = X_test['release_month'].values
test_day = X_test['release_day'].values
test_genre = X_test['genre'].values
test_language = X_test['original_language'].values
test_title = X_test['title'].values

# Evaluate the Model
model.evaluate([test_overview, test_popularity, test_year, test_month, test_day, test_genre, test_language, test_title], y_test)



Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 265ms/step - loss: 7299.1289 - mae: 44.3002 - val_loss: 5.3885 - val_mae: 1.9502
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 254ms/step - loss: 23.4302 - mae: 3.3307 - val_loss: 2.3704 - val_mae: 1.2312
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 253ms/step - loss: 12.2060 - mae: 2.6671 - val_loss: 2.7724 - val_mae: 1.4521
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 265ms/step - loss: 9.5454 - mae: 2.4414 - val_loss: 2.0820 - val_mae: 1.2376
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 270ms/step - loss: 8.2613 - mae: 2.2983 - val_loss: 2.1298 - val_mae: 1.2674
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 255ms/step - loss: 7.7247 - mae: 2.2187 - val_loss: 1.8493 - val_mae: 1.1666
Epoch 7/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

[0.8992904424667358, 0.7697062492370605]

In [28]:
'''
# Assuming you have already fitted the LabelEncoder to the movie titles
# and stored it in the variable label_encoder_title

# Suppose you have an encoded movie title
encoded_title = 1016  # Replace 123 with the actual encoded title

# Use inverse_transform to get the original movie title
original_title = label_encoder_title.inverse_transform([encoded_title])[0]

print("Original Movie Title:", original_title)
'''

'\n# Assuming you have already fitted the LabelEncoder to the movie titles\n# and stored it in the variable label_encoder_title\n\n# Suppose you have an encoded movie title\nencoded_title = 1016  # Replace 123 with the actual encoded title\n\n# Use inverse_transform to get the original movie title\noriginal_title = label_encoder_title.inverse_transform([encoded_title])[0]\n\nprint("Original Movie Title:", original_title)\n'