# Nerual network

In [1]:
# Load data (from saved but could also do from if you ran everything in one go above)
import pandas as pd
import numpy as np

final_dataset = pd.read_csv('processed_movies.csv')


In [2]:
final_dataset.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,actor_ids,averageRating,numVotes
0,tt0000009,Miss Jerry,1894,['Romance'],[],5.4,218.0
1,tt0000147,The Corbett-Fitzsimmons Fight,1897,"['Documentary', 'News', 'Sport']",[],5.3,549.0
2,tt0000502,Bohemios,1905,['\\N'],[],3.8,20.0
3,tt0000574,The Story of the Kelly Gang,1906,"['Action', 'Adventure', 'Biography']",[],6.0,969.0
4,tt0000591,The Prodigal Son,1907,['Drama'],[],5.6,30.0


In [3]:
final_dataset.columns

Index(['tconst', 'primaryTitle', 'startYear', 'genres', 'actor_ids',
       'averageRating', 'numVotes'],
      dtype='object')

In [4]:

#### FRONTEND ISSUE
"""
The issue here was that there was incorrect class labeling in the genres in this line:
    genre_features = genre_mlb.fit_transform(final_dataset['genres'])

If you see data above this "genres" is in the form of an array with strings, so we just use ast (libary) to safely eval these as arrays. More clearly this was the issue:

Data inside of genre was:  "['Genre1', 'Genre2']" 

Note the string array string. This just is evaulated to an Array not a string array

"""

import ast

#safely parse string representations of lists/arrays
def parse_list_string(s):
    try:
        #for NaN or empty strings
        if pd.isna(s) or not isinstance(s, str) or not s.strip():
            return []
        #if not nan the evaluate the string literal
        parsed = ast.literal_eval(s)
        return parsed if isinstance(parsed, list) else []
    except (ValueError, SyntaxError):
        # return empty list if parsing fails
        return []

# Parse genres string back to list
final_dataset['genres'] = final_dataset['genres'].apply(parse_list_string)

# Parse actor_ids string back to list
final_dataset['actor_ids'] = final_dataset['actor_ids'].apply(parse_list_string)


In [5]:
### REDO the model params 
from sklearn.preprocessing import MultiLabelBinarizer

# didnt save normalized year so we can redo it 
curr_year = 2025
final_dataset['year_normalized'] = (final_dataset['startYear'] - 1900) / (curr_year - 1900)


# this genres was fine
genre_mlb = MultiLabelBinarizer()
genre_features = genre_mlb.fit_transform(final_dataset['genres'])
genre_features_names = genre_mlb.classes_


# for actors do the same as above
name_df = pd.read_csv('data/name.basics.tsv', sep='\t')
title_principals_df = pd.read_csv('data/title.principals.tsv', sep='\t')




actors_df = title_principals_df[title_principals_df['category'].isin(['actor', 'actress'])]
actors_with_names = pd.merge(actors_df, name_df[['nconst', 'primaryName']], on='nconst', how='left')
actor_counts = actors_with_names['nconst'].value_counts()
top_actors = actor_counts.head(500).index.tolist()  # We'll use top 500 actors as features (500 because theres too many actors )


actor_mlb = MultiLabelBinarizer(classes=top_actors)
# basically the above line was from previous notebook since didnt save but it just boils down to top 500 actors

actor_features = actor_mlb.fit_transform(final_dataset['actor_ids'])
actor_feature_names = actor_mlb.classes_

# Set rating thresholds
rating_threshold_like = 7.0  # only the best movies!
rating_threshold_nolike = .5  # worst ones

## -> final_dataset['averageRating'].mean() gives np.float64(3.32263212820973) so a lot of bad movies??

# Initialize watched column && Set preferences: 1 for liked movies, -1 for disliked movies
final_dataset['watched'] = 0
final_dataset.loc[final_dataset['averageRating'] >= rating_threshold_like, 'watched'] = 1  # Liked
final_dataset.loc[final_dataset['averageRating'] <= rating_threshold_nolike, 'watched'] = -1  # Disliked

# get Y
y = final_dataset['watched']

liked_count = (y == 1).sum()
disliked_count = (y == -1).sum()
neutral_count = (y == 0).sum()



In [6]:
print(f"Pos count: {liked_count}")
print(f"Neg count: {disliked_count}")
print(f"Netural count: {neutral_count}")
print (f"Total count: {len(y)}")


Pos count: 94057
Neg count: 277109
Netural count: 230598
Total count: 601764


In [7]:
rating_normalized = final_dataset['averageRating'] / 10.0
final_dataset['averageRating'] = final_dataset['averageRating'] / 10.0

print(rating_normalized)

0         0.54
1         0.53
2         0.38
3         0.60
4         0.56
          ... 
601759    0.00
601760    0.00
601761    0.77
601762    0.70
601763    0.00
Name: averageRating, Length: 601764, dtype: float64


In [8]:
final_dataset.head()

Unnamed: 0,tconst,primaryTitle,startYear,genres,actor_ids,averageRating,numVotes,year_normalized,watched
0,tt0000009,Miss Jerry,1894,[Romance],[],0.54,218.0,-0.048,0
1,tt0000147,The Corbett-Fitzsimmons Fight,1897,"[Documentary, News, Sport]",[],0.53,549.0,-0.024,0
2,tt0000502,Bohemios,1905,[\N],[],0.38,20.0,0.04,0
3,tt0000574,The Story of the Kelly Gang,1906,"[Action, Adventure, Biography]",[],0.6,969.0,0.048,0
4,tt0000591,The Prodigal Son,1907,[Drama],[],0.56,30.0,0.056,0


In [9]:

X = np.hstack([
    final_dataset[['year_normalized']].values,
    genre_features,
    actor_features,
    final_dataset[['averageRating']].values
    
])
feature_names = ['year_normalized', list(genre_features), list(actor_features), 'averageRating']
#print(feature_names) <- causes lots of lag

In [10]:

# save X and Y so w dont have to rerun the above only load it in if it exists as well as metadata for frontend
import json
import os


X_FILE_PATH = 'model_data/preprocessed_X.npz'
Y_FILE_PATH = 'model_data/preprocessed_Y.npy'
METADATA_FILE_PATH = 'model_data/model_metadata.json'

if not os.path.exists('model_data'):
    os.makedirs('model_data', exist_ok=True)

# Check if files exist before processing
if os.path.exists(X_FILE_PATH) and os.path.exists(Y_FILE_PATH) and os.path.exists(METADATA_FILE_PATH):
    print("Loading preprocessed data from files...")
    # Load X (sparse format for efficiency)
    with np.load(X_FILE_PATH) as data:
        X = np.column_stack([
            data['year_normalized'],
            data['genre_features'],
            data['actor_features'],
            data['rating_features']
        ])
    
    # Load Y
    y = np.load(Y_FILE_PATH)
    
    # Load metadata
    with open(METADATA_FILE_PATH, 'r') as f:
        metadata = json.load(f)
    
    print(f"Data loaded: X shape={X.shape}, y shape={y.shape}")
else:
    print("Preprocessing and saving data...")
    
    # Metadata should include feature information for later use
    metadata = {
        'feature_count': X.shape[1],
        'year_index': 0,
        'genre_start_index': 1,
        'genre_count': genre_features.shape[1],
        'actor_start_index': 1 + genre_features.shape[1],
        'actor_count': actor_features.shape[1],
        'rating_index': X.shape[1] - 1,
        'genre_names': genre_features_names.tolist(),
        'actor_names': actor_feature_names.tolist()
    }
    
    # Save X in compressed format (separate arrays for better compression)
    np.savez_compressed(
        X_FILE_PATH,
        year_normalized=X[:, 0:1],
        genre_features=X[:, 1:1+genre_features.shape[1]],
        actor_features=X[:, 1+genre_features.shape[1]:1+genre_features.shape[1]+actor_features.shape[1]],
        rating_features=X[:, -1:]
    )
    
    # Save Y
    np.save(Y_FILE_PATH, y)
    
    # Save metadata
    with open(METADATA_FILE_PATH, 'w') as f:
        json.dump(metadata, f)
    
    print(f"Data saved: X shape={X.shape}, y shape={y.shape}")

Loading preprocessed data from files...
Data loaded: X shape=(601764, 530), y shape=(601764,)


In [11]:
#simple neural network time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Create model
model = Sequential([
    Input(shape=(X.shape[1],)),
    
    # Layer 1
    Dense(256, activation='relu'),
    BatchNormalization(), # Normalize input before activation
    Dropout(0.4),
    
    # Second hidden layer
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    
    # Third hidden layer
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    
    # Output layer: using linear activation for a regression output (popularity score which was stated prev cell).
    Dense(1, activation='linear')
])

early_stopping = EarlyStopping( # Stop training if val_loss doesn't improve for 25 epochs
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
    mode='min'
)
lr_schedule = ReduceLROnPlateau( # Reduce learning rate if val_loss doesn't improve for 5 epochs
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=0.00001
)

model.compile(optimizer=Adam(0.001), loss='mse', metrics=['mae', 'mse'])

# Train model

model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1000, batch_size=128, callbacks=[early_stopping, lr_schedule], verbose=2)

# Save model

model.save('improved_movie_rating_model.keras')


Epoch 1/1000
3997/3997 - 18s - 4ms/step - loss: 0.1130 - mae: 0.2215 - mse: 0.1130 - val_loss: 0.0269 - val_mae: 0.0872 - val_mse: 0.0269 - learning_rate: 1.0000e-03
Epoch 2/1000
3997/3997 - 15s - 4ms/step - loss: 0.0329 - mae: 0.1240 - mse: 0.0329 - val_loss: 0.0065 - val_mae: 0.0292 - val_mse: 0.0065 - learning_rate: 1.0000e-03
Epoch 3/1000
3997/3997 - 15s - 4ms/step - loss: 0.0229 - mae: 0.1002 - mse: 0.0229 - val_loss: 0.0081 - val_mae: 0.0414 - val_mse: 0.0081 - learning_rate: 1.0000e-03
Epoch 4/1000
3997/3997 - 15s - 4ms/step - loss: 0.0204 - mae: 0.0930 - mse: 0.0204 - val_loss: 0.0065 - val_mae: 0.0514 - val_mse: 0.0065 - learning_rate: 1.0000e-03
Epoch 5/1000
3997/3997 - 15s - 4ms/step - loss: 0.0187 - mae: 0.0887 - mse: 0.0187 - val_loss: 0.0053 - val_mae: 0.0195 - val_mse: 0.0053 - learning_rate: 1.0000e-03
Epoch 6/1000
3997/3997 - 16s - 4ms/step - loss: 0.0182 - mae: 0.0876 - mse: 0.0182 - val_loss: 0.0113 - val_mae: 0.0281 - val_mse: 0.0113 - learning_rate: 1.0000e-03
Epoc

In [14]:

# Simple accuracy calculation from validation MAE
val_mae = 0.0253 # from above cell

accuracy_percentage = (10 - val_mae*10) / 10 * 100  # Convert from 0-1 scale to percentage

print(f"Model recommendation accuracy: {accuracy_percentage:.1f}%")  # Outputs ~97.5%

Model recommendation accuracy: 97.5%
