In [104]:
import ast
import re

import pandas as pd
from numpy import array
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding


In [105]:
# Read both CSV files from the working directory in one pass:
# df1 <- credits file, df2 <- movies file (order matches the tuple below).
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_credits.csv', 'tmdb_5000_movies.csv')
)

In [106]:
# Join the credits columns with the movie-metadata columns on the shared
# 'title' column. how='outer' keeps titles that appear in only one of the two
# frames; such rows hold NaN in the columns contributed by the other frame.
# NOTE(review): if a title is repeated in either file, the join emits one row
# per left/right pairing of that title — confirm titles are unique, or join on
# a unique id column if both files provide one.
merged_df = df1[['title', 'cast', 'crew']].merge(
    df2[['title', 'genres', 'keywords', 'overview']],
    on='title',
    how='outer',
)

In [107]:
# Preprocess data
def preprocess_text(text):
  """Normalise a piece of free text before vectorisation.

  Steps, in order:
    1. lowercase the whole string;
    2. delete every character that is neither a word character
       (letters, digits, underscore) nor whitespace;
    3. drop whitespace-separated tokens shorter than three characters;
    4. re-join the surviving tokens with single spaces.

  Args:
    text: the string to clean. Must be a ``str`` (``.lower()`` is called on it).

  Returns:
    The cleaned string; empty if no token survives.
  """
  lowered = text.lower()
  without_punctuation = re.sub(r'[^\w\s]', '', lowered)
  long_tokens = (token for token in without_punctuation.split() if len(token) >= 3)
  return ' '.join(long_tokens)

# Fill missing overviews with '' before casting: a bare astype(str) turns NaN
# into the literal word "nan", which would survive the length filter in
# preprocess_text and reach the vectorizer as if it were real text.
merged_df['overview'] = (
    merged_df['overview'].fillna('').astype(str).apply(preprocess_text)
)


def _parse_genres(raw):
  """Turn one raw 'genres' cell into a Python list.

  Args:
    raw: the cell value. A string is parsed as a Python/JSON-style literal;
      a list is assumed to be already parsed (e.g. the cell was re-run);
      anything else (such as NaN from an unmatched outer-merge row) is
      treated as "no genres".

  Returns:
    The parsed list, or [] when the cell holds no usable value.

  Raises:
    ValueError / SyntaxError: if a string cell is not a valid literal.
  """
  if isinstance(raw, str):
    # literal_eval only accepts literals, so text in the CSV cannot execute
    # arbitrary code the way eval() would.
    return ast.literal_eval(raw)
  if isinstance(raw, list):
    return raw
  return []


merged_df['genres'] = merged_df['genres'].apply(_parse_genres)  # genres are stored as lists


In [108]:
print(merged_df.columns)


def _names_from_field(value):
  """Flatten one genres/cast/crew/keywords cell into a space-separated string.

  Args:
    value: the cell value. May be a string holding a serialized list, an
      already-parsed list, or a missing value (NaN/None).

  Returns:
    A single string of the entries joined by spaces. For dict entries the
    'name' value is used (assumes the TMDB-style ``{"id": ..., "name": ...}``
    layout — TODO confirm for cast/crew/keywords); other entries are
    stringified. Missing values give ''. A string that cannot be parsed as a
    literal is returned unchanged so its words are still available to TF-IDF.
  """
  if isinstance(value, str):
    try:
      value = ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
      return value
  if not isinstance(value, (list, tuple)):
    return ''
  return ' '.join(
      str(entry.get('name', '')) if isinstance(entry, dict) else str(entry)
      for entry in value
  )


# Combine features row by row: each movie's text is its own overview followed
# by the names found in its genres, cast, crew and keywords cells. Building
# the Series with element-wise '+' keeps every row aligned with its movie.
combined_text = merged_df['overview'].fillna('').astype(str)
for list_column in ('genres', 'cast', 'crew', 'keywords'):
  combined_text = combined_text + ' ' + merged_df[list_column].apply(_names_from_field)
merged_df['combined_features'] = combined_text

# Create TF-IDF vectorizer (vocabulary capped at the 5000 most frequent terms)
vectorizer = TfidfVectorizer(max_features=5000)
features_matrix = vectorizer.fit_transform(merged_df['combined_features'])

# Hold out 20% of the rows; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features_matrix, merged_df['title'], test_size=0.2, random_state=42)


Index(['title', 'cast', 'crew', 'genres', 'keywords', 'overview'], dtype='object')
