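Prepares the train / validation / test data:   
1. The dataset exported from the Join Datasets notebook, which is split here into train, validation and test sets
2. The screenplays, which are loaded from their existing `train` and `test` folders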

In [1]:
# screenplays
import os

def load_screenplays_from_folder(folder_path):
    """Read every ``.txt`` screenplay found directly inside ``folder_path``.

    Files are visited in sorted file-name order. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the two returned lists
    could be ordered differently from one machine or run to the next.

    Args:
        folder_path: Directory containing the screenplay text files.

    Returns:
        A ``(screenplays, filenames)`` tuple of two parallel lists: the
        UTF-8 decoded text of each file and the matching file name.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    screenplays = []
    filenames = []
    for filename in sorted(os.listdir(folder_path)):
        # Anything that is not a .txt file is ignored.
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
            screenplays.append(f.read())
        filenames.append(filename)
    return screenplays, filenames

# The train and test screenplays live in sibling folders under ../data/screenplays
screenplays_dir = os.path.join("..", "data", "screenplays")
train_path = os.path.join(screenplays_dir, "train")
test_path = os.path.join(screenplays_dir, "test")

# Read both splits: the texts plus the file names they came from
X_train, train_filenames = load_screenplays_from_folder(train_path)
X_test, test_filenames = load_screenplays_from_folder(test_path)

# Report how many screenplays each split contains
print(f"Loaded {len(X_train)} training screenplays")
print(f"Loaded {len(X_test)} test screenplays")


Loaded 98 training screenplays
Loaded 11 test screenplays


In [19]:
# joined dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# Cache folder that holds the joined dataset and receives the split files
cache_dir = os.path.join("..", "data", "Cache")
data_path = os.path.join(cache_dir, "poster3_data.csv")
df = pd.read_csv(data_path)

# Hold out 5% of all rows as the test set, stratified on from_budget_category
df_train_val, df_test = train_test_split(
    df, test_size=0.05, stratify=df["from_budget_category"], random_state=42
)

# Hold out 5% of the remaining rows as the validation set, stratified the same way
df_train, df_val = train_test_split(
    df_train_val,
    test_size=0.05,
    stratify=df_train_val["from_budget_category"],
    random_state=42,
)

# Write every split into the cache folder, without the index column
for split_frame, split_file in (
    (df_train, "poster3_train.csv"),
    (df_val, "poster3_val.csv"),
    (df_test, "poster3_test.csv"),
):
    split_frame.to_csv(os.path.join(cache_dir, split_file), index=False)

# Report the size of each split
print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_val)}")
print(f"Test size: {len(df_test)}")

Train size: 4230
Validation size: 223
Test size: 235


In [14]:
import re

# Clean and normalize filenames to extract base titles
def clean_title(filename):
    """Normalize a screenplay file name into a bare title for comparison.

    A trailing ``.txt`` is dropped, every character other than an ASCII
    letter, digit or space is deleted, and the result is lowercased with
    surrounding spaces trimmed.
    """
    stem = re.sub(r"\.txt$", "", filename)
    # Splitting on the unwanted characters and re-joining deletes them.
    kept = "".join(re.split(r"[^a-zA-Z0-9 ]", stem))
    return kept.lower().strip()

# Normalized titles of all test screenplays, collected for fast membership checks
test_titles_screenplay = {clean_title(name) for name in test_filenames}

# Clean titles from df_train and remove any that match test screenplay titles
def is_conflict(row, test_titles=None):
    """Tell whether a dataset row is a movie that also has a test screenplay.

    The movie title is read from the first of ``title``, ``original_title``
    or ``from_title`` that is present in ``row``. ``from_title`` is included
    because that is the column this notebook reads the movie title from when
    it builds the long-format export; with only the first two names checked,
    a frame that lacks them can never produce a match.

    The title is normalized the same way as the screenplay file names
    (non-alphanumeric characters removed, trimmed, lowercased) before the
    lookup.

    Args:
        row: Mapping-like record supporting ``in`` and ``[]`` by column name,
            e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.
        test_titles: Optional collection of normalized test titles. When
            omitted, the notebook-level ``test_titles_screenplay`` is used.

    Returns:
        ``True`` if the normalized title is one of the test titles, ``False``
        otherwise or when ``row`` has none of the title columns.
    """
    if test_titles is None:
        test_titles = test_titles_screenplay

    for column in ("title", "original_title", "from_title"):
        if column in row:
            title = str(row[column])
            break
    else:
        # No known title column: nothing to compare against.
        return False

    cleaned = re.sub(r"[^a-zA-Z0-9 ]", "", title).strip().lower()
    return cleaned in test_titles

# Keep only the training rows whose title does not collide with a test screenplay
conflict_mask = df_train.apply(is_conflict, axis=1)
df_train_filtered = df_train.loc[~conflict_mask].copy()

print(f"Original train size: {len(df_train)}")
print(f"Filtered train size: {len(df_train_filtered)}")


Original train size: 4230
Filtered train size: 4230


In [4]:
import pandas as pd

# Load the long-format graph export and show its columns. The folder is
# spelled "Cache", as in every other path in this notebook; a lowercase
# "cache" only resolves on case-insensitive filesystems.
df_long = pd.read_csv("../data/Cache/graph_data.csv")
df_long.columns

Index(['from_label', 'from_tconst', 'from_title', 'from_year',
       'from_runtimeMinutes', 'from_averageRating', 'from_numVotes',
       'from_budget', 'from_revenue', 'relationship', 'to_label', 'to_name',
       'to_id', 'to_gender', 'to_popularity', 'to_adult'],
      dtype='object')

### MAKE LONG AGAIN

In [21]:
import ast

import pandas as pd

# NOTE(review): the export below is named "..._train.csv" but is built from
# df_train_val (train + validation rows) — confirm that this is intended.
df = df_train_val.copy()
flat_rows = []
n_cast = 20  # a cast entry is kept when its 'order' value is <= n_cast
# NOTE(review): if 'order' is zero-based this keeps n_cast + 1 actors per movie — confirm.

# Movie attributes copied onto every exported row (the "from" side of an edge).
MOVIE_FIELDS = (
    'from_tconst', 'from_title', 'from_year', 'from_runtimeMinutes',
    'from_averageRating', 'from_numVotes', 'from_budget', 'from_revenue',
    'from_budget_category', 'profit', 'profit_percent_gain',
)

# Crew jobs that are exported, mapped to the relationship name they produce.
CREW_JOB_RELATIONSHIPS = {
    'Director': 'DIRECTED',
    'Producer': 'PRODUCED',
    'Director of Photography': 'PHOTOGRAPHED',
}

# Optional columns whose cells hold stringified Python literals.
LITERAL_COLUMNS = ('production_companies', 'cast', 'crew')

# Raised by a malformed literal or by entries that do not have the expected shape.
FIELD_ERRORS = (ValueError, SyntaxError, TypeError, AttributeError)


def _split_names(value):
    """Return the entries of a ', '-separated string; lists pass through.

    Any other value (such as the NaN pandas uses for an empty cell) yields
    no entries instead of failing when iterated.
    """
    if isinstance(value, str):
        return value.split(', ')
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


def _parse_literal(value):
    """Turn a stringified Python literal into the object it describes.

    ``ast.literal_eval`` only accepts literals, so text read from the CSV is
    never executed as code (unlike ``eval``). Lists and tuples are returned
    unchanged; any other value, including a missing cell, gives ``None``.

    Raises:
        ValueError, SyntaxError: If the string is not a valid literal.
    """
    if isinstance(value, str):
        return ast.literal_eval(value)
    if isinstance(value, (list, tuple)):
        return value
    return None


def _edge(from_movie, relationship, to_label, to_name,
          to_id='', to_gender='', to_popularity=''):
    """Build one long-format row linking a movie to a related entity."""
    return {
        **from_movie,
        'relationship': relationship,
        'to_label': to_label,
        'to_name': to_name,
        'to_id': to_id,
        'to_gender': to_gender,
        'to_popularity': to_popularity,
    }


# Count, per optional column that exists, the cells that had to be skipped.
skipped_fields = {column: 0 for column in LITERAL_COLUMNS if column in df.columns}

for _, row in df.iterrows():
    from_movie = {'from_label': 'Movie', **{field: row[field] for field in MOVIE_FIELDS}}

    # === Subgenres ===
    for sub in _split_names(row['subgenres']):
        flat_rows.append(_edge(from_movie, 'HAS_SUBGENRE', 'Subgenre', sub.strip()))

    # === Genres ===
    for genre in _split_names(row['genres']):
        flat_rows.append(_edge(from_movie, 'IN_GENRE', 'Genre', genre.strip()))

    # === Production Companies ===
    if 'production_companies' in skipped_fields:
        try:
            for company in _parse_literal(row['production_companies']) or []:
                flat_rows.append(
                    _edge(from_movie, 'PRODUCED_BY', 'ProductionCompany', company.strip())
                )
        except FIELD_ERRORS:
            # Best effort: a bad cell is counted and reported, not fatal.
            skipped_fields['production_companies'] += 1

    # === Cast ===
    if 'cast' in skipped_fields:
        try:
            for actor in _parse_literal(row['cast']) or []:
                if int(actor.get('order', 999)) <= n_cast:
                    flat_rows.append(_edge(
                        from_movie, 'ACTED_IN', 'Actor',
                        actor.get('name'), actor.get('id'),
                        actor.get('gender'), actor.get('popularity'),
                    ))
        except FIELD_ERRORS:
            skipped_fields['cast'] += 1

    # === Crew ===
    if 'crew' in skipped_fields:
        try:
            for member in _parse_literal(row['crew']) or []:
                job = member.get('job')
                if job in CREW_JOB_RELATIONSHIPS:
                    flat_rows.append(_edge(
                        from_movie, CREW_JOB_RELATIONSHIPS[job], job,
                        member.get('name'), member.get('id'),
                        member.get('gender'), member.get('popularity'),
                    ))
        except FIELD_ERRORS:
            skipped_fields['crew'] += 1

# Make skipped data visible instead of silently dropping it.
absent_columns = [column for column in LITERAL_COLUMNS if column not in df.columns]
if absent_columns:
    print("Columns not found, no rows produced for:", ", ".join(absent_columns))
for column, count in skipped_fields.items():
    if count:
        print(f"Skipped {count} malformed '{column}' value(s).")

# === Create Long Format DataFrame ===
df_long = pd.DataFrame(flat_rows)
df_long = df_long.fillna('')

# Save
df_long.to_csv("../data/Cache/graph_poster3_train.csv", index=False)

print("✅ Exported graph_poster3_train.csv with", len(df_long), "rows.")


✅ Exported graph_poster3_train.csv with 21397 rows.


In [15]:
# Tally how often each label occurs on the "to" side
# NOTE(review): this inspects df_train, not a long-format frame such as df_long — confirm which one was intended.
to_label_counts = df_train['to_label'].value_counts()
print(to_label_counts)


to_label
Genre    4230
Name: count, dtype: int64
