In [1]:
import pandas as pd
from itertools import combinations

# Path to the original dataset
file_path = './movies.csv'

# Column order of the generated collaboration table
PAIR_COLUMNS = ['actor_1', 'actor_2', 'movie_title', 'release_date']


def actor_pairs(credits, separator='-'):
    """Return every unordered pair of actors named in a credits string.

    Args:
        credits: The raw credits value of one movie: actor names joined by
            ``separator``. Missing values (NaN / None) are accepted.
        separator: The string that delimits actor names.

    Returns:
        A list of ``(actor_1, actor_2)`` tuples in the order produced by
        ``itertools.combinations``; empty when ``credits`` is missing or
        names fewer than two actors.

    NOTE: a name that itself contains the separator (for example a
    hyphenated surname) is split into two separate entries. The credits
    string alone gives no way to tell these cases apart, so this is left
    as-is.
    """
    if pd.isna(credits):
        return []
    # Drop empty tokens produced by doubled, leading or trailing separators,
    # so that no pair contains a blank actor name.
    actors = [name for name in credits.split(separator) if name]
    return list(combinations(actors, 2))


def build_collaborations(movies, progress_every=100000):
    """Expand each movie into one record per pair of credited actors.

    Args:
        movies: DataFrame with 'credits', 'title' and 'release_date' columns.
        progress_every: Print a progress line every this many rows.

    Returns:
        A list of ``(actor_1, actor_2, movie_title, release_date)`` tuples,
        matching ``PAIR_COLUMNS``.
    """
    total_rows = len(movies)
    records = []
    # Iterating the three needed columns directly avoids building a Series
    # per row (as iterrows() does), and tuples are far lighter than dicts
    # when tens of millions of records are produced.
    columns = zip(movies['credits'], movies['title'], movies['release_date'])
    for position, (credits, title, release_date) in enumerate(columns):
        # Showing progress (by row position, so it works for any index)
        if position % progress_every == 0:
            print(f"Processed {position} / {total_rows} movies...")

        records.extend(
            (actor_1, actor_2, title, release_date)
            for actor_1, actor_2 in actor_pairs(credits)
        )
    return records


# The guard keeps the functions importable without touching the file system;
# when run as a script or notebook cell the code below executes as before.
if __name__ == "__main__":
    # Reading the CSV file
    df = pd.read_csv(file_path)

    # Cleaning and transforming the data
    # Splitting the actors list and creating new rows for each pair of actors
    total_rows = len(df)
    print(f"Processing {total_rows} movies...")
    cleaned_data = build_collaborations(df)

    # Converting to DataFrame; explicit columns keep the header even when
    # no pairs were produced
    cleaned_df = pd.DataFrame(cleaned_data, columns=PAIR_COLUMNS)

    # Displaying a few rows to check
    print(cleaned_df.head())

    # Saving the cleaned data to a new CSV file, in the parent directory
    cleaned_df.to_csv('../actor_collaborations.csv', index=False)

    # Printing the total number of entries in the cleaned dataset
    print(f"Data processing completed. The cleaned dataset contains {len(cleaned_df)} entries and is saved as 'actor_collaborations.csv'.")

Processing 722580 movies...
Processed 0 / 722580 movies...
Processed 100000 / 722580 movies...
Processed 200000 / 722580 movies...
Processed 300000 / 722580 movies...
Processed 400000 / 722580 movies...
Processed 500000 / 722580 movies...
Processed 600000 / 722580 movies...
Processed 700000 / 722580 movies...
         actor_1           actor_2        movie_title release_date
0  Jason Statham           Wu Jing  Meg 2: The Trench   2023-08-02
1  Jason Statham  Shuya Sophia Cai  Meg 2: The Trench   2023-08-02
2  Jason Statham      Sergio Peris  Meg 2: The Trench   2023-08-02
3  Jason Statham          Mencheta  Meg 2: The Trench   2023-08-02
4  Jason Statham    Skyler Samuels  Meg 2: The Trench   2023-08-02
Data processing completed. The cleaned dataset contains 50216705 entries and is saved as 'actor_collaborations.csv'.
