In [1]:
import pandas as pd
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

class MovieDataFetcher:
    """
    Fetches movie data from the TMDB HTTP API.

    Parameters
    ----------
    api_key : str
        API key sent as the ``api_key`` query parameter on every request.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Defaults to 10 so a
        stalled connection cannot block a worker thread indefinitely.
    """
    def __init__(self, api_key, timeout=10):
        self.api_key = api_key
        self.timeout = timeout

    def get_movie_genres(self, movie_id):
        """
        Retrieves the genres of a movie by its ID.

        Returns a ``(movie_id, genres)`` tuple where ``genres`` is:

        * a list of genre names when the lookup succeeded (empty if the
          response carries no genres), or
        * ``None`` when the request did not return HTTP 200 or any exception
          was raised while fetching/parsing (the error is printed, not raised).
        """
        try:
            url = f"https://api.themoviedb.org/3/movie/{movie_id}"
            params = {"api_key": self.api_key, "language": "en-US"}
            movie_data = self._make_request(url, params)

            if movie_data is None:
                return movie_id, None

            movie_genres = movie_data.get("genres")
            genres = [genre['name'] for genre in movie_genres] if movie_genres else []

            return movie_id, genres
        except Exception as e:
            # Best-effort by design: one failing ID must not abort a bulk run.
            print(f"Error fetching movie ID {movie_id}: {e}")
            return movie_id, None

    def _make_request(self, url, params):
        """
        Makes a GET request to the specified URL with given parameters.

        Returns the decoded JSON body on HTTP 200 and ``None`` for any other
        status code. Network errors (including timeouts) propagate to the
        caller.
        """
        # Without an explicit timeout, requests waits forever on a dead socket.
        response = requests.get(url, params=params, timeout=self.timeout)
        if response.status_code != 200:
            return None
        return response.json()

    @staticmethod
    def write_deleted_id_to_file(movie_id, file_name='deleted_movie_ids.txt'):
        """
        Appends a deleted movie ID, followed by a newline, to ``file_name``.
        """
        with open(file_name, 'a') as file:
            file.write(f"{movie_id}\n")

import os  # cell-local import: used only to read the API key from the environment

# Load the dataset
df = pd.read_csv('movie_details.csv')

# Initialize movie data fetcher.
# The API key is a credential and must not live in the notebook source; it is
# read from the TMDB_API_KEY environment variable instead.
api_key = os.environ.get("TMDB_API_KEY")
if not api_key:
    raise RuntimeError("Set the TMDB_API_KEY environment variable before running this cell.")
fetcher = MovieDataFetcher(api_key)

# Parallel fetching movie genres
movie_ids = df['Movie ID'].tolist()
deleted_ids = set()
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetcher.get_movie_genres, movie_id) for movie_id in movie_ids]

    for future in tqdm(as_completed(futures), total=len(movie_ids), desc="Fetching genres"):
        movie_id, genres = future.result()
        # `genres` is None on a failed lookup and [] when the movie has no genres;
        # both cases are removed from the dataset and logged to the deleted-IDs file.
        if not genres:
            deleted_ids.add(movie_id)
            MovieDataFetcher.write_deleted_id_to_file(movie_id)

# Drop every flagged movie in a single pass instead of re-filtering the whole
# frame once per failed ID (which is quadratic in the number of rows).
df = df[~df['Movie ID'].isin(deleted_ids)]

# Save the cleaned dataset
df.to_csv('cleaned_movie_details.csv', index=False)

# Count the deleted and remaining rows
final_count = len(df)
deleted_count = len(movie_ids) - final_count

print(f"Total deleted rows: {deleted_count}")
print(f"Remaining rows: {final_count}")

Fetching genres: 100%|███████████████████████████████████████████████████████| 554884/554884 [7:40:10<00:00, 20.10it/s]


Total deleted rows: 109210
Remaining rows: 445674


In [1]:
import pandas as pd

# Load the dataset from the parent directory; its 'Cast' column is parsed and
# checked for repeated names further down in this cell.
cleaned_movie_details = pd.read_csv('../cleaned_movie_details.csv')

def has_three_or_more_duplicates(cast_list):
    """
    Report whether any single name occurs at least three times in `cast_list`.

    Scanning stops as soon as a third occurrence of some name is seen.
    """
    seen = {}
    for member in cast_list:
        occurrences = seen.get(member, 0) + 1
        if occurrences >= 3:
            return True
        seen[member] = occurrences
    return False

import ast  # cell-local import: safe parser for the stringified cast lists

# The 'Cast' column holds the text form of a Python list. It was previously
# parsed with eval(), which would execute any code embedded in the CSV;
# ast.literal_eval accepts only literals and yields the same lists.
cleaned_movie_details['Cast'] = cleaned_movie_details['Cast'].apply(ast.literal_eval)

initial_count = len(cleaned_movie_details)
# Boolean mask: True for rows where some cast name appears three or more times.
has_repeated_cast = cleaned_movie_details['Cast'].apply(has_three_or_more_duplicates)
cleaned_movie_details = cleaned_movie_details[~has_repeated_cast]
final_count = len(cleaned_movie_details)

deleted_rows = initial_count - final_count

print(f"Deleted {deleted_rows} rows with three or more duplicate cast names.")
print(f"Remaining rows in the dataset: {final_count}.")

cleaned_movie_details.to_csv('cleaned_movie_details.csv', index=False)

Deleted 273 rows with three or more duplicate cast names.
Remaining rows in the dataset: 445401.


In [1]:
import pandas as pd
import concurrent.futures
from tqdm import tqdm
from processing_functions import process_row  # Import from the separate script

def main():
    """
    Filter the dataset to releases from 1960 onward and run `process_row` on
    every remaining record in parallel, then save the result.

    Reads '../cleaned_movie_details.csv' and writes 'cleaned_movie_details.csv'
    in the current working directory.
    """
    # Records handed to each worker per task. The default of 1 makes one
    # inter-process round trip per row, which dominates for a large dataset.
    chunk_size = 1000

    # Load the dataset
    df = pd.read_csv('../cleaned_movie_details.csv')

    # Convert 'Release Date' to datetime; unparseable values become NaT
    df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')

    # Filter out movies released before the 1960s.
    # Rows whose date could not be parsed (NaT) fail the comparison and are
    # dropped as well.
    df = df[df['Release Date'].dt.year >= 1960]

    records = df.to_dict('records')

    # Parallel processing
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # executor.map preserves input order, so results line up with `records`.
        results = list(tqdm(
            executor.map(process_row, records, chunksize=chunk_size),
            total=len(records),
            desc="Processing",
        ))

    # Convert the results back to a DataFrame
    df_processed = pd.DataFrame(results)

    # Save the cleaned dataset
    df_processed.to_csv('cleaned_movie_details.csv', index=False)

    print("Dataset cleaned and saved successfully.")

if __name__ == "__main__":
    main()

Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 389161/389161 [01:33<00:00, 4154.60it/s]


Dataset cleaned and saved successfully.


In [1]:
import pandas as pd
import re
from tqdm import tqdm
import ast

# Load the dataset from the parent directory; its 'Directors' and 'Cast'
# columns are normalised and validated by the loop below.
df = pd.read_csv('../cleaned_movie_details.csv')

def is_valid_name(name):
    """Return True when `name` contains no digit or disallowed symbol."""
    forbidden = '1234567890!@#$%^&*()_+=[]{}|\\;:"<>,/?'
    for symbol in forbidden:
        if symbol in name:
            return False
    return True


def format_name(name):
    """Trim `name` and collapse every run of whitespace into a single space."""
    # split() with no arguments already discards leading/trailing whitespace.
    words = name.split()
    return ' '.join(words)

# Normalised column values and a keep/drop flag are collected in one pass and
# applied afterwards. Dropping rows from `df` while iterating over it mutates
# the frame mid-iteration and rebuilds it on every drop (quadratic cost).
directors_clean = []
cast_clean = []
keep_flags = []
for raw_directors, raw_cast in tqdm(zip(df['Directors'], df['Cast']), total=df.shape[0]):

    # Both columns hold the text form of a Python list. ast.literal_eval parses
    # them without executing code, unlike the eval() used previously.
    directors = [format_name(name) for name in ast.literal_eval(raw_directors)]
    cast = [format_name(name) for name in ast.literal_eval(raw_cast)]

    directors_clean.append(str(directors))
    cast_clean.append(str(cast))

    # A row is kept only if every director and cast name is valid.
    keep_flags.append(all(is_valid_name(name) for name in directors + cast))

df['Directors'] = directors_clean
df['Cast'] = cast_clean

rows_before = df.shape[0]
df = df[pd.Series(keep_flags, index=df.index, dtype=bool)]
rows_removed = rows_before - df.shape[0]

df.to_csv('cleaned_movie_details.csv', index=False)

rows_remaining = df.shape[0]
print(f"Rows removed: {rows_removed}")
print(f"Rows remaining: {rows_remaining}")


100%|████████████████████████████████████████████████████████████████████████| 389161/389161 [06:05<00:00, 1063.67it/s]


Rows removed: 10528
Rows remaining: 378633
