In [1]:
import pandas as pd
import wikipediaapi as wiki
from imdb import IMDb
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time
import glob
import os

In [2]:
movie_df = pd.read_csv("../data/TMDB_movie_dataset_v11.csv")

In [3]:
movie_df = movie_df.drop(["id", "vote_average", "vote_count", "status", "backdrop_path", "homepage",
                "popularity", "tagline", "production_countries", "keywords", "poster_path",
                "original_title", "budget", "genres"], axis=1)
movie_df.head(5)

Unnamed: 0,title,release_date,revenue,runtime,adult,imdb_id,original_language,overview,production_companies,spoken_languages
0,Inception,2010-07-15,825532764,148,False,tt1375666,en,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili"
1,Interstellar,2014-11-05,701729206,169,False,tt0816692,en,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English
2,The Dark Knight,2008-07-16,1004558444,152,False,tt0468569,en,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin"
3,Avatar,2009-12-15,2923706026,162,False,tt0499549,en,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish"
4,The Avengers,2012-04-25,1518815515,143,False,tt0848228,en,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian"


In [4]:
movie_df = movie_df[movie_df['adult'] == False].drop(['adult'], axis=1)

In [5]:
movie_df = movie_df.dropna(subset=['imdb_id', 'release_date'])

In [6]:
language_mapping = {
    'en': 'English', 'ko': 'Korean', 'fr': 'French', 'ja': 'Japanese',
    'it': 'Italian', 'es': 'Spanish', 'pl': 'Polish', 'pt': 'Portuguese',
    'hi': 'Hindi', 'tr': 'Turkish', 'da': 'Danish', 'de': 'German',
    'cn': 'Chinese', 'id': 'Indonesian', 'zh': 'Chinese', 'sv': 'Swedish',
    'el': 'Greek', 'ru': 'Russian', 'sr': 'Serbian', 'fa': 'Persian',
    'th': 'Thai', 'ar': 'Arabic', 'no': 'Norwegian', 'nb': 'Norwegian Bokmål',
    'fi': 'Finnish', 'te': 'Telugu', 'la': 'Latin', 'nl': 'Dutch',
    'hu': 'Hungarian', 'he': 'Hebrew', 'is': 'Icelandic', 'ro': 'Romanian',
    'gl': 'Galician', 'uk': 'Ukrainian', 'eu': 'Basque', 'et': 'Estonian',
    'bs': 'Bosnian', 'bn': 'Bengali', 'xx': 'No Language', 'sh': 'Serbo-Croatian',
    'km': 'Khmer', 'cs': 'Czech', 'tn': 'Tswana', 'ml': 'Malayalam',
    'mk': 'Macedonian', 'ga': 'Irish', 'hy': 'Armenian', 'ku': 'Kurdish',
    'ka': 'Georgian', 'ta': 'Tamil', 'kn': 'Kannada', 'tl': 'Tagalog',
    'vi': 'Vietnamese', 'ca': 'Catalan', 'dz': 'Dzongkha', 'sw': 'Swahili',
    'kk': 'Kazakh', 'wo': 'Wolof', 'sk': 'Slovak', 'lv': 'Latvian',
    'mi': 'Maori', 'bo': 'Tibetan', 'mn': 'Mongolian', 'ps': 'Pashto',
    'lt': 'Lithuanian', 'ur': 'Urdu', 'sl': 'Slovenian', 'sc': 'Sardinian',
    'af': 'Afrikaans', 'hr': 'Croatian', 'se': 'Northern Sami', 'iu': 'Inuktitut',
    'ms': 'Malay', 'bm': 'Bambara', 'mr': 'Marathi', 'bg': 'Bulgarian',
    'lo': 'Lao', 'am': 'Amharic', 'cy': 'Welsh', 'xh': 'Xhosa',
    'qu': 'Quechua', 'yi': 'Yiddish', 'yo': 'Yoruba', 'pa': 'Punjabi',
    'sq': 'Albanian', 'eo': 'Esperanto', 'gu': 'Gujarati', 'zu': 'Zulu',
    'st': 'Southern Sotho', 'ne': 'Nepali', 'ak': 'Akan', 'mt': 'Maltese',
    'rw': 'Kinyarwanda', 'ay': 'Aymara', 'ln': 'Lingala', 'as': 'Assamese',
    'si': 'Sinhala', 'mo': 'Moldavian', 'ff': 'Fulah', 'so': 'Somali',
    'ky': 'Kyrgyz', 'ik': 'Inupiaq', 'az': 'Azerbaijani', 'kl': 'Kalaallisut',
    'jv': 'Javanese', 'fo': 'Faroese', 'li': 'Limburgish', 'sn': 'Shona',
    'tg': 'Tajik', 'ks': 'Kashmiri', 'my': 'Burmese', 'su': 'Sundanese',
    'lb': 'Luxembourgish', 'ht': 'Haitian', 'ha': 'Hausa', 'sa': 'Sanskrit',
    'rm': 'Romansh', 'sm': 'Samoan', 'tk': 'Turkmen', 'ab': 'Abkhazian',
    'fy': 'Frisian', 'be': 'Belarusian', 'gd': 'Scottish Gaelic', 'om': 'Oromo',
    'or': 'Oriya', 'ny': 'Chichewa', 'uz': 'Uzbek', 'cr': 'Cree',
    'mg': 'Malagasy', 'gn': 'Guarani', 'tw': 'Twi', 'mh': 'Marshallese',
    'co': 'Corsican', 'ig': 'Igbo', 'os': 'Ossetic', 'nv': 'Navajo',
    'tt': 'Tatar', 'dv': 'Divehi', 'nn': 'Norwegian Nynorsk', 'sg': 'Sango',
    'nd': 'North Ndebele', 'bi': 'Bislama', 'ug': 'Uighur', 'kw': 'Cornish',
    'lg': 'Ganda', 'ti': 'Tigrinya', 'ty': 'Tahitian', 'ba': 'Bashkir',
    'kg': 'Kongo', 'nr': 'South Ndebele', 'ie': 'Interlingue', 'ce': 'Chechen',
    'sd': 'Sindhi', 'to': 'Tonga', 'cv': 'Chuvash', 'ss': 'Swati',
    'ki': 'Kikuyu', 'oj': 'Ojibwa', 'ia': 'Interlingua', 'oc': 'Occitan',
    'ch': 'Chamorro', 'fj': 'Fijian', 'gv': 'Manx', 'ii': 'Sichuan Yi'
}

In [7]:
movie_df['original_language'] = movie_df['original_language'].map(language_mapping)
movie_df.head(5)

Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages
0,Inception,2010-07-15,825532764,148,tt1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili"
1,Interstellar,2014-11-05,701729206,169,tt0816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English
2,The Dark Knight,2008-07-16,1004558444,152,tt0468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin"
3,Avatar,2009-12-15,2923706026,162,tt0499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish"
4,The Avengers,2012-04-25,1518815515,143,tt0848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian"


In [8]:
def get_movie_info(imdb_id):
    try:
        ia = IMDb()
        movie = ia.get_movie(imdb_id[2:])
        
        plot_summary = movie.get('plot outline', None)
        if not plot_summary and 'plot' in movie and movie['plot']:
            plot_summary = movie['plot'][0] if isinstance(movie['plot'], list) else movie['plot']
        
        plot_synopsis = movie.get('synopsis', None)
        if isinstance(plot_synopsis, list) and len(plot_synopsis) > 0:
            plot_synopsis = plot_synopsis[0]
        elif isinstance(plot_synopsis, list) and len(plot_synopsis) == 0:
            plot_synopsis = None
        
        genres = movie.get('genres', [])
        genres_str = ', '.join(genres) if genres else None
        
        cast = movie.get('cast', [])
        cast_names = [person['name'] for person in cast]
        cast_str = ', '.join(cast_names) if cast_names else None
        
        directors = movie.get('director', [])
        director_names = [person['name'] for person in directors]
        director_str = ', '.join(director_names) if director_names else None
        
        
        return imdb_id, plot_summary, plot_synopsis, genres_str, cast_str, director_str
    except Exception as e:
        print(f"Error fetching movie info for {imdb_id}: {e}")
        return None

In [9]:
def parallel_apply(df, func, num_workers=16):
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(tqdm(executor.map(func, df['imdb_id']), total=len(df), desc="Processing movies"))
    return pd.DataFrame(results, columns=['imdb_id', 'plot_summary', 'plot_synopsis', 'genres', 'cast', 'directors'])


In [10]:
movie_df = movie_df.drop_duplicates(subset=['imdb_id'])
movie_df

Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages
0,Inception,2010-07-15,825532764,148,tt1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili"
1,Interstellar,2014-11-05,701729206,169,tt0816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English
2,The Dark Knight,2008-07-16,1004558444,152,tt0468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin"
3,Avatar,2009-12-15,2923706026,162,tt0499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish"
4,The Avengers,2012-04-25,1518815515,143,tt0848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian"
...,...,...,...,...,...,...,...,...,...
1038580,A Good Game: Karloff and Lugosi at Universal,2019-06-18,0,82,tt10482492,English,Documentary covering the relationship of Boris...,,
1038582,River City Drumbeat,2019-11-12,0,95,tt11523094,English,"For 30 years, Ed “Nardie” White has dedicated ...",,English
1038589,So You Like the Neighborhood,2018-04-21,0,17,tt7115040,English,When a girl finds out she is being evicted fro...,,English
1038603,Den siste bohem,2014-04-01,0,47,tt3839972,Norwegian,The last bohemian is a portrait of the Norwegi...,Antipode Films,Norwegian


In [138]:
def process_in_chunks(dataframe, parallel_func, func, output_name, chunk_size, start_chunk=0, pause_duration=300):
    total_rows = dataframe.shape[0]
    total_chunks = (total_rows - 1) // chunk_size + 1

    
    for chunk_index in range(start_chunk, total_chunks):
        start_row = chunk_index * chunk_size
        end_row = min(start_row + chunk_size, total_rows)
        df_chunk = dataframe.iloc[start_row:end_row]

        print(f"Processing chunk {chunk_index + 1}/{total_chunks}...")

        result_df = parallel_func(df_chunk, func)      

        filename = f'{output_name}_{chunk_index + 1}.csv'
        result_df.to_csv(filename, index=False)
        print(f"Chunk {chunk_index + 1} written to {filename}")

        if chunk_index < total_chunks - 1:
            print(f"Pausing for {pause_duration} seconds...")
            time.sleep(pause_duration)

In [None]:
process_in_chunks(movie_df, parallel_apply, get_movie_info, 'output_chunk', chunk_size=1000, start_chunk=0, pause_duration=180)

In [13]:
df = pd.read_csv("../data/title.ratings.tsv", sep="\t")

In [14]:
df = df[df['numVotes'] > 5000]
df

Unnamed: 0,tconst,averageRating,numVotes
9,tt0000010,6.8,7571
11,tt0000012,7.4,12893
13,tt0000014,7.1,5859
300,tt0000417,8.1,55907
310,tt0000439,7.3,21037
...,...,...,...
1437787,tt9900782,8.4,42574
1437949,tt9906260,9.7,127298
1437979,tt9907782,6.2,16874
1438250,tt9913754,7.3,5387


In [None]:
process_in_chunks(movie_df, chunk_size=1000, start_chunk=0, pause_duration=300)

In [15]:
movie_df = movie_df[movie_df['imdb_id'].isin(df['tconst'])].dropna(subset=['overview'])
movie_df

Unnamed: 0,title,release_date,revenue,runtime,imdb_id,original_language,overview,production_companies,spoken_languages
0,Inception,2010-07-15,825532764,148,tt1375666,English,"Cobb, a skilled thief who commits corporate es...","Legendary Pictures, Syncopy, Warner Bros. Pict...","English, French, Japanese, Swahili"
1,Interstellar,2014-11-05,701729206,169,tt0816692,English,The adventures of a group of explorers who mak...,"Legendary Pictures, Syncopy, Lynda Obst Produc...",English
2,The Dark Knight,2008-07-16,1004558444,152,tt0468569,English,Batman raises the stakes in his war on crime. ...,"DC Comics, Legendary Pictures, Syncopy, Isobel...","English, Mandarin"
3,Avatar,2009-12-15,2923706026,162,tt0499549,English,"In the 22nd century, a paraplegic Marine is di...","Dune Entertainment, Lightstorm Entertainment, ...","English, Spanish"
4,The Avengers,2012-04-25,1518815515,143,tt0848228,English,When an unexpected enemy emerges and threatens...,Marvel Studios,"English, Hindi, Russian"
...,...,...,...,...,...,...,...,...,...
1018754,Dark Harvest,2023-10-13,0,0,tt9204328,English,"In a cursed town, the annual harvest becomes a...","Matt Tolmach Productions, Metro-Goldwyn-Mayer",English
1021245,Mean Girls Musical,2024-01-12,0,0,tt11762114,English,"An adaptation of Tina Fey, Jeff Richmond, and ...","Paramount, Little Stranger, Broadway Video",English
1026705,Eileen,2023-12-01,0,98,tt5198890,English,"In 1960s Massachusetts, an unhappy prison secr...","Likely Story, Film4 Productions, Omniscient Pr...",English
1030328,The Hunger Games: The Ballad of Songbirds & Sn...,2023-11-15,0,157,tt10545296,English,64 years before he becomes the tyrannical pres...,"Lionsgate, Color Force, Good Universe",English


In [16]:
def concatenate_chunks(chunk_files_pattern):
    """
    Concatenate all CSV files matching the given pattern into a single DataFrame.
    
    Parameters:
    - chunk_files_pattern: str, pattern to match the chunk files (e.g., 'output_chunk_*.csv')
    
    Returns:
    - pd.DataFrame: concatenated DataFrame
    """
    chunk_files = glob.glob(chunk_files_pattern)
    
    df_list = []
    
    for file in chunk_files:
        df = pd.read_csv(file)
        df_list.append(df)
    
    concatenated_df = pd.concat(df_list, ignore_index=True)
    
    return concatenated_df


In [None]:
def delete_chunk_files(chunk_files_pattern):
    """
    Delete all files matching the given pattern.
    
    Parameters:
    - chunk_files_pattern: str, pattern to match the chunk files (e.g., 'output_chunk_*.csv')
    """
    chunk_files = glob.glob(chunk_files_pattern)
    for file in chunk_files:
        os.remove(file)


In [None]:
chunk_files_pattern = 'output_chunk_*.csv'
scraped_df = concatenate_chunks(chunk_files_pattern)
delete_chunk_files(chunk_files_pattern)

In [23]:
movie_df = movie_df.merge(scraped_df, on='imdb_id').merge(df, left_on='imdb_id', right_on='tconst').drop(columns=['tconst', 'rating'])

In [25]:
movie_df.isna().sum()

title                      0
release_date               0
revenue                    0
runtime                    0
imdb_id                    0
original_language          0
overview                   0
production_companies     373
spoken_languages          59
plot_summary               0
plot_synopsis           7896
genres                     0
cast                      33
directors                 61
averageRating              0
numVotes                   0
dtype: int64

In [141]:
def extract_section_text(section):
    text = ""
    for subsection in section.sections:
        text += f"{subsection.title}\n{subsection.text}\n"
        text += extract_section_text(subsection)
    return text


wiki_wiki = wiki.Wikipedia("MyBot/1.0 (email@yahoo.com)", 'en')


def get_wikipedia_plot(args):
    imdb_id, title, release_date = args
    year = release_date[:4]
    search_terms = [
        f"{title} ({year} film)",
        f"{title} (film)",
        title
    ]
    
    for term in search_terms:
        try:
            page = wiki_wiki.page(term)
            if page.exists():
                for section in page.sections:
                    if 'plot' in section.title.lower() or 'synopsis' in section.title.lower():
                        return imdb_id, section.text + extract_section_text(section)
        except Exception as e:
            print(f"Error retrieving page for {term}: {e}")
    return imdb_id, None

In [136]:
def wiki_parallel_apply(df, func, num_workers=16):
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        args = list(zip(df['imdb_id'], df['title'], df['release_date']))
        results = list(tqdm(executor.map(func, args), total=len(df), desc="Processing movies"))
    return pd.DataFrame(results, columns=['imdb_id', 'wiki_plot'])

In [None]:
process_in_chunks(movie_df, wiki_parallel_apply, get_wikipedia_plot, 'wiki_output_chunk', chunk_size=600, start_chunk=24, pause_duration=120)

In [157]:
chunk_files_pattern = 'wiki_output_chunk_*.csv'
wiki_df = concatenate_chunks(chunk_files_pattern)
delete_chunk_files(chunk_files_pattern)

In [161]:
wiki_df

Unnamed: 0,imdb_id,wiki_plot
0,tt1076252,The unpopular Mandy Gilbert lives with her str...
1,tt3106120,Amy (Danielle Harris) prepares to leave her jo...
2,tt2978102,Nick Barrow plans and sells heists to the high...
3,tt7298400,"In 2001, Qiao and her boyfriend Bin, a mob bos..."
4,tt0449061,
...,...,...
17788,tt0251031,The film is shot in the style of a reality TV ...
17789,tt0053183,"In April 1953, during the Korean War, K Compan..."
17790,tt0061122,"The movie is set at St. Francis Academy, a fic..."
17791,tt0396688,


In [174]:
movie_df = movie_df.merge(wiki_df, on="imdb_id")

In [180]:
movie_df.to_csv("../data/data.csv", index=False)