In [1]:
import joblib
import os
import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import LabelEncoder

# Choose which CSV to load: prefer the synopsis-enriched file when it is
# present, otherwise fall back to the draft dataset.
# os.path.isfile is used instead of scanning os.listdir('../data') so that a
# missing ../data directory does not abort here with an unexplained error;
# the read below then fails and prints the download hint instead.
if os.path.isfile('../data/filtered_df_synopsis.csv'):
    print("Partial dataset found!")
    # use partial or full dataset
    df_file_name = 'filtered_df_synopsis.csv'
else:
    # use draft dataset
    df_file_name = 'filtered_df.csv'

print(f"Using {df_file_name} as dataset.")

try:
    df = pd.read_csv(f'../data/{df_file_name}', low_memory=False)
except FileNotFoundError:
    print("File not found. Download the IMDB_Dataset.")
    # Bare raise re-raises the active exception unchanged.
    raise

Partial dataset found!
Using filtered_df_synopsis.csv as dataset.


In [2]:
# Show every column, and at most 100 rows, whenever a frame is displayed.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)

# List the column names one per line, then emit a trailing blank line.
for column_name in df.columns:
    print(column_name)
print()
# df.shape

Best Picture
Certificate (GB)
synopsis
Certificate (US)
Genres (1st)
Genres (2nd)
Genres (3rd)
Genres (full list)
Image Url (Title)
IMDB Url (title)
Plot
Plot (medium)
Production Companies (1st)
Production Companies (2nd)
Production Companies (3rd)
Production Companies (List)
Tagline
Title
Title Id
What did they do ?
Year of Release
IMDB Rating
Number Of Votes
Runtime (Minutes)
Lead Actors



In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_imdb_synopsis(driver, movie_url) -> str:
    """Fetch the synopsis text from a title's IMDb plot-summary page.

    Args:
        driver: Selenium WebDriver used to load the page.
        movie_url: URL of the IMDb title page, e.g.
            "https://www.imdb.com/title/tt0084904/". The trailing slash is
            optional.

    Returns:
        The text of the last element with class ``ipc-html-content-inner-div``
        on the plot-summary page, or the sentinel string
        "No synopsis divs found." when waiting for those elements fails
        (for example because none appear within 10 seconds) or the result
        is empty.
    """
    # Normalise the trailing slash so "…/tt123" and "…/tt123/" both produce a
    # valid plot-summary URL.
    driver.get(movie_url.rstrip("/") + "/plotsummary/?ref_=tt_stry_pl")
    try:
        # Wait for the synopsis content to load
        synopsis_divs = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'ipc-html-content-inner-div'))
        )
    except Exception as e:
        # Bind the exception so it can actually be reported; the previous bare
        # ``except:`` referenced an undefined ``e`` and also swallowed
        # KeyboardInterrupt.
        print(e)
        print("No synopsis divs found.")
        return "No synopsis divs found."

    # Extract and return the text of the last synopsis div
    return synopsis_divs[-1].text if synopsis_divs else "No synopsis divs found."

In [4]:
from datetime import datetime

driver = webdriver.Firefox()

# The placeholder values written by this notebook -- "null" and
# "No synopsis divs found." -- are at most 23 characters long. Anything
# longer is treated as a synopsis that was already scraped.
PLACEHOLDER_MAX_LEN = 23

# filtered_df.csv doesn't have a synopsis column, so create it pre-filled
# with the "null" placeholder.
if "synopsis" in df.columns:
    print("synopsis column exists")
else:
    df.insert(loc=2, column="synopsis", value="null")
    print("synopsis column NOT exists")


# Walk the rows and scrape a synopsis for every row that still holds a
# placeholder. The outer try/finally makes sure the browser is closed even
# when the loop is interrupted.
try:
    for index, row in df.iterrows():
        try:
            print(f"{index}. getting synopsis for {row['Title']}")
            current_synopsis = row["synopsis"]
            print("current synopsis: ", repr(current_synopsis))
            # Empty CSV cells are read back as float NaN, not as strings;
            # treat any non-string value as "not scraped yet".
            if not isinstance(current_synopsis, str):
                current_synopsis = "null"

            if len(current_synopsis) <= PLACEHOLDER_MAX_LEN:
                movie_url = row["IMDB Url (title)"]

                # Calling the function with the link
                synopsis = get_imdb_synopsis(driver=driver, movie_url=movie_url)
                # Write through df.at: `row` is a copy, so assigning to it
                # would not update the dataframe.
                df.at[index, "synopsis"] = synopsis
                print("new synopsis: ", synopsis)
            else:
                print("synopsis column filled already")
        except Exception as e:
            print("Exception...........")
            # Report what went wrong instead of silently discarding it.
            print(f"{type(e).__name__}: {e}")
            # Save a snapshot for recovery. The timestamp avoids spaces and
            # colons, which str(datetime.now()) contains and which are not
            # valid in file names on Windows.
            snapshot_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
            df.to_csv(f'../data/filtered_df_synopsis_{snapshot_stamp}.csv', index=False)
finally:
    # Always release the browser process.
    driver.quit()

synopsis column exists
0. getting synopsis for Cocoon: The Return
current synopsis:  "Art, Ben and Joe are back! So are their wives and good friend Bernie in their first adventure since their last! Five years since the senior citizens blasted off into space with the Antareans return to earth because their alien friends have to collect the rest of the cocoons in the ocean, believed to be in danger from an earthquake. Ben and Mary visit their family, while Art and Joe visit Bernie, who's still hangin' on. Art, Ben and Joe had forgotten what it was like on earth and immediately begin to feel their weaknesses, except for Art's wife who's pregnant! Meanwhile in the ocean, a biologist company snatched a cocoon out of the ocean and are doing research on it...\n—Dylan Self <robocoptng986127@aol.com>"
synopsis column filled already
1. getting synopsis for Not Another Teen Movie
current synopsis:  "After breaking up with his girlfriend, Priscilla, a popular jock, Jake Wyler makes a bet with his 

In [None]:
from datetime import datetime

# Save a timestamped snapshot of the dataframe for recovery.
# The timestamp is formatted explicitly: str(datetime.now()) contains spaces
# and colons, and colons are not valid in file names on Windows.
snapshot_stamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')
df.to_csv(f'../data/filtered_df_synopsis_{snapshot_stamp}.csv', index=False)

In [None]:
# Count the missing values in each column and display the result.
null_counts = df.isna().sum(axis=0)
null_counts

Best Picture                   17707
Certificate (GB)                3407
synopsis                       17751
Certificate (US)                1466
Genres (1st)                       1
Genres (2nd)                    2842
Genres (3rd)                    8254
Genres (full list)                 1
Image Url (Title)                  1
IMDB Url (title)                   0
Plot                               2
Plot (medium)                   2943
Production Companies (1st)       283
Production Companies (2nd)      3777
Production Companies (3rd)      8121
Production Companies (List)      283
Tagline                         4016
Title                              0
Title Id                           0
What did they do ?                 0
Year of Release                    0
IMDB Rating                        0
Number Of Votes                    0
Runtime (Minutes)                  0
Lead Actors                        0
dtype: int64

In [None]:
# Preview the first 20 rows after ordering by synopsis text, descending.
(
    df
    .sort_values("synopsis", ascending=False)
    .head(n=20)
)

Unnamed: 0,Best Picture,Certificate (GB),synopsis,Certificate (US),Genres (1st),Genres (2nd),Genres (3rd),Genres (full list),Image Url (Title),IMDB Url (title),Plot,Plot (medium),Production Companies (1st),Production Companies (2nd),Production Companies (3rd),Production Companies (List),Tagline,Title,Title Id,What did they do ?,Year of Release,IMDB Rating,Number Of Votes,Runtime (Minutes),Lead Actors
43,,15,Zoro is New York's city's hottest and most elu...,R,Drama,Music,,"Drama,Music",https://m.media-amazon.com/images/M/MV5BMzljMT...,https://www.imdb.com/title/tt0084904/,South Bronx graffiti artist Zoro is commission...,"Universally hailed as the first hip-hop movie,...",Wild Style,,,Wild Style; ;,Break Dancing Graffiti Rap,Wild Style,tt0084904,actor,1982,7.0,2835.0,82.0,"['A.J. Scratch', 'Carlos Morales', 'Dot-a-Rock..."
115,,15,"Written and directed by Shana Feste, in her fe...",R,Drama,Romance,,"Drama,Romance",https://m.media-amazon.com/images/M/MV5BMTgxNj...,https://www.imdb.com/title/tt1226232/,A drama that is centered around a troubled tee...,"Teenagers Rose and Bennett were in love, and t...",Barbarian Films,Oceana Media Finance,Silverwood Films,Barbarian Films; Oceana Media Finance; Silverw...,,The Greatest,tt1226232,actor,2009,6.6,9303.0,99.0,"['Aaron Taylor-Johnson', 'Alexander Flores', '..."
110,,15,Women are dancing. The same obese women are on...,R,Drama,Thriller,,"Drama,Thriller",https://m.media-amazon.com/images/M/MV5BMTYwMz...,https://www.imdb.com/title/tt4550098/,A wealthy art gallery owner is haunted by her ...,"A ""story inside a story,"" in which the first p...",Focus Features,Fade to Black Productions,Artina Films,Focus Features; Fade to Black Productions; Art...,When you love someone you can't just throw it ...,Nocturnal Animals,tt4550098,actor,2016,7.5,293675.0,116.0,"['Aaron Taylor-Johnson', 'Franco Vega', 'Neil ..."
196,,15,"Wisconsin Death Trip is an intimate, shocking ...",Not Rated,Biography,Crime,Drama,"Biography,Crime,Drama,History",https://m.media-amazon.com/images/M/MV5BMTM2OD...,https://www.imdb.com/title/tt0210389/,A series of grisly events that took place in t...,"Wisconsin Death Trip is an intimate, shocking ...",BBC Arena,British Broadcasting Corporation (BBC),Cinemax Reel Life,BBC Arena; British Broadcasting Corporation (B...,,Wisconsin Death Trip,tt0210389,actor,1999,6.6,1566.0,76.0,"['Clay Anton', 'Eddie Kunz', 'Michael Olson', ..."
182,,15,Willis Embry is a psychologist who works at a ...,PG-13,Comedy,Crime,Romance,"Comedy,Crime,Romance",https://m.media-amazon.com/images/M/MV5BMTA2Mz...,https://www.imdb.com/title/tt0105573/,A dying prisoner tells the prison therapist ab...,Willis Embry is a jail psychologist whose girl...,Kings Road Entertainment,Paramount Pictures,,Kings Road Entertainment; Paramount Pictures;,A Hot Comedy About Cold Cash.,There Goes the Neighborhood,tt0105573,actor,1992,5.7,1214.0,88.0,"['Alan Gelfant', 'Bo Sharon', 'Harris Yulin', ..."
163,,A,"Williamsburg, Brooklyn, New York City, 1944. R...",PG,Drama,,,Drama,https://m.media-amazon.com/images/M/MV5BOWNmNG...,https://www.imdb.com/title/tt0082175/,"In 1944, in Brooklyn, two Jewish kids become f...","Brooklyn 1944. Despite being the same age, hav...",Chosen Film Company,,,Chosen Film Company; ;,One boy. Two worlds. A time to choose.,The Chosen,tt0082175,actor,1981,7.2,2323.0,108.0,"['Abraham Katz', 'Bruce MacVittie', 'Douglas W..."
13,,15,While working on a documentary on his old neig...,R,Comedy,,,Comedy,https://m.media-amazon.com/images/M/MV5BMzJlOW...,https://www.imdb.com/title/tt0117577/,Les is making a film about his old neighborhoo...,While working on a documentary on his old neig...,Orenda Films,,,Orenda Films; ;,With this search team... pray you don't get lost.,The Search for One-eye Jimmy,tt0117577,actor,1994,6.0,1809.0,84.0,"[""'Stretch' Merced"", 'Michael Louis Wells', 'A..."
51,,15,While staying at a remote cabin for a week-lon...,R,Horror,Sci-Fi,Thriller,"Horror,Sci-Fi,Thriller",https://m.media-amazon.com/images/M/MV5BM2NkZm...,https://www.imdb.com/title/tt3832096/,While staying at a remote cabin for a week-lon...,,Contend,Armory Films,Pelican Point Media,Contend; Armory Films; Pelican Point Media,You can't run from what's inside.,Cabin Fever,tt3832096,actor,2016,3.7,10705.0,99.0,"['Aaron Trainor', 'Bill Terrell', 'Donald Lee ..."
188,,18,When young and successful reporter Jamie finds...,TV-MA,Drama,Horror,Mystery,"Drama,Horror,Mystery,Thriller",https://m.media-amazon.com/images/M/MV5BMjIxMz...,https://www.imdb.com/title/tt3894404/,When young and successful reporter Jamie finds...,When young and successful reporter Jamie finds...,Boku Films Pte Limited,Boku Films,PEP Pictures,Boku Films Pte Limited; Boku Films; PEP Pictures,The chosen will be sacrificed.,The Faith of Anna Waters,tt3894404,actor,2016,3.8,2068.0,95.0,"['Adrian Pang', 'Crispian Chan', 'Colin Borgon..."
107,,15,"When jaded teens Jim, Eva, Emily and Mo meet W...",R,Drama,Horror,Thriller,"Drama,Horror,Thriller",https://m.media-amazon.com/images/M/MV5BNWQ5Zj...,https://www.imdb.com/title/tt1319704/,Five teenagers with different personalities ar...,,Ruby Films,Film4,UK Film Council,Ruby Films; Film4; UK Film Council,Control. Alter. Delete.,Chatroom,tt1319704,actor,2010,5.4,9473.0,97.0,"['Aaron Taylor-Johnson', 'Gerald Home', 'Alex ..."


In [None]:
# Write the dataframe back to the main synopsis CSV, without the index column.
df.to_csv(path_or_buf='../data/filtered_df_synopsis.csv', index=False)