# Movie Plots Until 2017

In [None]:
import pandas as pd
# Read the Wikipedia movie-plots CSV into a DataFrame and preview its first five rows.
movies_df = pd.read_csv(filepath_or_buffer='data/wiki_movie_plots_deduped.csv')
movies_df.head(n=5)

In [None]:
# Keep only the rows whose 'Release Year' is strictly before 2017.
movies_df.loc[movies_df['Release Year'].lt(2017)]

In [None]:
def filter_by_exact_word(df, column_name, search_word):
    """
    Filters the DataFrame by rows where the specified column contains the exact search word.

    The search word is treated as literal text (regex metacharacters are escaped) and is
    matched case-insensitively. A match must not be directly preceded or followed by a
    word character, so 'Avengers' matches 'The Avengers' but not 'Avengerss'.
    Missing values in the column never match.

    Parameters:
    df (pd.DataFrame): The DataFrame to filter.
    column_name (str): The name of the column to search.
    search_word (str): The exact word to search for.

    Returns:
    pd.DataFrame: The filtered DataFrame (no rows when search_word is empty).
    """
    import re

    # An empty word cannot be matched as a whole word.
    if not search_word:
        return df.iloc[0:0]

    def _edge(char):
        # Next to a word character, \b demands a non-word neighbour (or the string edge).
        # Next to punctuation, \B expresses the same requirement; \b there would
        # wrongly demand a word character as the neighbour.
        return r'\b' if re.match(r'\w', char) else r'\B'

    # Escape the word so characters such as '.', '+' or '(' are matched literally.
    search_pattern = _edge(search_word[0]) + re.escape(search_word) + _edge(search_word[-1])

    # Filter the DataFrame
    matches = df[column_name].str.contains(search_pattern, case=False, na=False, regex=True)
    return df[matches]

# Example: find every movie whose title contains the whole word "Avengers"
search_word = 'Avengers'
result_df = filter_by_exact_word(
    df=movies_df,
    column_name='Title',
    search_word=search_word,
)

result_df

# IMDb Web Scraping with BeautifulSoup

## Search

In [None]:
import requests
import re
from bs4 import BeautifulSoup

search_term = 'Thor'

# Escape special characters in the search term (not necessary here but good practice)
escaped_search_term = re.escape(search_term)

# Create a regex pattern to match the whole word "Thor"
# NOTE(review): this pattern is not used by the request or the parsing below.
pattern = rf'\b{escaped_search_term}\b'

# Define the search endpoint; the query itself is passed via `params` so that
# requests URL-encodes it (spaces, '&', '#', non-ASCII characters, ...).
url = 'https://www.imdb.com/find/'

# Define headers including a User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Make an HTTP request to the URL with headers.
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, params={'q': search_term}, headers=headers, timeout=10)
response.raise_for_status()  # Raise an exception for HTTP errors

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Print out the parsed HTML to inspect it (optional)
# print(soup.prettify())

# Find all <a> tags with the class 'ipc-metadata-list-summary-item__t'
links = soup.find_all('a', class_='ipc-metadata-list-summary-item__t')

# Extract titles and their corresponding title codes
titles_and_codes = []
for link in links:
    title = link.get_text()
    href = link.get('href')

    # Extract the title code from the href attribute:
    # '/title/<code>/...' splits into ['', 'title', '<code>', ...]
    if href and href.startswith('/title/'):
        title_code = href.split('/')[2]
    else:
        title_code = None

    titles_and_codes.append((title, title_code))

# Print the results
for title, code in titles_and_codes:
    print(f'Title: {title}, Code: {code}')

## Title Display

In [None]:
import requests
from bs4 import BeautifulSoup


title_code = 'tt0200211'

# Define the URL (use the actual URL if you want to fetch live data)
url = f'https://www.imdb.com/title/{title_code}/'

# Define headers including a User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Make an HTTP request to the URL with headers.
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()  # Raise an exception for HTTP errors

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Extract the title from the <title> tag
title_tag = soup.find('title')
title = title_tag.get_text() if title_tag else 'No title found'

# Extract the description from the <meta> tag with name='description'
meta_description_tag = soup.find('meta', attrs={'name': 'description'})
description = meta_description_tag.get('content') if meta_description_tag else 'No description found'

# Print the extracted title and description
print('Title:', title)
print('Description:', description)


## Wikipedia Page Scraping

In [None]:
import requests
from bs4 import BeautifulSoup

def get_film_details(title):
    """
    Scrape a film's English Wikipedia article and return a formatted text summary.

    Parameters:
    title (str): Article title; spaces are replaced with underscores to build the URL.

    Returns:
    str: A multi-section text block (title, extract, plot, actors, year, genre), or a
         short error message when the page cannot be fetched or has no infobox.
         Sections that cannot be located are reported as "Not Available".
    """
    base_url = "https://en.wikipedia.org/wiki/"
    url = base_url + title.replace(" ", "_")

    # Fetch the content from the Wikipedia page (timeout so the call cannot hang forever)
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return "Failed to retrieve page."

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the infobox
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return "Infobox not found."

    # Extract data from the infobox: one entry per row that has both a header and a value cell
    details = {}
    for row in infobox.find_all('tr'):
        header = row.find('th')
        data = row.find('td')
        if header and data:
            details[header.get_text(strip=True)] = data.get_text(strip=True)

    # Extract the plot: paragraphs between the "Plot" heading and the next <h2> heading.
    # Headings are included in the scan so the loop can stop at the next section
    # instead of swallowing every paragraph down to the end of the page.
    plot_parts = []
    plot_section = soup.find('h2', id='Plot')
    if plot_section is not None:
        for element in plot_section.find_all_next(['h2', 'p']):
            if element.name == 'h2':
                break
            plot_parts.append(element.get_text())
    plot = ''.join(plot_parts) or "Not Available"

    # Extract actors from the first list after the "Cast" heading.
    # Looking the id up on any tag covers pages that put it on a <span> as well as
    # pages that put it on the heading element itself.
    actors = "Not Available"
    cast_section = soup.find(id='Cast')
    if cast_section is not None:
        cast_content = cast_section.find_next('ul')
        if cast_content is not None:
            actors = ', '.join(li.get_text(strip=True) for li in cast_content.find_all('li'))

    # Extract year and genre
    year = details.get('Release date', 'Not Available')
    genre = details.get('Genre', 'Not Available')

    return (f"Title: {title}\n\n"
            f"Extract:\n{details.get('Plot', 'Not Available')}\n\n"
            f"Plot:\n{plot}\n\n"
            f"Actors:\n{actors}\n\n"
            f"Year:\n{year}\n\n"
            f"Genre:\n{genre}")

# Example usage: build the summary for one film, then print it
film_title = "Thor:_Love_and_Thunder"  # Replace with the film you are interested in
film_summary = get_film_details(film_title)
print(film_summary)


In [None]:
from bs4 import BeautifulSoup

# Parse the HTML
# soup = BeautifulSoup(html_content, 'html.parser')

title = "The_Godfather"

base_url = "https://en.wikipedia.org/wiki/"
url = base_url + title.replace(" ", "_")

# Fetch the content from the Wikipedia page.
# Fail loudly on an HTTP error instead of going on to parse an error page,
# and use a timeout so the cell cannot hang forever.
response = requests.get(url, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Find the table carrying both the "infobox" and "vevent" classes.
# A CSS selector matches regardless of class order or additional classes,
# unlike an exact comparison against the whole class attribute string.
table = soup.select_one('table.infobox.vevent')
if table is None:
    raise RuntimeError(f"No infobox table found on {url}")

# Extract all table rows
rows = table.find_all('tr')

# Extract the text content of each row
row_data = [row.get_text(separator=' ', strip=True) for row in rows]

# Print each row content
for i, row in enumerate(row_data):
    print(f"Row {i+1}: {row}")


In [None]:
# Find the table carrying both the "infobox" and "vevent" classes
# (CSS selector: independent of class order and of additional classes)
table = soup.select_one('table.infobox.vevent')

# Extract all paragraphs that follow this table, up to the first section heading
paragraphs_after_table = []
if table is None:
    print("Infobox table not found.")
else:
    for element in table.find_all_next():
        if element.name == 'p':
            paragraphs_after_table.append(element.get_text(strip=True))
        elif element.name == 'h2' or (element.name == 'div' and element.find('h2')):
            # Stop collecting once another section starts, whether the heading
            # is a bare <h2> or an <h2> wrapped in a <div>
            break

# Print the paragraphs
for i, paragraph in enumerate(paragraphs_after_table):
    print(f"Paragraph {i+1}: {paragraph} ")

In [None]:
# Bare expression: shows the collected paragraph list when run as the last line of a notebook cell
paragraphs_after_table

# Movies TMDB Data

### Review of the dataset


In [3]:
"""
Dataset Analysis and Transformation Script

This script loads a dataset from a specified file, performs data cleaning and transformation, 
and provides descriptive statistics and optional filtering. The script handles missing values, 
splits column values, creates DataFrames for unique values, sorts data, and converts data types 
as needed.

Parameters:
- dataset_path (str): Path to the dataset file.
- file_format (str): Format of the dataset file.
- nan_fill_value (various types): Value to replace NaN values.
- drop_nan (bool): Whether to drop rows with NaN values.
- columns_to_split_values (list of str): List of columns to split.
- cell_value_separator (str): Separator for splitting cell values.
- unique_columns (list of str): List of columns to create DataFrames for unique values.
- sort_column (str): Column to sort by.
- sort_order (str): Sorting order ('ascending' or 'descending').
- sort_column_type (str): Type to convert the sort column values to.
- filter_condition (str or None): Pattern matched against the 'credits' column to keep matching rows; an empty or None value skips filtering.
"""

import pandas as pd

def load_dataset(file_path, file_format):
    """Read a dataset into a DataFrame.

    ``file_format`` selects the reader: 'csv' uses ``pd.read_csv`` and 'excel'
    uses ``pd.read_excel``. Any other value raises ``ValueError``.
    """
    # Guard clause: reject unknown formats before touching the file
    if file_format not in ('csv', 'excel'):
        raise ValueError("Unsupported file format. Use 'csv' or 'excel'.")

    reader = pd.read_csv if file_format == 'csv' else pd.read_excel
    return reader(file_path)

def clean_missing_values(df, fill_value='', drop_nan=False):
    """Return a copy of ``df`` with missing values handled.

    When ``drop_nan`` is true, rows containing any missing value are removed;
    otherwise every missing value is replaced with ``fill_value``.
    """
    return df.dropna() if drop_nan else df.fillna(fill_value)

def split_column_values(df, columns, separator):
    """Collect the distinct separator-delimited values found in each listed column.

    Parameters:
    df (pd.DataFrame): Source data; the listed columns are accessed through ``.str``.
    columns (list of str): Columns to process; names not present in ``df`` are skipped.
    separator (str): Delimiter between the individual values inside one cell.

    Returns:
    dict: Maps each processed column name to a single-column DataFrame holding its
          unique, non-empty values in order of first appearance.
    """
    split_dfs = {}
    for column in columns:
        if column not in df.columns:
            continue

        # One row per individual value. explode() avoids building the wide, padded
        # intermediate frame that split(expand=True) creates (one column per value
        # of the longest cell); dropna() removes entries for missing cells.
        values = df[column].str.split(separator).explode().dropna()

        unique_values_df = (
            values.drop_duplicates()
            .reset_index(drop=True)
            .to_frame(name=column)
        )

        # Empty strings (e.g. from blank cells) are not real values; they are removed
        # after numbering, so the resulting index may contain a gap.
        unique_values_df = unique_values_df.replace('', pd.NA).dropna()
        split_dfs[column] = unique_values_df
    return split_dfs

def get_unique_value_dfs(df, columns):
    """Build one single-column DataFrame of distinct values per requested column.

    Columns that are not present in ``df`` are skipped. Each resulting frame keeps
    the first occurrence of every value and is re-indexed from zero.
    """
    return {
        name: df[[name]].drop_duplicates().reset_index(drop=True)
        for name in columns
        if name in df.columns
    }

def sort_dataframe(df, column, order='descending', dtype='date'):
    """Return a copy of the DataFrame sorted by one column.

    Parameters:
    df (pd.DataFrame): Data to sort; it is not modified.
    column (str): Column to sort by.
    order (str): 'ascending' sorts ascending; any other value sorts descending.
    dtype (str): When 'date', the sort column is converted with ``pd.to_datetime``
                 first (unparseable values become NaT); other values leave it as is.

    Returns:
    pd.DataFrame: The sorted DataFrame, with the sort column converted when
                  ``dtype`` is 'date'.
    """
    if dtype == 'date':
        # Convert on a copy so the caller's DataFrame keeps its original column.
        df = df.copy()
        df[column] = pd.to_datetime(df[column], errors='coerce')
    return df.sort_values(by=column, ascending=(order == 'ascending'))

def filter_dataframe(df, condition, column='credits'):
    """Keep the rows whose text column contains the given pattern.

    Parameters:
    df (pd.DataFrame): Data to filter.
    condition (str): Pattern passed to ``Series.str.contains`` (regular expression
                     by default, so 'A|B' matches either alternative).
    column (str): Column to search; defaults to 'credits'.

    Returns:
    pd.DataFrame: The matching rows; rows with a missing value in ``column`` are excluded.
    """
    return df[df[column].str.contains(condition, na=False)]

def main():
    """Run the full pipeline on the movies CSV and return its outputs.

    Steps, in order: load the file, handle missing values, build the per-column
    split-value and unique-value DataFrames, sort by release date, then filter
    rows on the credits pattern.

    Returns:
        dict: 'cleaned_df' (the processed DataFrame), 'split_value_dfs' and
        'unique_value_dfs' (dicts mapping column name to DataFrame).
    """
    # --- Configuration -------------------------------------------------
    source_path, source_format = 'data/movies.csv', 'csv'
    missing_fill, drop_missing_rows = '', False
    split_columns = ['genres', 'credits', 'production_companies', 'keywords']
    value_separator = '-'
    distinct_columns = ['genres', 'credits', 'production_companies', 'keywords']
    order_by, order_direction, order_by_type = 'release_date', 'descending', 'date'
    credits_pattern = 'Woody Allen|Jack Nicholson|Natalie Portman'

    # --- Load and clean ------------------------------------------------
    frame = clean_missing_values(
        load_dataset(source_path, source_format),
        missing_fill,
        drop_missing_rows,
    )

    # --- Derived lookup tables (computed before sorting/filtering) -----
    split_tables = split_column_values(frame, split_columns, value_separator)
    distinct_tables = get_unique_value_dfs(frame, distinct_columns)

    # --- Sort, then optionally filter ----------------------------------
    frame = sort_dataframe(frame, order_by, order_direction, order_by_type)
    if credits_pattern:
        frame = filter_dataframe(frame, credits_pattern)

    # Announce the results; the caller is responsible for displaying them
    print("Results:")
    return {
        'cleaned_df': frame,
        'split_value_dfs': split_tables,
        'unique_value_dfs': distinct_tables,
    }

if __name__ == "__main__":
    # Run the pipeline, then show every result under a heading of its key
    result = main()
    for key, val in result.items():
        print(f"\n{key}:")
        if isinstance(val, pd.DataFrame):
            # A single DataFrame is displayed directly
            display(val)
            continue
        if isinstance(val, dict):
            # A dict of DataFrames gets one sub-heading per entry
            for sub_key in val:
                sub_val = val[sub_key]
                print(f"\n{sub_key}:")
                display(sub_val)


Results:

cleaned_df:


Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
506,839369,May December,Drama-Comedy,en,Twenty years after their notorious tabloid rom...,113.793,Gloria Sanchez Productions-Killer Films-Mounta...,2023-11-16,20000000.0,223505.0,117.0,Released,Some roles are transformative.,6.889,27.0,Natalie Portman-Chris Tenzis-Charles Melton-Ju...,infidelity-sexual abuse-dark comedy-statutory ...,/fRGB8l78uhPp14CAi2vtouTSQge.jpg,/97MOhHIgU6ZdLcB9DrAhx3WAqrU.jpg,1050035-921452-848439-467244-915935-1069193-99...
100,616037,Thor: Love and Thunder,Fantasy-Action-Comedy,en,After his retirement is interrupted by Gorr th...,394.087,Marvel Studios-Kevin Feige Productions,2022-07-06,250000000.0,760928081.0,119.0,Released,The one is not the only.,6.625,5488.0,Chris Hemsworth-Natalie Portman-Christian Bale...,ex-girlfriend-hero-greek mythology-sequel-supe...,/pIkRyD18kl4FhoCNQuWxWu5cBLM.jpg,/jsoz1HlxczSuTx0mDl2h0lxy36l.jpg,539681-610150-985939-629176-2-45920-438148-782...
85754,926616,Becoming Al Pacino,Documentary,fr,Between the South Bronx and (New) Hollywood a ...,2.258,ARTE-ZED,2022-02-06,0.0,0.0,52.0,Released,,7.200,8.0,Al Pacino-Sarah-Jane Sauvegrain-John Cazale-Le...,,/flkfYWUzrmWqB4XiOeOKQ2H699u.jpg,/3aABjrUwm76OwDA1nxnCWxkTfqm.jpg,507086
69542,890823,Mr. Saturday Night,Documentary-Music,en,The untold story of Robert Stigwood the impres...,2.759,HBO Documentary Films-The Ringer Films-Polygra...,2021-11-13,0.0,0.0,83.0,Released,The untold story of Robert Stigwood and the di...,6.800,4.0,John Travolta-Robert De Niro-Marlon Brando-Jac...,,/sP13duYC8DJLoLq2PEJT6oifvKF.jpg,/5esNMPWDIa67jH1Zzet1Wi1yq1r.jpg,
56421,857729,Untold: Deal with the Devil,Documentary,en,Christy Martin broke boundaries and noses as s...,3.359,Propagate Content-Stardust Frames,2021-08-17,0.0,0.0,77.0,Released,,6.795,44.0,Don King-Mike Tyson-Jack Nicholson-Jay Leno-Ge...,,/j5utOakb8IF10Q0oRiej47dttWH.jpg,/jq6wx8KIb54cP0b3c7q5ih0ebPy.jpg,520901-857497-684700-633517-14144-857732-85773...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81203,84210,The Wild Ride,Drama,en,A rebellious punk of the beat generation spend...,2.381,Harvey Berman Productions-The Filmgroup,1960-06-17,30000.0,0.0,59.0,Released,From roaring hot rods... to the racing big time,5.500,11.0,Jack Nicholson-Georgianna Carter-Robert Bean-C...,kidnapping-hot rod-dirt track racing,/amdiPg2V58lkZ07TumO9nxEmHCL.jpg,/divcsWv5iFfd6GLeSNLImmRIEUt.jpg,
102920,219080,Too Soon to Love,Drama-Romance,en,Unmarried teenage lovers turn to desperate mea...,1.911,Dynasty,1960-02-01,0.0,0.0,85.0,Released,Too young to marry... Too late to turn back...,3.000,3.0,Jennifer West-Richard Evans-Warren Parker-Ralp...,fistfight-teenage love,/4cRwUavII5kfcs0TQL3GkxGOx6L.jpg,/oLV2xJJPCFk77bCVcYBL3rOp5wh.jpg,
80093,346696,Little Shop of Horrors,Horror-Comedy-Music,en,An exotic plant in a downtown flower shop conv...,2.414,Warner Bros. Pictures-Marc Platt Productions,1960-01-01,0.0,0.0,105.0,Released,,7.400,5.0,Jonathan Haze-Jackie Joseph-Mel Welles-Dick Mi...,shop-flower shop-plant-musical-telephone-blond...,/rzVZOmTNWieNaBrB3hQeSUKTsqI.jpg,/vHXX9Ga5Ttu4yhL3PxLntCi58zS.jpg,
41089,172237,The Cry Baby Killer,Crime,en,A teenage boy panics and takes hostages when h...,4.529,Allied Artists Pictures,1958-08-15,50000.0,600000.0,62.0,Released,YESTERDAY a Teenage Rebel... TODAY a mad-dog s...,5.200,11.0,Harry Lauter-Jack Nicholson-Carolyn Mitchell-B...,corruption-revenge-murder-juvenile delinquent,/imFRJwURS0ReMUvV97g5RaFxSTC.jpg,/jXSFMBfKseAeaKmCLkH68NBUHJq.jpg,



split_value_dfs:

genres:


Unnamed: 0,genres
0,Science Fiction
1,Action
2,Adventure
3,Horror
4,Mystery
5,Thriller
6,Drama
7,Animation
8,Family
9,Comedy



credits:


Unnamed: 0,credits
0,Rebecca Hall
1,Brian Tyree Henry
2,Dan Stevens
3,Kaylee Hottle
4,Alex Ferns
...,...
1182535,Brandon Garver
1182536,Doris Champs
1182537,Dete
1182538,Ingrid Barlon



production_companies:


Unnamed: 0,production_companies
0,Legendary Pictures
1,Warner Bros. Pictures
2,Apelles Entertainment
3,di Bonaventura Pictures
4,CMC Pictures
...,...
122209,Flor de Luz
122210,Petate Films
122211,Collage Productions
122212,Karakumfilm



keywords:


Unnamed: 0,keywords
0,giant monster
1,sequel
2,dinosaur
3,kaiju
4,fantasy world
...,...
43460,patrimonio
43461,title spoken by character
43462,anti vaccination
43463,extremism



unique_value_dfs:

genres:


Unnamed: 0,genres
0,Science Fiction-Action-Adventure
1,Action-Science Fiction-Horror
2,Horror-Mystery-Thriller
3,Action-Adventure-Science Fiction
4,Science Fiction-Adventure
...,...
11042,TV Movie-Horror-Fantasy-Drama
11043,Science Fiction-Mystery-Drama-Thriller
11044,Thriller-Adventure-Drama-Fantasy-Horror
11045,Documentary-Horror-Fantasy-Thriller-Drama-Comedy



credits:


Unnamed: 0,credits
0,Rebecca Hall-Brian Tyree Henry-Dan Stevens-Kay...
1,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...
2,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...
3,Anthony Ramos-Dominique Fishback-Luna Lauren V...
4,Timothée Chalamet-Zendaya-Rebecca Ferguson-Jav...
...,...
423179,Lyudmila Ilyina-Igor Vernik
423180,Phil Deautschle
423181,Nicholas W. Wilson-Alexander Bedranowksy-Micha...
423182,Rubén Albarrán-Emmanuel del Real-Joselo Rangel...



production_companies:


Unnamed: 0,production_companies
0,Legendary Pictures-Warner Bros. Pictures
1,Apelles Entertainment-Warner Bros. Pictures-di...
2,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...
3,Skydance-Paramount-di Bonaventura Pictures-Bay...
4,Legendary Pictures
...,...
147531,Flor de Luz-Petate Films
147532,Collage Productions
147533,Karakumfilm
147534,First National Pictures-Warner Bros. Entertain...



keywords:


Unnamed: 0,keywords
0,giant monster-sequel-dinosaur-kaiju-fantasy wo...
1,based on novel or book-sequel-kaiju
2,spain-rome italy-vatican-pope-pig-possession-c...
3,peru-alien-end of the world-based on cartoon-b...
4,epic-based on novel or book-fight-sandstorm-sa...
...,...
119728,conspiracy theory-misinformation-fake news-cov...
119729,bullying-slice of life-school life
119730,peace corps
119731,ultraviolence-prowrestling-professional wrestling
