In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
import regex as re
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx 
from collections import Counter
from fa2 import ForceAtlas2
from DataCleaningFunctions import DataCleaner, DataCollection, DataProcesser
import os
import json
import os
import re
import requests
from urllib.parse import quote, unquote

In [3]:
data = pd.read_json('df_movies.json',orient='table')
data = data.reset_index(drop=True)

In [5]:
def remove_comments(text):
    """
    Remove all comments from the wikicontent, so only the plot remains
    """
    return re.sub(r'<!--.*?-->', '', text)

def clean_filename(filename):
    """
    Replace non-alphanumeric characters (except underscores) with underscores
    """
    return re.sub(r'[^a-zA-Z0-9_]', '_', filename)

def get_wikipedia_content(title):
    """
    For the #redirect use these params instead.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content"}
    
    response = requests.get(base_url, params=params)
    data = response.json()
    page = next(iter(data["query"]["pages"].values()))
    content = page.get("revisions", [{}])[0].get("*", "")
    return content

def contains_redirect(content):
    """
    In case of the lower case #REDIRECT
    """
    return "#redirect" in content.lower()

def resolve_redirects(content):
    """
    In case of #redirect/#REDIRECT in the extracted wikicontent.
    """
    while contains_redirect(content):
        # Extract the target page from the redirect content
        target_title = content.split("[[", 1)[-1].split("]]")[0]
        content = get_wikipedia_content(target_title)
    return content

def get_plot_text(wiki_content):
    """
    Extract only the plot text from the found wikicontent.
    """
    wiki_content_without_comments = remove_comments(wiki_content)
    plot_section_match = re.search(r'==\s*Plot\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    plot_section_match
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Plot summary\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Summary\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Story\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Synopsis\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Premise\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    if plot_section_match is None:
        plot_section_match = re.search(r'==\s*Plot\s*==\s*(.*?)(?==\s*[^=]|$)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
    plot_section = plot_section_match.group(1) if plot_section_match else None
    return plot_section


In [8]:
def get_movie_plots(data_movie_artist):
    no_plot_counter = 0

    for index, row in data_movie_artist.iterrows():
        title = row['Title']
        wikiname = row['Hyperref']
        genre = row['Genre']
        year = row['Year']

        # Remove plots manually that are in different language with no plots (Thriller)
        if any(data_movie_artist[data_movie_artist["Title"].isin(["Life Is an Art", "A Morass", "Blind Turn", "Requiem pour une tueuse", "She and She", "American Sunset", "Espion(s)"])]["Hyperref"] == wikiname):
            plot_text = "No plot"
            no_plot_counter += 1
            print(f"dropped {title}")
            continue

        # Remove plots manually that are in a different language with no plots (Action)
        if any(data_movie_artist[data_movie_artist["Title"].isin(["Sheenogai", "Vampire Warriors", "Bangkok Revenge", "My Kingdom", "Wild 7", "San Andreas Quake"])]["Hyperref"] == wikiname):
            plot_text = "No plot"
            no_plot_counter += 1
            print(f"dropped {title}")
            continue

        try:
            # Make the request
            baseurl = "https://en.wikipedia.org/w/api.php?"  # the wiki API
            params = {
                "action": "query",
                "prop": "revisions",
                "rvprop": "content",
                "format": "json",
                "titles": unquote(wikiname)
            }

            wikitext = requests.get(baseurl, params=params)
            wikijson = wikitext.json()
            page_id = next(iter(wikijson["query"]["pages"]))
            revisions = wikijson["query"]["pages"][page_id].get("revisions", [])

            if not revisions:
                raise KeyError("No revisions key")

            wiki_content = revisions[0]["*"]
            # If the wiki page is a redirect do
            if "#REDIRECT" or "#redirect" in wiki_content:
                wiki_content = resolve_redirects(wiki_content)
            plot_section = get_plot_text(wiki_content)

            # Extract the clean plot text without comments and hyperlinks
            if plot_section:
                plot_text = remove_comments(plot_section)
                # Remove hyperlinks
                plot_text = re.sub(r'\[\[([^|\]]*?\|)?([^\]]*?)\]\]', r'\2', plot_text)
            else:
                plot_text = "Plot not found."
                no_plot_counter += 1
                print(f"Plot not found for {genre}/{year}/{title}")

            # Create folders based on genre and subfolders based on the year
            genre_folder = clean_filename(genre)
            year_folder = clean_filename(str(year))
            os.makedirs(f'{genre_folder}/{year_folder}', exist_ok=True)

            # Save plot text in a .txt file
            with open(os.path.join(genre_folder, year_folder, f'{clean_filename(title)}.txt'), 'w', encoding='utf-8') as f:
                f.write(plot_text)

        except KeyError as e:
            print(f"Error processing {wikiname}: {e}")
            plot_text = "No plot"
            no_plot_counter += 1
            print(f"dropped {title}")
            continue
        
    print(no_plot_counter)


In [9]:
get_movie_plots(data)

Plot not found for Thriller/2020/The Retreat
Plot not found for Thriller/2021/Akshara
Plot not found for Thriller/2021/Lockdown
Plot not found for Action/2022/Salaar
Error processing /w/index.php?title=Golden_Slumber_(2010_film)&action=edit&redlink=1: 'No revisions key'
dropped Golden Slumber yoshihiro nakamura
dropped Life Is an Art
dropped A Morass
dropped Blind Turn
dropped Requiem pour une tueuse
Plot not found for Horror/2013/The Supernatural Events on Campus
Plot not found for Horror/2014/Bloody Doll
Plot not found for Mix/2014/Bugs
Plot not found for Thriller/2014/The Deathday Party
Plot not found for Horror/2014/Double Exposure
Plot not found for Thriller/2014/The Eighth House
Plot not found for Horror/2014/Flower's Curse
Plot not found for Horror/2014/Lonely Island
Plot not found for Horror/2014/Monsterz
Plot not found for Mix/2014/Mystery
dropped She and She
Plot not found for Horror/2014/The Haunted Cinema
Plot not found for Mix/2014/Tomb Robber
Plot not found for Thriller/2

##### Check for all plots without plots

In [10]:

def get_movie_title_from_file(file_path):
    with open(file_path, 'r') as file:
        # Assuming the title is on the first line
        title = file.readline().strip()
    return title

def search_for_text_in_files(root_folder, target_text):
    for foldername, subfolders, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith(".txt"):
                file_path = os.path.join(foldername, filename)
                with open(file_path, 'r') as file:
                    content = file.read()
                    if target_text in content:
                        title = get_movie_title_from_file(file_path)
                        print(f'Text "{target_text}" found in {title} ({file_path})')


In [11]:

# Replace 'your_root_folder' with the path to the root folder you want to start the search from
root_folder = '..\Cinemartist-Networks-and-Interactions'
target_text = 'Plot not found'

search_for_text_in_files(root_folder, target_text)


Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2000\Sugo_ng_Tondo.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2001\Alas_Dose.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2001\Hindi_Sisiw_ang_Kalaban_Mo.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2001\Masikip_Na_ang_Mundo_Mo__Labrador.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2002\Alive.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2002\Ang_Alamat_ng_Lawin.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2002\Batas_ng_Lansangan.txt)
Text "Plot not found" found in Plot not found. (..\Cinemartist-Networks-and-Interactions\Action\2002\Mano_Mano_2__Ubusan_ng_Laka

##### further

In [12]:
data[data['Title']== "American Pie Presents: The Naked Mile"]

Unnamed: 0,Title,Director,Cast,Country,Genre,Year,Hyperref
2258,American Pie Presents: The Naked Mile,joe nussbaum,"john white,jessy schram,steve talley,christoph...",united states,Comedy,2007,Joe_Nussbaum


In [37]:
unquote(wikiname)

'Samaritan'

In [93]:
# Make the request
from urllib.parse import quote, unquote

baseurl = "https://en.wikipedia.org/w/api.php?"  # the wiki API
wikiname = "Boarding_Gate"

params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "format": "json",
    "titles": unquote(wikiname)
}
wikitext = requests.get(baseurl, params=params)
wikijson = wikitext.json()
page_id = next(iter(wikijson["query"]["pages"]))


In [94]:
unquote(wikiname)

'Boarding_Gate'

In [None]:
# Check if "revisions" key is present
revisions = wikijson["query"]["pages"][page_id].get("revisions", [])
if not revisions:
    raise KeyError("No revisions key")

wiki_content = revisions[0]["*"]


wiki_content_without_comments = remove_comments(wiki_content)
print(wiki_content_without_comments)
plot_section_match = re.search(r'==\s*Plot\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
plot_section_match
if plot_section_match is None:
    plot_section_match = re.search(r'==\s*Plot summary\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
if plot_section_match is None:
    plot_section_match = re.search(r'==\s*Synopsis\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
if plot_section_match is None:
    plot_section_match = re.search(r'==\s*Premise\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
if plot_section_match is None:
    plot_section_match = re.search(r'==\s*Plot\s*==\s*(.*?)(?==\s*[^=]|$)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
plot_section = plot_section_match.group(1) if plot_section_match else None


In [100]:
plot_section

''

In [None]:
wiki_text = "Your Wikipedia text here"

pattern = r'==\s*Plot\s*==\s*(.*?)(?==\s*[^=]|$)'

matches = re.search(pattern, wiki_text, re.DOTALL)
if matches:
    plot_content = matches.group(1).strip()
    print(plot_content)

In [101]:

if plot_section == '':
    print('true')
    if "#REDIRECT" or "#redirect" in wiki_content:
        print("yes")
        wiki_content = resolve_redirects(wiki_content)
        wiki_content_without_comments = remove_comments(wiki_content)
        plot_section_match = re.search(r'==\s*Plot\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
        plot_section_match
        if plot_section_match is None:
            plot_section_match = re.search(r'==\s*Plot summary\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
        if plot_section_match is None:
            plot_section_match = re.search(r'==\s*Synopsis\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)
        if plot_section_match is None:
            plot_section_match = re.search(r'==\s*Premise\s*==\s*(.*?)(==|\Z)', wiki_content_without_comments, re.DOTALL | re.IGNORECASE)


true
yes


In [102]:
plot_section = plot_section_match.group(1) if plot_section_match else None
plot_section

''