In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

NS = 'http://www.mediawiki.org/xml/export-0.11/'

def parse_wikimedia_xml(filepath, namespace=NS):
    """Load the main-namespace pages of a MediaWiki XML export into a DataFrame.

    Parameters
    ----------
    filepath : str, path-like or file object
        Passed straight to ``xml.etree.ElementTree.parse``.
    namespace : str, optional
        XML namespace URI used by the export. Defaults to ``NS``; pass a
        different URI for a dump written with another schema version.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``text``, one row per ``<page>`` whose ``<ns>``
        is ``"0"``. ``text`` (or ``title``) is None when the corresponding
        element is missing or empty. Both columns exist even when no page
        qualifies, so downstream ``df['text']`` lookups never raise KeyError.
    """
    def tag(name):
        # ElementTree addresses namespaced elements as "{uri}localname".
        return '{%s}%s' % (namespace, name)

    def text_of(parent, name):
        # Missing parent or missing child yields None instead of AttributeError.
        node = parent.find(tag(name)) if parent is not None else None
        return node.text if node is not None else None

    root = ET.parse(filepath).getroot()
    data = []
    for page in root.findall(tag('page')):
        # Only namespace "0" pages are kept; a page without <ns> is skipped.
        if text_of(page, 'ns') != "0":
            continue
        revision = page.find(tag('revision'))
        data.append({
            'title': text_of(page, 'title'),
            'text': text_of(revision, 'text'),
        })
    return pd.DataFrame(data, columns=['title', 'text'])

# Parse the wiki export into a title/text frame and preview the first rows.
df = parse_wikimedia_xml(filepath='input/cinemorgue_8.3.23.xml')

df.head()

Unnamed: 0,title,text
0,Cinemorgue Wiki,<mainpage-leftcolumn-start />\n{{Mainpage welc...
1,Main Page,#REDIRECT [[Cinemorgue Wiki]]
2,Marilyn Monroe,[[File:Marilynmonroe.jpg|frame|Marilyn Monroe ...
3,Joseph Cotten,[[File:Screen Shot 2023-01-05 at 2.33.19 PM.pn...
4,Niagara (1953),[[File:NiagaraLobbyCard.jpg|frame|Lobby card f...


In [3]:
# Keep only what follows the first "Film Deaths" occurrence. Rows without it
# become NaN. `.str[1]` is used rather than `expand=True)[1]` because the
# expanded frame has no column 1 (KeyError) when no row contains the text.
df['text'] = df['text'].str.split("Film Deaths", n=1).str[1]

In [4]:
# Cut the text at the first "Television Deaths"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Television Deaths", n=1, expand=True)
    .iloc[:, 0]
)

In [5]:
# Cut the text at the first "TV Deaths"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("TV Deaths", n=1, expand=True)
    .iloc[:, 0]
)

In [6]:
# Cut the text at the first "TV Series Deaths"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("TV Series Deaths", n=1, expand=True)
    .iloc[:, 0]
)

In [7]:
# Cut the text at the first "Video Game Deaths"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Video Game Deaths", n=1, expand=True)
    .iloc[:, 0]
)

In [8]:
# Cut the text at the first "Music Video Deaths"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Music Video Deaths", n=1, expand=True)
    .iloc[:, 0]
)

In [9]:
# Cut the text at the first "Notable Connections"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Notable Connections", n=1, expand=True)
    .iloc[:, 0]
)

In [10]:
# Cut the text at the first "Noteworthy Connections"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Noteworthy Connections", n=1, expand=True)
    .iloc[:, 0]
)

In [11]:
# Cut the text at the first "Gallery"; rows without it are unchanged.
df['text'] = (
    df['text']
    .str.split("Gallery", n=1, expand=True)
    .iloc[:, 0]
)

In [12]:
# Keep only the rows that still have text after the splits above.
df = df[df['text'].notna()]

In [13]:
# Column-wise accumulator for the exploded rows: one row per bullet entry.
new_rows = {'title': [], 'text': []}

# Walk the two columns directly with zip(). iterrows() builds a Series per
# row and yields an index that was never used here.
for title, page_text in zip(df['title'], df['text']):
    # Bullet entries are separated by a newline followed by '*'.
    # Element 0 is whatever precedes the first bullet, so it is skipped.
    for part in page_text.split('\n*')[1:]:
        new_rows['title'].append(title)
        new_rows['text'].append(part)

# Build the frame once from the accumulated lists.
new_df = pd.DataFrame(new_rows)

# Preview the result
new_df.head(3)

Unnamed: 0,title,text
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Do...
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'...
2,Joseph Cotten,'''''The Last Sunset'' (1961)''' [''John Breck...


In [14]:
#Creating year column.
import re
def extract_year(text):
    """Return the parenthesised group most likely to hold the year.

    The first ``(...)`` group containing a standalone four-digit year
    (1800-2099) is preferred, so ``"Friday the 13th (Part 2) (1981)"`` gives
    ``"(1981)"`` rather than ``"(Part 2)"``. When no group contains such a
    year, the first group containing any digit is returned (the previous
    behaviour).

    Parameters
    ----------
    text : str
        Raw entry text.

    Returns
    -------
    str or None
        The matched group including its parentheses, or None if no
        parenthesised group contains a digit.
    """
    # Preferred: a group holding a four-digit year not embedded in a longer number.
    match = re.search(r'\([^()]*(?<!\d)(?:18|19|20)\d{2}(?!\d)[^()]*\)', text)
    if match is None:
        # Fallback: any parenthesised group with at least one digit.
        match = re.search(r'\([^()]*\d{1,4}[^()]*\)', text)
    return match.group(0) if match else None
# Derive the "year" column by running extract_year over every entry.
new_df['year'] = new_df['text'].map(extract_year)

new_df.head(3)

Unnamed: 0,title,text,year
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Do...,(1943)
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'...,(1953)
2,Joseph Cotten,'''''The Last Sunset'' (1961)''' [''John Breck...,(1961)


In [15]:
# Remove runs of two or more apostrophes, plus double quotes and '=' signs.
# A single apostrophe is kept so titles such as "Rosemary's Baby" survive;
# the previous character class removed every apostrophe.
new_df['text'] = new_df['text'].str.replace(r"'{2,}|[\"=]", "", regex=True)

In [16]:
# Drop stray HTML tags, together with the whitespace on either side of them.
new_df['text'] = new_df['text'].str.replace(
    pat=r'\s*<.*?>\s*',
    repl='',
    regex=True,
)

In [17]:
# Some entries have an https link written directly into the title; remove it
# along with any whitespace that follows.
new_df['text'] = new_df['text'].str.replace(
    pat=r'https://\S+\s*',
    repl='',
    regex=True,
)

In [18]:
# Same clean-up for plain http links written into the title.
new_df['text'] = new_df['text'].str.replace(
    pat=r'http://\S+\s*',
    repl='',
    regex=True,
)

In [19]:
# Entries that start with a link ("[") yield the text inside the first brackets.
# All other entries yield everything up to the first parenthesis.
def extract_text(row):
    """Pull the title portion out of one cleaned entry string.

    For an entry starting with "[", every "(...)" group is removed first and
    the content between the first "[" and the next "]" is returned. For any
    other entry, the leading run of characters containing no parenthesis is
    returned with surrounding whitespace stripped. An empty string is
    returned when nothing matches.
    """
    if not row.startswith("["):
        leading = re.search(r'^([^()]*)', row)
        return leading.group(1).strip() if leading else ''

    # Link entry: remove parenthesised groups before reading the brackets.
    without_parens = re.sub(r'\([^()]*\)', '', row)
    bracketed = re.search(r'\[(.*?)\]', without_parens)
    return bracketed.group(1) if bracketed else ''

# Replace each entry with the title portion returned by extract_text.
new_df['text'] = new_df['text'].map(extract_text)

In [20]:
# Piped links carry the title twice ("target|label"); keep only the part before the "|".
new_df['text'] = (
    new_df['text']
    .str.split("|", n=1, expand=True)
    .iloc[:, 0]
)

In [21]:
# Categories sneak into poorly formatted pages; drop everything from the first "{" onwards.
new_df['text'] = (
    new_df['text']
    .str.split("{", n=1, expand=True)
    .iloc[:, 0]
)

In [22]:
# Links may be double-bracketed ([[...]]); remove every remaining "[" and "]".
new_df['text'] = new_df['text'].str.replace(
    pat=r'\[|\]',
    repl='',
    regex=True,
)

In [23]:
# Trim whitespace from both ends of every title.
new_df['text'] = (
    new_df['text']
    .str.strip()
)

In [24]:
# Clean up year column.

In [25]:
# Reduce each raw year string to digits, preferring a real four-digit year.
# (You'd be surprised what ends up in this column.)
def remove_non_numeric(year):
    """Turn a raw parenthesised year string into a digit-only year.

    If the value contains a standalone four-digit year (1800-2099), the first
    such year is returned, so ``"(1999-2001)"`` becomes ``"1999"`` instead of
    ``"19992001"``. Otherwise every non-digit character is removed, as before.

    Parameters
    ----------
    year : str or None
        Raw value; anything that is not a string (None, NaN) is returned
        unchanged instead of raising TypeError.

    Returns
    -------
    str or the original non-string value
        Digits only; may be an empty string when the input has no digits.
    """
    if not isinstance(year, str):
        return year
    # Lookarounds keep the match from landing inside a longer run of digits.
    year_match = re.search(r'(?<!\d)(?:18|19|20)\d{2}(?!\d)', year)
    if year_match:
        return year_match.group(0)
    return re.sub(r'[^\d]', '', year)

# Normalise every value in the 'year' column with remove_non_numeric.
new_df['year'] = new_df['year'].map(remove_non_numeric)

In [26]:
# Discard rows whose year is an empty string. Missing (None) values compare
# unequal to '' and therefore pass this filter.
new_df = new_df.loc[new_df['year'].ne('')]

In [27]:
# Discard rows that have no year at all.
new_df = new_df[new_df['year'].notna()]

In [28]:
# Write the cleaned title/text/year table to disk without the index column.
new_df.to_csv(path_or_buf='output/Cinemorgue.csv', index=False)