In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
#nltk.download('punkt')
#nltk.download('stopwords')

In [2]:
NS = 'http://www.mediawiki.org/xml/export-0.11/'

def parse_wikimedia_xml(filepath):
    tree = ET.parse(filepath)
    root = tree.getroot()
    data = []
    for page in root.findall('{%s}page' % NS):
        ns = page.find('{%s}ns' % NS).text
        if ns != "0":
            continue
        title = page.find('{%s}title' % NS).text
        revision = page.find('{%s}revision' % NS)
        text = revision.find('{%s}text' % NS).text
        data.append({'title': title, 'text': text})
    df = pd.DataFrame(data)
    return df

df = parse_wikimedia_xml('input/cinemorgue_8.3.23.xml')

df.head(3)

Unnamed: 0,title,text
0,Cinemorgue Wiki,<mainpage-leftcolumn-start />\n{{Mainpage welc...
1,Main Page,#REDIRECT [[Cinemorgue Wiki]]
2,Marilyn Monroe,[[File:Marilynmonroe.jpg|frame|Marilyn Monroe ...


In [3]:
# Delete everything before Film Deaths.
df['text'] = df['text'].str.split("Film Deaths", n=1, expand=True)[1]

In [4]:
#Delete everything below TV Deaths.
df['text'] = df['text'].str.split("Television Deaths", n=1, expand=True)[0]

In [5]:
df['text'] = df['text'].str.split("TV Deaths", n=1, expand=True)[0]

In [6]:
df['text'] = df['text'].str.split("TV Series Deaths", n=1, expand=True)[0]

In [7]:
df['text'] = df['text'].str.split("Video Game Deaths", n=1, expand=True)[0]

In [8]:
df['text'] = df['text'].str.split("Music Video Deaths", n=1, expand=True)[0]

In [9]:
df['text'] = df['text'].str.split("Notable Connections", n=1, expand=True)[0]

In [10]:
df['text'] = df['text'].str.split("Noteworthy Connections", n=1, expand=True)[0]

In [11]:
df['text'] = df['text'].str.split("Gallery", n=1, expand=True)[0]

In [12]:
df['text'] = df['text'].str.split("Categories", n=1, expand=True)[0]

In [13]:
df['text'] = df['text'].str.split("DEFAULTSORT:", n=1, expand=True)[0]

In [14]:
#Drop all recently nulled rows. Ready for splitting.
df = df.dropna(subset=['text'])

In [15]:
# New DataFrame to store the split rows
new_rows = {'title': [], 'text': []}

# Iterate through the original DataFrame
for idx, row in df.iterrows():
    title = row['title']
    text_parts = row['text'].split('\n*')
    
    # Append the new rows to the new DataFrame
    # Skip the first element, usually contains gibberish before first line.
    for part in text_parts[1:]:
        new_rows['title'].append(title)
        new_rows['text'].append(part)

# Create the new DataFrame
new_df = pd.DataFrame(new_rows)

# Print the result
new_df.head(3)

Unnamed: 0,title,text
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Do...
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'...
2,Joseph Cotten,'''''The Last Sunset'' (1961)''' [''John Breck...


In [16]:
#Creating year column.
def extract_year(text):
    match = re.search(r'\((\d{4})\)', text)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to create the "year" column
new_df['year'] = new_df['text'].apply(extract_year)

new_df.head(3)

Unnamed: 0,title,text,year
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Do...,1943
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'...,1953
2,Joseph Cotten,'''''The Last Sunset'' (1961)''' [''John Breck...,1961


In [17]:
#Remove all pairs of apostraphes or quotation marks.
new_df['text'] = new_df['text'].str.replace(r"[''\"\=]", "", regex=True)

In [18]:
# Some titles have stray html formatting tags in them. 
new_df['text'] = new_df['text'].str.replace(r'\s*<.*?>\s*', '', regex=True)

In [19]:
#Some titles will just have the link hardcoded in the title which is pretty impressive.
new_df['text'] = new_df['text'].str.replace(r'https://\S+\s*', '', regex=True)

In [20]:
#Some titles might even hardcode the unsecured link instead.
new_df['text'] = new_df['text'].str.replace(r'http://\S+\s*', '', regex=True)

In [21]:
#Save line contents for additional parsing later.
new_df['raw_text'] = new_df['text']

In [22]:
# If a string starts with a link [], grab the contained string.
# If a string is not a link, grab all textup until the first date ().
def extract_text(row):
    if row.startswith("["):
        # Remove parenthesis and their contents from inside the square brackets
        cleaned_text = re.sub(r'\([^()]*\)', '', row)
        match = re.search(r'\[(.*?)\]', cleaned_text)
        if match:
            return match.group(1)
    else:
        match = re.search(r'^([^()]*)', row)
        if match:
            return match.group(1).strip()
    return ''  # Return an empty string if no match is found

new_df['text'] = new_df['text'].apply(extract_text)

In [23]:
# For links, delete everything after the first instance of a |.
new_df['text'] = new_df['text'].str.split("|", n=1, expand=True)[0]

In [24]:
# Remove all remaining [[]].
new_df['text'] = new_df['text'].str.replace(r'\[|\]', '', regex=True)

In [25]:
#Categories sneak their way into some titles, so remove these as well.
new_df['text'] = new_df['text'].str.split("{", n=1, expand=True)[0]

In [26]:
# Remove all white space at end of string.
new_df['text'] = new_df['text'].str.strip()

In [27]:
# Remove all blank rows.
new_df = new_df[new_df['year'] != '']

In [28]:
#Remove all null rows that don't contain a year.
new_df = new_df.dropna(subset=['year'])

In [29]:
new_df.head(3)

Unnamed: 0,title,text,year,raw_text
0,Joseph Cotten,Shadow of a Doubt,1943,[[Shadow of a Doubt (1943)|Shadow of a Doubt (...
1,Joseph Cotten,Niagara,1953,[[Niagara (1953)]] [George Loomis]: Drowned wh...
2,Joseph Cotten,The Last Sunset,1961,The Last Sunset (1961) [John Breckenridge]: Sh...


In [30]:
# Remove all alphanumeric characters from the column.
new_df['raw_text'] = new_df['raw_text'].str.replace(r'[^a-zA-Z#\s]', ' ', regex=True)

# Removes spaces from above. Easier this way.
new_df['raw_text'] = new_df['raw_text'].str.replace(r'  +', ' ', regex=True)

In [31]:
# Remove stop words to make stemming less painful.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered)

new_df['processed_text'] = new_df['raw_text'].apply(remove_stopwords)

new_df.head(3)

Unnamed: 0,title,text,year,raw_text,processed_text
0,Joseph Cotten,Shadow of a Doubt,1943,Shadow of a Doubt Shadow of a Doubt Uncle Cha...,Shadow Doubt Shadow Doubt Uncle Charlie Falls ...
1,Joseph Cotten,Niagara,1953,Niagara George Loomis Drowned when his boat s...,Niagara George Loomis Drowned boat sinks going...
2,Joseph Cotten,The Last Sunset,1961,The Last Sunset John Breckenridge Shot in the ...,Last Sunset John Breckenridge Shot back Adam W...


In [32]:
# Stem everything to get to the root word. It makes what we're doing next way less insane.
stemmer = PorterStemmer()

def stem_text(text):
    words = word_tokenize(text)
    stemmed = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed)

new_df['processed_text'] = new_df['processed_text'].apply(stem_text)

new_df.head(3)

Unnamed: 0,title,text,year,raw_text,processed_text
0,Joseph Cotten,Shadow of a Doubt,1943,Shadow of a Doubt Shadow of a Doubt Uncle Cha...,shadow doubt shadow doubt uncl charli fall tra...
1,Joseph Cotten,Niagara,1953,Niagara George Loomis Drowned when his boat s...,niagara georg loomi drown boat sink go niagara...
2,Joseph Cotten,The Last Sunset,1961,The Last Sunset John Breckenridge Shot in the ...,last sunset john breckenridg shot back adam wi...


In [33]:
# Deviated from the  FBI list a little bit.
keywords = {
    'Firearms': ['gun', 'shot','shoot', 'shootout', 'sniper'],
    'Knives/Cutting Instruments': ['knife', 'stab', 'slash', 'decapit', 'slit', 'impal', 'cut', 'sword'],
    'Blunt Objects': ['club', 'hammer', 'bludgeon'],
    'Personal Weapons': ['fist', 'kick', 'punch'],
    'Poison': ['poison', 'cyanid'],
    'Explosives': ['explos', 'bomb', 'deton'],
    'Fire': ['fire', 'melt', 'burn'],
    'Narcotics': ['overdos'],
    'Drowning': ['drown', 'sink'],
    'Strangulation': ['strangl', 'choke', 'asphyxi', 'hang'],
    'Impact': ['fall', 'thrown', 'crash', 'crush'],
    'Natural Disaster': ['earthquak', 'tornado'],
    'Ailment': ['sick', 'cancer', 'infect', 'heart attack', 'stroke']
}

# We want to capture the first word that appears in the string that exists in the dictionary.
# Start with earliest_index being equal to inf so all valid index will be smaller.
# Earliest_index grabs the first index of a word that exists in the dictionary.
# Update values whenever a smaller index exists, otherwise skip.

def identify_cause(text):
    text = word_tokenize(text.lower())
    first_instance = None
    earliest_index = float('inf')
    
    for cause, keys in keywords.items():
        for key in keys:
            if key in text:
                index = text.index(key)
                if index < earliest_index:
                    earliest_index = index
                    first_instance = cause
                    
    if first_instance:
        return first_instance
    return 'Other'

new_df['cause_of_death'] = new_df['processed_text'].apply(identify_cause)

In [34]:
new_df.rename(columns={'title': 'Name', 'text': 'Movie', 'year': 'Year'}, inplace=True)
new_df.to_csv('output/Cinemorgue.csv', index=False)

In [35]:
# Use the value_counts() method to count occurrences of each unique value
count_series = new_df['cause_of_death'].value_counts()

# count_series will already be sorted in descending order by default.
print(count_series)

cause_of_death
Firearms                      20988
Other                         20028
Knives/Cutting Instruments    11575
Impact                         4835
Explosives                     2406
Strangulation                  2324
Fire                           2264
Drowning                       1365
Poison                          915
Ailment                         763
Blunt Objects                   688
Personal Weapons                439
Narcotics                       356
Natural Disaster                 58
Name: count, dtype: int64
