In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
#nltk.download('punkt')
#nltk.download('stopwords')

pd.set_option('display.max_colwidth', 1000)

pd.set_option('display.max_columns', None)

pd.set_option('display.max_rows', None)

In [2]:
NS = 'http://www.mediawiki.org/xml/export-0.11/'

def parse_wikimedia_xml(filepath):
    tree = ET.parse(filepath)
    root = tree.getroot()
    data = []
    for page in root.findall('{%s}page' % NS):
        ns = page.find('{%s}ns' % NS).text
        if ns != "0":
            continue
        title = page.find('{%s}title' % NS).text
        revision = page.find('{%s}revision' % NS)
        text = revision.find('{%s}text' % NS).text
        data.append({'title': title, 'text': text})
    df = pd.DataFrame(data)
    return df

df = parse_wikimedia_xml('input/cinemorgue_8.3.23.xml')

df.head(3)

Unnamed: 0,title,text
0,Cinemorgue Wiki,"<mainpage-leftcolumn-start />\n{{Mainpage welcome}}\n{{Heading|Index}}\n<div style=""font-family: 'Graveyard', sans-serif; padding: 0 1.2em; border-left: 4px solid #000000; margin-left: 1.2em; text-align:center; text-transform:uppercase; color:--theme-body-text-color;"">\n<gallery widths=""330"" hideaddbutton=""true"" position=""center"" spacing=""small"" orientation=""square"" bordersize=""large"" bordercolor=""#000000"" captionalign=""center"" captionposition=""within"" columns=""2"" navigation=""true"">\nFile:A pic for Cinemorgue actors.jpg|link=Category:Actors|<span style=""font-size:24px; font-family: 'Graveyard'; letter-spacing: 2px; color:#fff; "">Actor index</span>\nFile:Karolineherfurthperfume3.jpg|link=Category:Actresses|<span style=""font-size:24px; font-family: 'Graveyard'; letter-spacing: 2px; color:#fff;"">Actress index</span>\n</gallery>\n<gallery widths=""220"" hideaddbutton=""true"" position=""center"" spacing=""small"" orientation=""square"" bordersize=""large"" bordercolor=""#000000"" captionalign=""cente..."
1,Main Page,#REDIRECT [[Cinemorgue Wiki]]
2,Marilyn Monroe,"[[File:Marilynmonroe.jpg|frame|Marilyn Monroe in ''Niagara'']]\n\n[http://www.imdb.com/name/nm0000054/ Marilyn Monroe] (1926 - 1962) \n\nPlayboy Sweetheart of the Month December 1953 (Historically considered the first ever Playboy Playmate)\n\nIt's noted that Niagara is the only film which Marilyn Monroe ""died"" in.\n\nNot to be confused with [[Marilyn Manhoe]].\n\n==Film Deaths==\n*'''''[[Niagara (1953)]]''''' [''Rose Loomis'']: Strangled by [[Joseph Cotten]] in a bell tower. The murder is shown in shadow, and her body falls into the frame afterwards.\n\n==Noteworthy Connections==\n*Foster sister of [[Jody Lawrance]]\n*Ex-wife of Joe DiMaggio (famed baseball player)\n*Ex-wife of Arthur Miller (famed playwright)\n*Mistress of President John F. Kennedy\n*Ex-girlfriend of Jorge Guinile (Brazilian billionaire)\n*'''No relation''' to [[Carolyn Monroe]].\n{{DEFAULTSORT:Monroe, Marilyn}}\n[[Category:Actresses|Monroe, Marilyn]]\n[[Category:Listed on Original Cinemorgue|Monroe, Marilyn]]\n[..."


In [3]:
# Delete everything before Film Deaths.
df['text'] = df['text'].str.split("Film Deaths", n=1, expand=True)[1]

In [4]:
#Delete everything below TV Deaths.
df['text'] = df['text'].str.split("Television Deaths", n=1, expand=True)[0]

In [5]:
df['text'] = df['text'].str.split("TV Deaths", n=1, expand=True)[0]

In [6]:
df['text'] = df['text'].str.split("TV Series Deaths", n=1, expand=True)[0]

In [7]:
df['text'] = df['text'].str.split("Video Game Deaths", n=1, expand=True)[0]

In [8]:
df['text'] = df['text'].str.split("Music Video Deaths", n=1, expand=True)[0]

In [9]:
df['text'] = df['text'].str.split("Notable Connections", n=1, expand=True)[0]

In [10]:
df['text'] = df['text'].str.split("Noteworthy Connections", n=1, expand=True)[0]

In [11]:
df['text'] = df['text'].str.split("Gallery", n=1, expand=True)[0]

In [12]:
df['text'] = df['text'].str.split("DEFAULTSORT:", n=1, expand=True)[0]

In [13]:
df['text'] = df['text'].str.split("Category", n=1, expand=True)[0]

In [14]:
#Drop all recently nulled rows. Ready for splitting.
df = df.dropna(subset=['text'])

In [15]:
# New DataFrame to store the split rows
new_rows = {'title': [], 'text': []}

# Iterate through the original DataFrame
for idx, row in df.iterrows():
    title = row['title']
    text_parts = row['text'].split('\n*')
    
    # Append the new rows to the new DataFrame
    # Skip the first element, usually contains gibberish before first line.
    for part in text_parts[1:]:
        new_rows['title'].append(title)
        new_rows['text'].append(part)

# Create the new DataFrame
new_df = pd.DataFrame(new_rows)

# Print the result
new_df.head(3)

Unnamed: 0,title,text
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Doubt'' (1943)]]''' [''Uncle Charlie'']: Falls out of a train and into the path of another train during a struggle with [[Teresa Wright]].
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'']: Drowned when his boat sinks while going over Niagara Falls.
2,Joseph Cotten,"'''''The Last Sunset'' (1961)''' [''John Breckenridge'']: Shot in the back [[Adam Williams]] as he leaves the cantina, as he is flanked by [[Rock Hudson]] and [[Kirk Douglas]]. (''Thanks to Brian'')."


In [16]:
#Creating year column.
def extract_year(text):
    match = re.search(r'\((\d{4})\)', text)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to create the "year" column
new_df['year'] = new_df['text'].apply(extract_year)

new_df.head(3)

Unnamed: 0,title,text,year
0,Joseph Cotten,'''[[Shadow of a Doubt (1943)|''Shadow of a Doubt'' (1943)]]''' [''Uncle Charlie'']: Falls out of a train and into the path of another train during a struggle with [[Teresa Wright]].,1943
1,Joseph Cotten,'''''[[Niagara (1953)]]''''' [''George Loomis'']: Drowned when his boat sinks while going over Niagara Falls.,1953
2,Joseph Cotten,"'''''The Last Sunset'' (1961)''' [''John Breckenridge'']: Shot in the back [[Adam Williams]] as he leaves the cantina, as he is flanked by [[Rock Hudson]] and [[Kirk Douglas]]. (''Thanks to Brian'').",1961


In [17]:
#Remove all pairs of apostraphes or quotation marks.
new_df['text'] = new_df['text'].str.replace(r"[''\"\=]", "", regex=True)

In [18]:
# Some titles have stray html formatting tags in them. 
new_df['text'] = new_df['text'].str.replace(r'\s*<.*?>\s*', '', regex=True)

In [19]:
#Some titles will just have the link hardcoded in the title which is pretty impressive.
new_df['text'] = new_df['text'].str.replace(r'https://\S+\s*', '', regex=True)

In [20]:
#Some titles might even hardcode the unsecured link instead.
new_df['text'] = new_df['text'].str.replace(r'http://\S+\s*', '', regex=True)

In [21]:
#Save line contents for additional parsing later.
new_df['raw_text'] = new_df['text']

In [22]:
# If a string starts with a link [], grab the contained string.
# If a string is not a link, grab all textup until the first date ().
def extract_text(row):
    if row.startswith("["):
        # Remove parenthesis and their contents from inside the square brackets
        cleaned_text = re.sub(r'\([^()]*\)', '', row)
        match = re.search(r'\[(.*?)\]', cleaned_text)
        if match:
            return match.group(1)
    else:
        match = re.search(r'^([^()]*)', row)
        if match:
            return match.group(1).strip()
    return ''  # Return an empty string if no match is found

new_df['text'] = new_df['text'].apply(extract_text)

In [23]:
# For links, delete everything after the first instance of a |.
new_df['text'] = new_df['text'].str.split("|", n=1, expand=True)[0]

In [24]:
# Remove all remaining [[]].
new_df['text'] = new_df['text'].str.replace(r'\[|\]', '', regex=True)

In [25]:
#Categories sneak their way into some titles, so remove these as well.
new_df['text'] = new_df['text'].str.split("{", n=1, expand=True)[0]

In [26]:
# Remove all white space at end of string.
new_df['text'] = new_df['text'].str.strip()

In [27]:
# Remove all blank rows.
new_df = new_df[new_df['year'] != '']

In [28]:
#Remove all null rows that don't contain a year.
new_df = new_df.dropna(subset=['year'])

In [29]:
new_df.head(3)

Unnamed: 0,title,text,year,raw_text
0,Joseph Cotten,Shadow of a Doubt,1943,[[Shadow of a Doubt (1943)|Shadow of a Doubt (1943)]] [Uncle Charlie]: Falls out of a train and into the path of another train during a struggle with [[Teresa Wright]].
1,Joseph Cotten,Niagara,1953,[[Niagara (1953)]] [George Loomis]: Drowned when his boat sinks while going over Niagara Falls.
2,Joseph Cotten,The Last Sunset,1961,"The Last Sunset (1961) [John Breckenridge]: Shot in the back [[Adam Williams]] as he leaves the cantina, as he is flanked by [[Rock Hudson]] and [[Kirk Douglas]]. (Thanks to Brian)."


In [30]:
# Remove all alphanumeric characters from the column.
new_df['raw_text'] = new_df['raw_text'].str.replace(r'[^a-zA-Z#\s]', ' ', regex=True)

# Removes spaces from above. Easier this way.
new_df['raw_text'] = new_df['raw_text'].str.replace(r'  +', ' ', regex=True)

In [31]:
# Remove stop words to make stemming less painful.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered)

new_df['processed_text'] = new_df['raw_text'].apply(remove_stopwords)

new_df.head(3)

Unnamed: 0,title,text,year,raw_text,processed_text
0,Joseph Cotten,Shadow of a Doubt,1943,Shadow of a Doubt Shadow of a Doubt Uncle Charlie Falls out of a train and into the path of another train during a struggle with Teresa Wright,Shadow Doubt Shadow Doubt Uncle Charlie Falls train path another train struggle Teresa Wright
1,Joseph Cotten,Niagara,1953,Niagara George Loomis Drowned when his boat sinks while going over Niagara Falls,Niagara George Loomis Drowned boat sinks going Niagara Falls
2,Joseph Cotten,The Last Sunset,1961,The Last Sunset John Breckenridge Shot in the back Adam Williams as he leaves the cantina as he is flanked by Rock Hudson and Kirk Douglas Thanks to Brian,Last Sunset John Breckenridge Shot back Adam Williams leaves cantina flanked Rock Hudson Kirk Douglas Thanks Brian


In [32]:
# Stem everything to get to the root word. It makes what we're doing next way less insane.
stemmer = PorterStemmer()

def stem_text(text):
    words = word_tokenize(text)
    stemmed = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed)

new_df['processed_text'] = new_df['processed_text'].apply(stem_text)

new_df.head(3)

Unnamed: 0,title,text,year,raw_text,processed_text
0,Joseph Cotten,Shadow of a Doubt,1943,Shadow of a Doubt Shadow of a Doubt Uncle Charlie Falls out of a train and into the path of another train during a struggle with Teresa Wright,shadow doubt shadow doubt uncl charli fall train path anoth train struggl teresa wright
1,Joseph Cotten,Niagara,1953,Niagara George Loomis Drowned when his boat sinks while going over Niagara Falls,niagara georg loomi drown boat sink go niagara fall
2,Joseph Cotten,The Last Sunset,1961,The Last Sunset John Breckenridge Shot in the back Adam Williams as he leaves the cantina as he is flanked by Rock Hudson and Kirk Douglas Thanks to Brian,last sunset john breckenridg shot back adam william leav cantina flank rock hudson kirk dougla thank brian


In [33]:
# Deviated from the  FBI list a little bit.
keywords = {
    'Firearms': ['gun', 'shot','shoot', 'shootout', 'gunshot', 'sniper'],
    'Knives or Cutting Instruments': ['knife', 'stab', 'slash', 'decapit', 'slit', 'impal', 'cut', 'sword', 'hack', 'axe', 'slice'],
    'Blunt Objects': ['club', 'hammer', 'mallet', 'bludgeon', 'beaten', 'flog'],
    'Personal Weapons': ['fist', 'kick', 'punch', 'neck snap'],
    'Poison': ['poison', 'cyanid', 'lethal injection', 'drain cleaner'],
    'Explosives': ['explos', 'bomb', 'deton', 'explod'],
    'Fire': ['fire', 'melt', 'burn', 'inciner'],
    'Narcotics': ['overdos'],
    'Drowning': ['drown', 'sink'],
    'Strangulation': ['strangl', 'choke', 'asphyxi', 'hang', 'suffoc'],
    'Impact': ['fall', 'thrown', 'crush', 'push', 'jump'],
    'Ailment': ['sick', 'cancer', 'infect', 'heart attack', 'stroke', 'ill', 'old age', 'viru', 'leukemia'],
    'Car': ['car crash', 'crash car', 'hit car', 'car accid', 'truck collid']
}

# We want to capture the first word that appears in the string that exists in the dictionary.
# Start with earliest_index being equal to inf so all valid index will be smaller.
# Earliest_index grabs the first index of a word that exists in the dictionary.
# Update values whenever a smaller index exists, otherwise skip.

def identify_cause(text):
    text = word_tokenize(text.lower())
    first_instance = None
    earliest_index = float('inf')
    
    for cause, keys in keywords.items():
        for key in keys:
            # Handling for compound words
            if ' ' in key:
                compound_words = key.split(' ')
                for i in range(len(text) - len(compound_words) + 1):
                    if all(text[i+j] == compound_word for j, compound_word in enumerate(compound_words)):
                        index = i
                        if index < earliest_index:
                            earliest_index = index
                            first_instance = cause
            else:
                # Original single-word handling
                if key in text:
                    index = text.index(key)
                    if index < earliest_index:
                        earliest_index = index
                        first_instance = cause
                    
    if first_instance:
        return first_instance
    return 'Other'

new_df['cause_of_death'] = new_df['processed_text'].apply(identify_cause)

In [34]:
# Spot check what is left in other.
filtered_df = new_df[new_df['cause_of_death'] == 'Other']

sample_df = filtered_df.sample(n=min(1, len(filtered_df)))

# Print only the 'raw_text' and 'processed_text' columns of the sampled DataFrame
print(sample_df[['raw_text', 'processed_text']])

                                                                                                                                raw_text  \
56091   Saving Mr Banks Saving Mr Banks   Margaret Goff Dies sometime before the film she is seen in flashbacks had by  Emma Thompson \n   

                                                                              processed_text  
56091  save mr bank save mr bank margaret goff die sometim film seen flashback emma thompson  


In [35]:
# Use the value_counts() method to count occurrences of each unique value
count_series = new_df['cause_of_death'].value_counts()

# count_series will already be sorted in descending order by default.
print(count_series)

cause_of_death
Firearms                         20825
Other                            15919
Knives or Cutting Instruments    12009
Impact                            4235
Explosives                        2838
Strangulation                     2572
Ailment                           2335
Fire                              2220
Blunt Objects                     1676
Drowning                          1336
Personal Weapons                   915
Poison                             901
Car                                890
Narcotics                          330
Name: count, dtype: int64


In [36]:
new_df.rename(columns={'title': 'Name', 'text': 'Movie', 'year': 'Year', 'cause_of_death':'Cause of Death'}, inplace=True)
new_df.to_csv('output/Cinemorgue.csv', columns=['Name', 'Movie', 'Year', 'Cause of Death'], index=False)

Top 10 actors, and their top 10 causes of death.

In [39]:
# Group by 'Name' and 'Movie' and count the occurrences
top_10 = new_df.groupby(['Name']).size().reset_index(name='Count')

# Sort by Count in descending order and take the top 10
top_10 = top_10.sort_values(by='Count', ascending=False).head(10)

print(top_10)

                  Name  Count
6323       Danny Trejo     71
5129   Christopher Lee     69
17288  Lance Henriksen     56
30173         Udo Kier     55
14336   John Carradine     47
8588      Eric Roberts     43
3241     Boris Karloff     42
30695    Vincent Price     42
11338     Hideo Murota     41
7034     Dennis Hopper     41


In [40]:
danny_df = (new_df.query("`Name` == 'Danny Trejo'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(danny_df)

           Name                 Cause of Death  Count
5   Danny Trejo                       Firearms     27
8   Danny Trejo                          Other     16
7   Danny Trejo  Knives or Cutting Instruments     11
3   Danny Trejo                     Explosives      4
4   Danny Trejo                           Fire      3
6   Danny Trejo                         Impact      3
2   Danny Trejo                       Drowning      2
9   Danny Trejo               Personal Weapons      2
0   Danny Trejo                  Blunt Objects      1
1   Danny Trejo                            Car      1
10  Danny Trejo                  Strangulation      1


In [42]:
christopher_df = (new_df.query("`Name` == 'Christopher Lee'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(christopher_df)

              Name                 Cause of Death  Count
7  Christopher Lee  Knives or Cutting Instruments     20
8  Christopher Lee                          Other     13
4  Christopher Lee                           Fire     10
6  Christopher Lee                         Impact      8
5  Christopher Lee                       Firearms      6
2  Christopher Lee                       Drowning      4
0  Christopher Lee                        Ailment      3
3  Christopher Lee                     Explosives      2
9  Christopher Lee                  Strangulation      2
1  Christopher Lee                            Car      1


In [43]:
lance_df = (new_df.query("`Name` == 'Lance Henriksen'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(lance_df)

              Name                 Cause of Death  Count
3  Lance Henriksen                       Firearms     17
6  Lance Henriksen                          Other     15
5  Lance Henriksen  Knives or Cutting Instruments     14
1  Lance Henriksen                     Explosives      3
4  Lance Henriksen                         Impact      3
0  Lance Henriksen                        Ailment      2
2  Lance Henriksen                           Fire      1
7  Lance Henriksen                         Poison      1


In [44]:
udo_df = (new_df.query("`Name` == 'Udo Kier'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(udo_df)

       Name                 Cause of Death  Count
7  Udo Kier                          Other     15
4  Udo Kier                       Firearms     14
6  Udo Kier  Knives or Cutting Instruments     10
5  Udo Kier                         Impact      4
1  Udo Kier                  Blunt Objects      3
2  Udo Kier                     Explosives      3
8  Udo Kier               Personal Weapons      2
9  Udo Kier                         Poison      2
0  Udo Kier                        Ailment      1
3  Udo Kier                           Fire      1


In [45]:
john_df = (new_df.query("`Name` == 'John Carradine'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(john_df)

              Name                 Cause of Death  Count
6   John Carradine                       Firearms     12
9   John Carradine                          Other     12
7   John Carradine                         Impact      5
0   John Carradine                        Ailment      4
8   John Carradine  Knives or Cutting Instruments      4
5   John Carradine                           Fire      3
1   John Carradine                  Blunt Objects      2
4   John Carradine                     Explosives      2
2   John Carradine                            Car      1
3   John Carradine                       Drowning      1
10  John Carradine                  Strangulation      1


In [46]:
eric_df = (new_df.query("`Name` == 'Eric Roberts'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(eric_df)

           Name                 Cause of Death  Count
4  Eric Roberts                       Firearms     16
8  Eric Roberts                          Other      9
6  Eric Roberts  Knives or Cutting Instruments      6
0  Eric Roberts                  Blunt Objects      3
2  Eric Roberts                     Explosives      3
5  Eric Roberts                         Impact      2
1  Eric Roberts                            Car      1
3  Eric Roberts                           Fire      1
7  Eric Roberts                      Narcotics      1
9  Eric Roberts               Personal Weapons      1


In [47]:
boris_df = (new_df.query("`Name` == 'Boris Karloff'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(boris_df)

            Name                 Cause of Death  Count
5  Boris Karloff                       Firearms     12
7  Boris Karloff  Knives or Cutting Instruments      6
8  Boris Karloff                          Other      6
4  Boris Karloff                           Fire      4
6  Boris Karloff                         Impact      4
3  Boris Karloff                     Explosives      3
9  Boris Karloff                  Strangulation      3
2  Boris Karloff                       Drowning      2
0  Boris Karloff                        Ailment      1
1  Boris Karloff                  Blunt Objects      1


In [49]:
vincent_df = (new_df.query("`Name` == 'Vincent Price'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(vincent_df)

            Name                 Cause of Death  Count
4  Vincent Price                       Firearms     12
3  Vincent Price                           Fire      7
5  Vincent Price                         Impact      6
7  Vincent Price                          Other      5
0  Vincent Price                        Ailment      3
2  Vincent Price                     Explosives      3
1  Vincent Price                       Drowning      2
8  Vincent Price                         Poison      2
6  Vincent Price  Knives or Cutting Instruments      1
9  Vincent Price                  Strangulation      1


In [50]:
hideo_df = (new_df.query("`Name` == 'Hideo Murota'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(hideo_df)

           Name                 Cause of Death  Count
3  Hideo Murota  Knives or Cutting Instruments     15
1  Hideo Murota                       Firearms     14
4  Hideo Murota                          Other      5
0  Hideo Murota                  Blunt Objects      4
2  Hideo Murota                         Impact      2
5  Hideo Murota                  Strangulation      1


In [51]:
dennis_df = (new_df.query("`Name` == 'Dennis Hopper'")
             .groupby(['Name', 'Cause of Death'])
             .size()
             .reset_index(name='Count')
             .sort_values('Count', ascending=False))

print(dennis_df)

            Name                 Cause of Death  Count
4  Dennis Hopper                       Firearms     16
6  Dennis Hopper  Knives or Cutting Instruments      6
3  Dennis Hopper                     Explosives      5
7  Dennis Hopper                          Other      4
1  Dennis Hopper                  Blunt Objects      3
0  Dennis Hopper                        Ailment      2
5  Dennis Hopper                         Impact      2
2  Dennis Hopper                            Car      1
8  Dennis Hopper                         Poison      1
9  Dennis Hopper                  Strangulation      1
