# Import necessary libraries

In [56]:
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import spacy
import numpy as np
from spacy import displacy
import re

In [49]:
# Download spaCy's small English pipeline, which is loaded by name further down.
# NOTE(review): `python3` resolves via PATH and may not be the interpreter the
# kernel runs on — confirm the model is installed into the kernel's environment.
!python3 -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [50]:
# Load the small English spaCy pipeline. The object is named NER because it is
# used below to annotate script text for entity visualisation.
NER = spacy.load("en_core_web_sm")

# Read-in data

In [51]:
# Load the episode scripts table (path is relative to the working directory).
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV was saved together with its index — consider index_col=0 if that
# column is not needed downstream.
df = pd.read_csv('Raw_data/scripts.csv')

In [None]:
# Quick glance at the first few rows of the data

df.head()

# Sentence tokenisation

In [52]:
# Let's have a look at the first script for S1:E1
# (the whole page text is shown, including the wiki navigation before the script)

df.scripts[0]

'Cartman Gets an Anal Probe/Script | South Park Archives | Fandom\nSouth Park ArchivesA new Paramount Plus Special Premieres May 24th on Paramount Plus! Click to learn more HERE\nJoin the SPA Discord to discuss the announcement with other fans and wiki editors. HEREREAD MORE\nSouth Park Archives\nExplore\nMain Page\nAll Pages\nCommunity\nInteractive Maps\nRecent Blog Posts\nSouth Park: The End of Obesity\nCharacters\nMain Characters\nEric Cartman\nStan Marsh\nKyle Broflovski\nKenny McCormick\nMajor Characters\nButters Stotch\nHerbert Garrison\nRandy Marsh\nFamilies\nMain Characters\' Families\nOther Characters\' Families\nPossible Families\nSupporting Characters\nAnnie Knitts\nBebe Stevens\nClyde Donovan\nCraig Tucker\nDougie O\'Connell\nMore...\nOther Characters\nSchool Characters\nCreatures\nCelebrities\nAlter Egos\nGroups\nUnnamed Characters\nAll Characters\nEpisodes\nLatest Episode\nSeasons 1-5\nSeason One\nSeason Two\nSeason Three\nSeason Four\nSeason Five\nSeasons 6-10\nSeason Si

In [53]:
# And let's look at one of the later ones (index label 300) to compare the
# page layout with the first script

df.scripts[300]

'Let Them Eat Goo/Script | South Park Archives | Fandom\nSouth Park ArchivesA new Paramount Plus Special Premieres May 24th on Paramount Plus! Click to learn more HERE\nJoin the SPA Discord to discuss the announcement with other fans and wiki editors. HEREREAD MORE\nSouth Park Archives\nExplore\nMain Page\nAll Pages\nCommunity\nInteractive Maps\nRecent Blog Posts\nSouth Park: The End of Obesity\nCharacters\nMain Characters\nEric Cartman\nStan Marsh\nKyle Broflovski\nKenny McCormick\nMajor Characters\nButters Stotch\nHerbert Garrison\nRandy Marsh\nFamilies\nMain Characters\' Families\nOther Characters\' Families\nPossible Families\nSupporting Characters\nAnnie Knitts\nBebe Stevens\nClyde Donovan\nCraig Tucker\nDougie O\'Connell\nMore...\nOther Characters\nSchool Characters\nCreatures\nCelebrities\nAlter Egos\nGroups\nUnnamed Characters\nAll Characters\nEpisodes\nLatest Episode\nSeasons 1-5\nSeason One\nSeason Two\nSeason Three\nSeason Four\nSeason Five\nSeasons 6-10\nSeason Six\nSeason 

Okay, we can start to see some common features and areas where we can trim the scripts. It seems that each script:

* Starts after the "Cast" of characters ends
* Ends with: "End of" followed by the episode name

So the plan is to keep everything from the "Cast" heading onwards and to drop whatever follows the "End of" line that closes each script.

In [57]:
# Drop the leading page content: everything from the start of the text up to
# and including the first "Cast" is collapsed to just "Cast". DOTALL lets `.`
# cross newlines, so the match can span many lines.
df['scripts'] = df['scripts'].map(
    lambda page_text: re.sub(r'^.*?Cast', 'Cast', page_text, flags=re.DOTALL)
)

In [58]:
# Inspect the frame: each script should now begin with "Cast"
df

Unnamed: 0,season,episode,title,url,scripts
0,SEASON 1,1,Cartman Gets an Anal Probe,https://southpark.fandom.com/wiki/Cartman_Gets...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
1,SEASON 1,2,Weight Gain 4000,https://southpark.fandom.com/wiki/Weight_Gain_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
2,SEASON 1,3,Volcano,https://southpark.fandom.com/wiki/Volcano/Script,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
3,SEASON 1,4,Big Gay Al's Big Gay Boat Ride,https://southpark.fandom.com/wiki/Big_Gay_Al%2...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
4,SEASON 1,5,An Elephant Makes Love to a Pig,https://southpark.fandom.com/wiki/An_Elephant_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
...,...,...,...,...,...
316,SEASON 26,2,The Worldwide Privacy Tour,https://southpark.fandom.com/wiki/The_Worldwid...,Cast\nKyle Broflovski\nStan Marsh\nEric Cartma...
317,SEASON 26,3,Japanese Toilet,https://southpark.fandom.com/wiki/Japanese_Toi...,Cast\nRandy Marsh\nSharon Marsh\nShelley Marsh...
318,SEASON 26,4,Deep Learning,https://southpark.fandom.com/wiki/Deep_Learnin...,Cast\nBebe Stevens\nNelly\nRed McArthur\nNicho...
319,SEASON 26,5,DikinBaus Hot Dogs,https://southpark.fandom.com/wiki/DikinBaus_Ho...,Cast\nButters Stotch\nEric Cartman\nStan Marsh...


In [75]:
# Episode title used for a one-off trial of the end-of-script trimming
# (this is the title shown for row 319 in the frame above)
fep = "DikinBaus Hot Dogs"

In [76]:
# Pattern for the "End of <title>" marker plus everything that follows it.
# re.escape keeps the title literal, so a title containing regex
# metacharacters (e.g. "?", "(", ".") cannot change or break the pattern.
pat = rf"End of {re.escape(fep)}.*"

In [77]:
# Script text for index label 319, used as the trial input
test_case = df.scripts[319]

In [78]:
# Remove the end marker and everything after it from the trial script.
# DOTALL makes `.*` run across newlines to the end of the text.
new = re.sub(pat, '', test_case, flags = re.DOTALL)

In [79]:
# Inspect the trimmed trial script
new

'Cast\nButters Stotch\nEric Cartman\nStan Marsh\nKyle Broflovski\nLiane Cartman\nManager\nChild\nKenny McCormick\nBanker\nReporter\nConstruction Worker\nStephen Stotch\nLinda Stotch\nTeenager\nDarryl Weathers\nScott Malkinson\nToddler\nClyde Donovan\nFemale Customer\nMale Customer\nJimbo Kern\nMayor McDaniels\nMover\nScript\nDikinBaus Hot Dogs\nThe four boys are playing basketball, when Butters runs up to them.\nButters\nFellas! Hey, fellas, you\'re not gonna believe it!\nCartman\nButters, what the hell are you wearing?\nButters\nI got a job over at the ice cream shop! Guess what? I got my very first paycheck!\nCartman\nThat\'s not fair! I want a paycheck!\nButters\nMy dad told me if I got a job, we could put my paychecks in my very own bank account.\nCartman\nI want a bank account!\nStan\nI didn\'t know kids could get jobs.\nButters\nYeah, well, I guess these days it\'s really hard for businesses to find people to work, so they\'ll take whatever they can get.\nCartman\nI want to take 

In [80]:
def match_title(script, titles=None):
    """Cut a script off right after its "End of <title>" marker.

    For every candidate title, the first occurrence of the literal text
    ``End of <title>`` is located; the script is truncated immediately after
    that marker, so the marker is kept and everything following it is dropped.

    Titles are matched as plain text rather than as regular expressions, so
    titles containing characters such as ``?``, ``(`` or ``.`` are handled
    correctly instead of being interpreted as regex syntax.

    Parameters
    ----------
    script : str
        Full text of one script.
    titles : iterable of str, optional
        Candidate titles to look for. Defaults to the ``title`` column of the
        notebook-level ``df``.

    Returns
    -------
    str
        The truncated script, or the script unchanged when no marker is found.
    """
    if titles is None:
        titles = df.title
    for title in titles:
        if not isinstance(title, str):
            # A missing title (NaN) cannot form a marker; skip it.
            continue
        marker = f"End of {title}"
        marker_start = script.find(marker)
        if marker_start != -1:
            # Keep the marker itself, drop everything after it.
            script = script[:marker_start + len(marker)]
    return script
# FINISH UP


In [81]:
# Trim every script after its "End of <title>" marker.
# Note: match_title is called once per script and checks the titles in
# df.title on each call, so this step scales with scripts x titles.
df['scripts'] = df['scripts'].apply(match_title)

In [82]:
# Inspect the frame after trimming the script endings
df

Unnamed: 0,season,episode,title,url,scripts
0,SEASON 1,1,Cartman Gets an Anal Probe,https://southpark.fandom.com/wiki/Cartman_Gets...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
1,SEASON 1,2,Weight Gain 4000,https://southpark.fandom.com/wiki/Weight_Gain_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
2,SEASON 1,3,Volcano,https://southpark.fandom.com/wiki/Volcano/Script,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
3,SEASON 1,4,Big Gay Al's Big Gay Boat Ride,https://southpark.fandom.com/wiki/Big_Gay_Al%2...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
4,SEASON 1,5,An Elephant Makes Love to a Pig,https://southpark.fandom.com/wiki/An_Elephant_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
...,...,...,...,...,...
316,SEASON 26,2,The Worldwide Privacy Tour,https://southpark.fandom.com/wiki/The_Worldwid...,Cast\nKyle Broflovski\nStan Marsh\nEric Cartma...
317,SEASON 26,3,Japanese Toilet,https://southpark.fandom.com/wiki/Japanese_Toi...,Cast\nRandy Marsh\nSharon Marsh\nShelley Marsh...
318,SEASON 26,4,Deep Learning,https://southpark.fandom.com/wiki/Deep_Learnin...,Cast\nBebe Stevens\nNelly\nRed McArthur\nNicho...
319,SEASON 26,5,DikinBaus Hot Dogs,https://southpark.fandom.com/wiki/DikinBaus_Ho...,Cast\nButters Stotch\nEric Cartman\nStan Marsh...


In [89]:
# Spot-check one trimmed script (index label 288)
df.scripts[288]

'Cast[]\nEric Cartman\nButters Stotch\nStan Marsh\nKyle Broflovski\nClyde Donovan\nFather Maxi\nRandy Marsh\nSharon Marsh\nShelly Marsh\nStephen Stotch\nLinda Stotch\nRyan Valmer\nStuart McCormick\nMr. Mackey\nJimbo Kern\nTed and Hazel\nJosie and townsman (her husband)\nTownsfolk\nCatholic Cleanup Crew\nScript[]\nA Boy And A Priest\nThe Marsh house, day. The family exits and goes to the car. Stan\'s right arm is in a cast and sling from last week\'s gunshot at the end of the episode.\nRandy\nCome on, guys, we don\'t wanna be late!\nStan\n[trailing behind] Do I have to go? It\'s the only day I get to play games.\nRandy\nWe\'re all going. Come on! [the others get in]\nOn the road. Stan sits behind Randy, Shelley sits behind Sharon\nStan\n[upset] ...Don\'t understand why we have to go to church every Sunday.\nRandy\nChurch is important, Stan. Way more important than video games and TV. Church is about community and coming together. A lot of things.\nSharon\nAll I know is that after church

# Export to CSV (for now)


In [91]:
# Show the working directory; the data paths used in this notebook are relative to it
os.getcwd()

'/Users/loucap/Documents/GitWork/SNA'

In [96]:
# Write the trimmed scripts to CSV without the index column.
# NOTE(review): DataFrame.to_csv returns None when given a path, so `regex_df`
# ends up as None rather than a DataFrame — the assignment can likely be dropped.
regex_df = df.to_csv('Data/regex_df.csv', index = False)

In [46]:
# Run the spaCy pipeline over a single script.
# NOTE(review): the frame displayed earlier ends at index label 319, so label
# 320 looks out of range here — confirm which script was intended.
doc = NER(df.scripts[320])

In [47]:
# Visualise the named entities found in the parsed script.
# NOTE(review): inside Jupyter, displacy.render usually displays the markup
# itself and returns None — confirm `html` holds what is expected, or pass
# jupyter=False to get the markup string back.
html = displacy.render(doc, style="ent")

In [None]:
# Load the character table.
# NOTE(review): this path goes up one directory ('../Data'), whereas the export
# cell writes to 'Data/' under the working directory — confirm which is right.
character_df = pd.read_csv('../Data/main_cat.csv')

In [None]:
# Manually set the character name for the row with index label 1176.
# NOTE(review): presumably this row's name is missing in the source file — verify.
character_df.loc[1176, 'Character'] = 'Member Berries'

In [None]:
# List any rows whose Character value is still missing
character_df[character_df.Character.isna()]

In [None]:
# Strip parenthesised qualifiers (e.g. "Name (alias)") from the character names
# and derive a first-name column from what remains.
# - The pattern is a raw string; the earlier non-raw "[\(]...[\)]" form relied
#   on invalid string escapes, which newer Python versions warn about.
# - The pattern also consumes whitespace in front of the parenthesis and the
#   result is stripped, so no dangling spaces are left in the names.
# - The vectorised .str methods pass missing values through as NaN instead of
#   raising TypeError the way re.sub / str.split would on a float.
# The redundant mid-notebook `import re` is gone: the regex is handled by pandas.
character_df['Character'] = (
    character_df['Character']
    .str.replace(r"\s*\(.*?\)", "", regex=True)
    .str.strip()
)
character_df['First_name'] = character_df['Character'].str.split(' ', n=1).str[0]

In [None]:
# Keep the characters whose name contains a title ("Mr. ", "Dr. ") or a
# possessive ("...'s").
# - Non-capturing group: str.contains emits a warning when the pattern has
#   capture groups, because the groups are never used.
# - na=False: a missing name counts as "no match", keeping the mask strictly
#   boolean so it can be used for row selection.
filtered = character_df[
    character_df['Character'].str.contains(r"\b(?:Mr\. |Dr\. |[a-zA-Z]+'s)\b", na=False)
]


In [None]:
# Preview up to 20 filtered names containing the literal text "Mr.".
# regex=False stops the dot from acting as a wildcard, which would otherwise
# also match strings such as "Mrs".
filtered[filtered.Character.str.contains('Mr.', regex=False)].head(20)

In [None]:
# Shorthand for the character-name column
s = character_df.Character

In [None]:
# First whitespace-separated token of each name (pat=None splits on whitespace,
# n=1 limits it to a single split)
s.str.split(pat = None, n = 1).str[0]

In [None]:
# Flatten the text onto one line by turning newlines into spaces.
# NOTE(review): `test` is not defined anywhere in the visible notebook, so this
# cell fails on a fresh kernel — presumably it should start from a script such
# as `test_case`; confirm and define it explicitly.
test = test.replace('\n', ' ')

In [None]:
# Inspect the flattened text
test

In [None]:
# Toy input: an ellipsis joined directly to the next sentence, to see how the
# sentence tokenizer handles it
sample = "Oh crap. Here we go again...I don't believe this. Believe what?"

In [None]:
# Split the toy input into sentences
# (assumes the NLTK Punkt tokenizer data is already available — TODO confirm)
sent_tokenize(sample)

In [None]:
# Second toy input: exclamations plus an ellipsis followed by an upper-case word
sample = "The young girl took a walk. Hey you! Get over here now! The man yelled at the girl. I must've told you I talk to you...NOW"

In [None]:
# Split the second toy input into sentences
sent_tokenize(sample)

In [None]:


# Sentence-tokenise the flattened text and print the resulting list.
# NOTE(review): relies on `test`, which is not defined in the visible notebook.
sentence_tokens = sent_tokenize(test)
print("Sentence Tokens:", sentence_tokens)