In [1]:
import re
import pandas as pd
import os

In [2]:
df = pd.read_csv('Data/regex_df.csv')

In [3]:
df

Unnamed: 0,season,episode,title,url,scripts
0,SEASON 1,1,Cartman Gets an Anal Probe,https://southpark.fandom.com/wiki/Cartman_Gets...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
1,SEASON 1,2,Weight Gain 4000,https://southpark.fandom.com/wiki/Weight_Gain_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
2,SEASON 1,3,Volcano,https://southpark.fandom.com/wiki/Volcano/Script,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
3,SEASON 1,4,Big Gay Al's Big Gay Boat Ride,https://southpark.fandom.com/wiki/Big_Gay_Al%2...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
4,SEASON 1,5,An Elephant Makes Love to a Pig,https://southpark.fandom.com/wiki/An_Elephant_...,Cast\nStan Marsh\nKyle Broflovski\nEric Cartma...
...,...,...,...,...,...
316,SEASON 26,2,The Worldwide Privacy Tour,https://southpark.fandom.com/wiki/The_Worldwid...,Cast\nKyle Broflovski\nStan Marsh\nEric Cartma...
317,SEASON 26,3,Japanese Toilet,https://southpark.fandom.com/wiki/Japanese_Toi...,Cast\nRandy Marsh\nSharon Marsh\nShelley Marsh...
318,SEASON 26,4,Deep Learning,https://southpark.fandom.com/wiki/Deep_Learnin...,Cast\nBebe Stevens\nNelly\nRed McArthur\nNicho...
319,SEASON 26,5,DikinBaus Hot Dogs,https://southpark.fandom.com/wiki/DikinBaus_Ho...,Cast\nButters Stotch\nEric Cartman\nStan Marsh...


In [4]:
script = df.scripts[0]

In [5]:
script

'Cast\nStan Marsh\nKyle Broflovski\nEric Cartman\nKenny McCormick\nIke Broflovski\nChef\nMs. Crabtree\nBill Denkins\nOfficer Barbrady\nMr. Garrison\nMr. Hat\nTrain conductor\nA kid\nJason White\nWendy Testaburger\nLiane Cartman\nNews reporter\nMr. Kitty\nCarl the Visitor\nBlonde woman\nVisitors\nCows\nScript\nCartman Gets an Anal Probe\nAt the bus stop.\nThe boys\nSchool days, school days, teacher\'s golden ru... [Ike runs to the team]\nKyle Broflovski\nAh, damn it! My little brother\'s trying to follow me to school again.\nIke Broflovski\nEe gko zeeponanner.\nKyle\nIke, you can\'t come to school with me. [Ike chortles.]\nEric Cartman\nYeah, go home you little dildo!\nKyle\nDude, don\'t call my brother a dildo!\nStan Marsh\nWhat\'s a dildo?\nKyle\nWell, I don\'t know...[faces Cartman and points at him.] and I\'ll bet Cartman doesn\'t know either!\nCartman\nI know what it means!\nKyle\nWell, what?\nCartman\nI\'m not telling you.\nStan\nWhat\'s a dildo, Kenny?\nKenny McCormick\n(It\'s a 

# Remove anything inbetween square brackets

(Including the square brackets themselves).

You'll notice in the script above that we have some text within brackets which describes character's behaviour.

E.g.:

* [Ike chortles.]
* [They laugh.]
* [His voice is rising to an audible level.]

This is interesting info but nonetheless irrelevant for our main aims, so we're going to trawl through every script in our dataframe and remove text within the square brackets, as well as the square brackets themselves.

REGEX PATTERN = \[.*?]

* \[ - finds the first opening square bracket. \ is an escape sequence used to match a character which usually has a special meaning. Square brackets usually indicate a character class.

* . - meta-character which matches any one character

* asterix is an occurance indicator which indicates to match 0 or more times

* ? - is a lazy operator which when used after * makes the occurance indicator non-greedy, meaning it'll match as few characters as possible. 

Removing the ? would make this regex pattern greedy, meaning it matched everything from the first open bracket to the last open bracket. E.g. it'd return this: ['[example1] some text [example2]']

Whereas, keeping it means we match as few characters as possible to stop at the first closing bracket, allowing for multiple matches. E.g. we'd get separate matches: ['[example1]', '[example2]']

**NOTE:** If we just wanted to capture the text within the square brackets, we'd use '()' to capture a group.

## Small test

In [38]:
df.scripts[9] = df.scripts[9].replace("[Clouds begin to conceal the sun.", "[Clouds begin to conceal the sun.]")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.scripts[9] = df.scripts[9].replace("[Clouds begin to conceal the sun.", "[Clouds begin to conceal the sun.]")


In [39]:
test = re.sub("\[.*?]", '', script, flags = re.DOTALL)

In [40]:
test

'Cast\nStan Marsh\nKyle Broflovski\nEric Cartman\nKenny McCormick\nIke Broflovski\nChef\nMs. Crabtree\nBill Denkins\nOfficer Barbrady\nMr. Garrison\nMr. Hat\nTrain conductor\nA kid\nJason White\nWendy Testaburger\nLiane Cartman\nNews reporter\nMr. Kitty\nCarl the Visitor\nBlonde woman\nVisitors\nCows\nScript\nCartman Gets an Anal Probe\nAt the bus stop.\nThe boys\nSchool days, school days, teacher\'s golden ru... \nKyle Broflovski\nAh, damn it! My little brother\'s trying to follow me to school again.\nIke Broflovski\nEe gko zeeponanner.\nKyle\nIke, you can\'t come to school with me. \nEric Cartman\nYeah, go home you little dildo!\nKyle\nDude, don\'t call my brother a dildo!\nStan Marsh\nWhat\'s a dildo?\nKyle\nWell, I don\'t know... and I\'ll bet Cartman doesn\'t know either!\nCartman\nI know what it means!\nKyle\nWell, what?\nCartman\nI\'m not telling you.\nStan\nWhat\'s a dildo, Kenny?\nKenny McCormick\n(It\'s a giant stick that goes inside the mom’s vagina.) \nCartman\nHe-yeah, tha

In [41]:
matches = re.findall("\[.*?]", script, flags=re.DOTALL)

In [42]:
s = df.scripts

In [43]:
brackets = s.str.findall("\[.*?]", flags = re.DOTALL)

In [51]:
from __future__ import unicode_literals
import spacy,en_core_web_sm
import textacy
nlp = en_core_web_sm.load()

In [84]:
import spacy   
from spacy.matcher import Matcher
from spacy.util import filter_spans
# instantiate a Matcher instance
matcher = Matcher(nlp.vocab)
pattern = [{'POS':'VERB'}]
matcher.add("Verb", [pattern])
nlp = spacy.load('en_core_web_sm') 
verbs = []

for i in brackets:
    for x in i:
        doc = nlp(x)
        matches = matcher(doc)
        spans = [doc[start:end] for _, start, end in matches]
        for verb in spans:
            if verb != None:
                verbs.append(spans)

In [95]:
brackets[69]

['[voice only]',
 "[shot of the TV, with the camera panning up to reveal Punky Brewster sitting in a director's chair]",
 "[image of Punky makes room for the show's logo on the right side]",
 '[Cartman changes the channel]',
 '[Shot of TV as "The Terrance & Phillip Show" begins]',
 '[farts]',
 '[both laugh]',
 '[faaaaaaaaaaaaaaaart. Whatever Terrance is saying is drowned out by the fart]',
 '[boys join in]',
 '[Kyle chimes in]',
 '[Terrance lowers his pants enough to show his ass; Phillip sticks his finger in there]',
 "[now wearing a mechanic's hat]",
 "[the two laugh, then the boys laugh, then the laughter dies down. Terrance farts again, and they both laugh as the camera's iris shuts slowly.]",
 '[toilet water erupts from the toilet and blows them off camera]',
 '[On TV, "HEY KIDS!!," white on green, appears and gets bigger, then dances on a blinking background]',
 '["Terrance and Phillip LIVE! And in person" appears, then Terrance and Phillip flank the text on either side]',
 '[sce

In [97]:
fin_verbs = []

for i in verbs:
    for words in i:
        print(words)
        fin_verbs.append(words)

runs
chortles
faces
laugh
knocking
laughs
drives
gets
drives
gets
Waddles
Turning
see
Turning
see
runs
laugh
drowned
rising
walks
shouts
fire
Drives
flashing
Drives
flashing
pursues
fire
hands
leaves
follow
glances
gets
shouts
hear
shouts
hear
Turning
face
Turning
face
Pulling
gets
gets
walks
hurries
walks
stops
Turns
moves
walks
following
walks
following
walks
fire
looks
Driving
stops
Driving
stops
appear
shrugs
leave
Thinking
Passing
Stopping
singing
Turning
face
Turning
face
Shrieking
Hushed
Rushing
Takes
run
Flashes
Crowd
Snapping
Muffled
Whining
Arresting
Drops
Beeps
Thump
turns
Rowing
Shoots
Sees
dressed
Sees
dressed
Hushed
drops
leave
Looking
distracted
sees
laugh
starts
punches
punches
leaves
enter
heckling
rushes
shaking
mocking
Whining
laughs
Wagging
Pissed
Swooning
Heading
labored
stopping
hushed
pointing
concerned
darting
leans
Getting
Taunting
torches
begin
conceal
begin
conceal
Holding
Sees
coming
Sees
coming
moves
Falls
Drops
leaves
leaves
Thumps
Tosses
pinned
Slapping
G

begins
play
begin
talk
activates
appears
begin
talk
activates
appears
begin
talk
activates
appears
begin
talk
activates
appears
walks
sticks
starting
rising
leaves
faces
starting
rising
leaves
faces
starting
rising
leaves
faces
starting
rising
leaves
faces
collapses
begins
cry
collapses
begins
cry
collapses
begins
cry
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
rush
surround
pour
play
stops
buys
hear
look
surround
sticks
subdued
surrounded
subdued
surrounded
subdued
collect
draws
turns
moves
remains
glances
remains
glances
starts
taking
starts
taking
shown
arrested
shown
arrested
appears
says
appears
says

In [103]:
fin_verbs

[runs,
 chortles,
 faces,
 laugh,
 knocking,
 laughs,
 drives,
 gets,
 drives,
 gets,
 Waddles,
 Turning,
 see,
 Turning,
 see,
 runs,
 laugh,
 drowned,
 rising,
 walks,
 shouts,
 fire,
 Drives,
 flashing,
 Drives,
 flashing,
 pursues,
 fire,
 hands,
 leaves,
 follow,
 glances,
 gets,
 shouts,
 hear,
 shouts,
 hear,
 Turning,
 face,
 Turning,
 face,
 Pulling,
 gets,
 gets,
 walks,
 hurries,
 walks,
 stops,
 Turns,
 moves,
 walks,
 following,
 walks,
 following,
 walks,
 fire,
 looks,
 Driving,
 stops,
 Driving,
 stops,
 appear,
 shrugs,
 leave,
 Thinking,
 Passing,
 Stopping,
 singing,
 Turning,
 face,
 Turning,
 face,
 Shrieking,
 Hushed,
 Rushing,
 Takes,
 run,
 Flashes,
 Crowd,
 Snapping,
 Muffled,
 Whining,
 Arresting,
 Drops,
 Beeps,
 Thump,
 turns,
 Rowing,
 Shoots,
 Sees,
 dressed,
 Sees,
 dressed,
 Hushed,
 drops,
 leave,
 Looking,
 distracted,
 sees,
 laugh,
 starts,
 punches,
 punches,
 leaves,
 enter,
 heckling,
 rushes,
 shaking,
 mocking,
 Whining,
 laughs,
 Wagging,
 Piss

In [105]:
from collections import Counter

Counter(fin_verbs)

Counter({plays: 27,
         following: 27,
         flows: 27,
         closes: 27,
         handwritten: 27,
         looks: 27,
         playing: 27,
         like: 27,
         listen: 27,
         looks: 27,
         returns: 27,
         stops: 27,
         putting: 27,
         listen: 27,
         wearing: 27,
         dancing: 27,
         looks: 27,
         waves: 27,
         conducting: 27,
         listening: 27,
         wearing: 27,
         dancing: 27,
         looking: 27,
         moves: 27,
         delete: 27,
         gone: 27,
         shuts: 27,
         arrives: 24,
         grabs: 24,
         butters: 24,
         leaves: 24,
         rubs: 24,
         struggles: 24,
         pops: 24,
         falling: 24,
         stumbles: 24,
         rises: 24,
         stumbles: 24,
         stands: 24,
         raises: 24,
         raises: 24,
         exposed: 24,
         notices: 24,
         lowers: 24,
         raises: 24,
         rises: 24,
         lowers: 24

## Custom function to remove brackets

In [None]:
df['scripts'].replace(to_replace="\[.*?]", value="", regex=True, inplace=True)

In [None]:
df.scripts[2]

In [None]:
from __future__ import unicode_literals
import spacy,en_core_web_sm
import textacy

In [None]:
type(matches)

In [None]:
for i in matches:
    print(i)

In [None]:
re.sub("End of {title}.*", '', script, flags=re.DOTALL)

In [None]:
pattern = 'Cartman'
c_matches = re.findall(pattern, script)

In [None]:
pattern = 'Kenny'
k_matches = re.findall(pattern, script)

In [None]:
pattern = 'Stan'
s_matches = re.findall(pattern, script)

In [None]:
pattern = 'Kyle'
ky_matches = re.findall(pattern, script)

In [None]:
len(matches)

In [None]:
len(k_matches)

In [None]:
len(s_matches)

In [None]:
len(ky_matches)