In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
files = ["SW_EpisodeIV.txt", "SW_EpisodeV.txt", "SW_EpisodeVI.txt"]

In [209]:
# load files
df = pd.read_csv("star_wars_1_data.csv", sep=";")
df.iloc[0]

id                                           1.0
from                                     QUI-GON
to                                       CAPTAIN
text                                    Captain.
text to check                                NaN
where            INT. REPUBLIC CRUISER - COCKPIT
number                                         1
nouns                                          0
determiners                                    0
conjuctions                                    0
adjectives                                     0
prepositions                                   0
pronouns                                       0
verbs                                          0
adverbs                                        0
undefined                                      0
time                                           9
Name: 0, dtype: object

In [212]:
# fix in data
df.loc[df["to"]=="QUI-GON :Your negotiations seem", "text"] = "Your negotiations seem"
df["to"] = df["to"].str.replace("QUI-GON :Your negotiations seem", "QUI-GON")

df.loc[df["to"]=="ANAKIN: The forward stabalizer.", "text"] = "The forward stabalizer"
df["to"] = df["to"].str.replace("ANAKIN: The forward stabalizer.", "ANAKIN")

df.loc[df["to"]=="QUI-GON :They wouldn't dare.", "text"] = "They wouldn't dare."
df["to"] = df["to"].str.replace("QUI-GON :They wouldn't dare.", "QUI-GON")


# replace incorrect names
names_to_replace = { "PALAPATINE": "PALPATINE",
                    "APDME": "PADME",
                    "CONTROLER": "CONTROLLER",
                    "ANAKAIN": "ANAKIN",
                    "AMADILA": "AMIDALA",
                    "CATP. PANAKA":"CAPT. PANAKA",
                    "CAPR. PANAKA": "CAPT. PANAKA",
                    "MACE IWNDU":"MACE WINDU",
                    "ARTOO": "R2D2",
                    "THREEPIO": "C-3PO",
                    "PILOTS": "PILOT",
                    "OOM-CAPTAIN": "CAPTAIN"
                }

for old_text, new_text in names_to_replace.items():
    df["to"] = df["to"].str.replace(old_text,new_text)
    df["from"] = df["from"].str.replace(old_text,new_text)

In [221]:
# Identify secondary actors (speak/spoken to < 10 times)
_from = df["from"].value_counts().to_frame()
_to = df["to"].value_counts().to_frame()

total = pd.concat([_from, _to], axis=1)
total["Total"] = total.sum(axis=1)

total.to_csv("secondary_actors.csv")
total[total["Total"]<10].index

Index(['KI-ADI', 'KITSTER', 'TC-14', 'CAPTAIN', 'DOFINE', 'TEY HOW', 'RABE',
       'GUARD DROID', 'DARTH MAUL', 'GENERAL CEEL', 'OWO-1', 'LOTT DOD',
       'JABBA', 'QUI-GON GUARD DROID', 'CAPT. TARPALS', 'AKS MOE',
       'MAS AMEDDA', 'GUARD', 'c', 'SABE', 'BRAVO TWO', 'GUNGAN LOOKOUT',
       'ODY', 'PILOT', 'TC14', 'VENDOR', 'CAPORAL', 'SEEK', 'AMEE', 'WALD',
       'JIRA', 'CONTROLLER', 'BRAVO THREE', 'SENAT', 'SOLDATS', 'RACERS',
       'SOLDIER', 'DROIDS'],
      dtype='object')

In [222]:
# get clean data for chord diagram
df["to_clean"] = df["to"]
df["from_clean"] = df["from"]

# rename secondary actors
for name in total[total["Total"]<10].index:
    df["to_clean"] = df["to_clean"].str.replace(name, "Other")
    df["from_clean"] = df["from_clean"].str.replace(name, "Other")

In [241]:
# exlude unknown speakers
filter = (~df["to_clean"].isin(["X_X", "PUBLIC", "SPEAKER-A", "SPEAKER B"])) & (~df["from_clean"].isin(["X_X", "PUBLIC", "SPEAKER-A", "SPEAKER B"]))
chord = df[filter][["from_clean", "to_clean"]].value_counts().to_frame().reset_index()

# sort by total appearances
chord = pd.merge(chord, total["Total"], left_on="from_clean", right_index=True, how="left")
chord.sort_values(by="Total", ascending=False).to_csv("test_chord.csv")

In [242]:
# test chord
filter = (df["to_clean"]!="X_X") & (df["from_clean"]!="X_X")
chord2 = df[filter].groupby(["from_clean", "to_clean"])["number"].sum().to_frame()

chord2.to_csv("test_chord2.csv")

In [236]:
chord

Unnamed: 0,from_clean,to_clean,count,Total
0,ANAKIN,QUI-GON,42,276.0
1,QUI-GON,ANAKIN,40,383.0
2,OBI-WAN,QUI-GON,38,121.0
3,QUI-GON,OBI-WAN,38,383.0
4,JAR JAR,QUI-GON,27,105.0
...,...,...,...,...
142,NUTE,PADME,1,92.0
143,QUI-GON,VALORUM,1,383.0
144,MACE WINDU,Other,1,24.0
145,PADME,JAR JAR,1,95.0


In [244]:
matrix = chord
matrix = matrix.pivot(index="from_clean", columns="to_clean", values="count")
matrix = matrix.fillna(0)

matrix.to_numpy()

#df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])

array([[ 0.,  2.,  3.,  2.,  0.,  5.,  0.,  4.,  0.,  5.,  0.,  0.,  2.,
         1., 11.,  4.,  1.,  0.,  0.,  0.,  0.,  3.,  0.,  0.],
       [ 2.,  0.,  0.,  0.,  3.,  0.,  0.,  1.,  1.,  0.,  2.,  0.,  3.,
        23.,  0., 42., 18.,  1.,  0.,  4., 19.,  0.,  2.,  3.],
       [ 5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,
         1.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  4.,  0.,  0.,  2.,  0.,  0.,
         2.,  0.,  7.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0., 10.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [11.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  4.,  0.,  6.,
         4.,  0.,  9.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0., 14.,  0.,  0.,  3.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 4., 13.,  0.,  3.,  0.,  0.,  0.

In [245]:
matrix.index
#len(matrix.columns)

Index(['AMIDALA', 'ANAKIN', 'BIBBLE', 'BOSS NASS', 'C-3PO', 'CAPT. PANAKA',
       'DARTH SIDIOUS', 'JAR JAR', 'MACE WINDU', 'NUTE', 'OBI-WAN', 'OOM-9',
       'Other', 'PADME', 'PALPATINE', 'QUI-GON', 'R2D2', 'RIC OLIE', 'RUNE',
       'SEBULBA', 'SHMI', 'VALORUM', 'WATTO', 'YODA'],
      dtype='object', name='from_clean')

### Other sources

In [61]:
# load files --> txt files
df = pd.read_csv(files[0], sep='""', quotechar="'",engine="python")

def extract(text):      
  import re
  matches = re.findall(r'"(.+?)"',text)
  return list(matches)

df_new = []
for i in range(len(df)):
  df_new.append(extract(str(df.loc[i].values)))

df_new = pd.DataFrame(df_new, columns=["index", "character", "dialogue"]).drop(columns=["index"])
df_new

Unnamed: 0,character,dialogue
0,THREEPIO,Did you hear that? They\'ve shut down the mai...
1,THREEPIO,We\'re doomed!
2,THREEPIO,There\'ll be no escape for the Princess this t...
3,THREEPIO,What\'s that?
4,THREEPIO,I should have known better than to trust the l...
...,...,...
1005,LUKE,"Oh, no!"
1006,THREEPIO,"Oh, my! Artoo! Can you hear me? Say somethi..."
1007,TECHNICIAN,We\'ll get to work on him right away.
1008,THREEPIO,"You must repair him! Sir, if any of my circui..."
