<a href="https://colab.research.google.com/github/Guimol/Star-Wars-Characters-Relations/blob/main/Star_Wars_Characters_Relations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###### Change page's CSS to be more visually appealing

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

# Importing Libraries

In [2]:
!pip install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from scipy.special import softmax
from typing import Tuple
from io import open
import statistics
import requests
import time
import re

# Files obtention
* Path for the data file (external link): [Star Wars Movie Scripts](https://www.kaggle.com/datasets/xvivancos/star-wars-movie-scripts)
* GitHub repository: [Star Wars Characters Relations](https://github.com/Guimol/Star-Wars-Characters-Relations)

In [4]:
movie_files = {
  "movieIV": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeIV.txt",
  "movieV": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeV.txt",
  "movieVI": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeVI.txt",
}

# Initializing movie dictionaries
movies = dict()

# Creating local files for the corpus and opening them
for title, link in movie_files.items():
  # Access a link
  r = requests.get(link, allow_redirects=True)

  # Read file in the link and store it locally
  open(title + '.txt', 'wb').write(r.content)
  
  # Fill the dictionary with data obtained in the local file
  movies[title] = {"raw": open(title + '.txt', 'r').readlines()}

# Text Preprocessing

Making the dialogs lower case

In [5]:
for title in movies:
  movies[title]['lower'] = [line.lower() for line in movies[title]['raw']]

# Characters Identification

Character Class, stores all information regarding a character:
* Name
* Dialogs
* Connections

In [6]:
class Character:
  def __init__(self, name: str):
    self.name = name
    self.dialogs = dict()
    self.relation = dict()

  def __str__(self):
    return f"{self.name}"

  def __repr__(self):
    return f"Class Character(name={self.name})"

  def __eq__(self, comparison):
    if isinstance(comparison, Character):
      return self.name == comparison.name
    else:
      return self.name == comparison

  def __hash__(self):
    return hash(self.name)

  def init_relation(self, character: str):
    self.relation[character] = {'positive': list(), 'neutral': list(), 'negative': list()}

  def add_line(self, line_id: int, line: str):
    self.dialogs[line_id] = {'dialog': line, 'next': None}

  def point_next_character(self, line_id: int, next_character: str):
    self.dialogs[line_id]['next'] = next_character

  def clear_dialogs(self):
    self.dialogs.clear()

Iterate over a movie and adds each dialog associating them to a Character class

In [7]:
character_dict = dict()

for idx, line in enumerate(movies['movieIV']['lower']):
  # RegEx to match text: "text" "other text" "third text" -> [text, other text, third text]
  text = re.split('\"(.*?)\"', line.strip())

  # Remove unwanted strings obtained by RegEx
  text = list(filter(lambda x: x not in ['', ' '], text))

  # Flag to create a new character
  found = False
  
  # If line in the pattern: "<line_number>" "<character_name>" "<dialog>"
  if len(text) >= 3:
    # Remove " from the preprocessed text
    text = [x.replace("\"", "") for x in text]

    # Joins the remainder text together
    if len(text) > 3:
      text[2] = ('').join(text[2:])

    # Extracts current character's name
    character_name = text[1]
    
    # Skips first line
    if idx != 1:
      # Points the previous character to current character
      previous_character.point_next_character(int(text[0]) - 1, character_name)

    # Checks if current character is new on the dict
    if character_name not in character_dict:
      character_dict.update({character_name: Character(character_name)})

    # Adds current line to character class
    character_dict[character_name].add_line(int(text[0]), text[2])
    
    # Overwrites previous character variable
    previous_character = character_dict[character_name]

# Initialize characters relations

For each character initialize a dict showing if their relation is positive or negative

In [8]:
for character in character_dict.values():
  for relation_character in character_dict.values():
    if not character == relation_character:
      character.init_relation(relation_character.name)

# Processing

API to NLP tasks. Inference is run on Hugginface API

In [9]:
API_TOKEN = "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"

POS_API_URL = "https://api-inference.huggingface.co/models/vblagoje/bert-english-uncased-finetuned-pos"
SENTIMENT_API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

## Inference API Callback

### POS Callback

In [10]:
def process_pos_output(pos_output: list) -> list:
  for item in pos_output:
    item["entity_group"] = item.pop("entity")

  return pos_output

### Sentiment Callback

In [11]:
def order_sentiment_output(sentiment_output: list) -> list:
  sentiment_output.sort(reverse=True, key=lambda item: item["score"])
  return sentiment_output

In [12]:
def process_sentiment_output(sentiment_output) -> list:
  labels = ["negative", "neutral", "positive"]
  scores = softmax(sentiment_output[0][0].detach().numpy())

  processed_output = list()

  for label, score in zip(labels, scores):
    processed_output.append({"label": label, "score": score})

  ordered_processed_output = order_sentiment_output(processed_output)

  return ordered_processed_output

### Query Callback

Initialize global variables

In [13]:
CALLBACK_FIRST_RUN = True
sentiment_tokenizer = None
sentiment_model = None
pos_pipeline = None

In [14]:
def query_callback(task: str, text: str) -> list:

  global CALLBACK_FIRST_RUN, sentiment_tokenizer, sentiment_model, pos_pipeline

  if CALLBACK_FIRST_RUN:
    print("Inference API failed too many times.\nInitializing models locally...")
    !pip install transformers

  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

  if CALLBACK_FIRST_RUN:
    sentiment_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

    pos_pipeline = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos")

    CALLBACK_FIRST_RUN = False

  if task == "pos":
    output = process_pos_output(pos_pipeline(text))
  elif task == "sentiment":
    encoded_text = sentiment_tokenizer(text, return_tensors="pt")
    output = process_sentiment_output(sentiment_model(**encoded_text))

  return output

## Inference API

In [29]:
ALREADY_FAILED = False

In [28]:
def query(task: str, input_text: str, wait_for_model_flag: bool=False, last_timeout: int=0) -> list:
  """
    Task = POS:

      Tags the input string with their according POS tags

      @param string to be processed
      @out list of dicts of each token and its classifications

    Task = Sentiment:

      Classifies the input string with their emotions: 
      Positive - [0.0, 1.0]
      Neutral - [0.0, 1.0]
      Negative - [0.0, 1.0]

      @param string to be processed
      @out list of list of dicts for each label
  """

  global ALREADY_FAILED

  if ALREADY_FAILED or int(last_timeout / 5) > 3:
    ALREADY_FAILED = True
    return query_callback(task.lower(), input_text)

  if task.lower() == "pos":
    API_URL = POS_API_URL
  elif task.lower() == "sentiment":
    API_URL = SENTIMENT_API_URL

  payload = {"inputs": input_text, "options": {"wait_for_model": wait_for_model_flag}}

  response = requests.post(API_URL, headers=headers, json=payload)

  try:
    if "error" in response.json().keys():
      if int(last_timeout/5) == 0:
        print(f"\n[{int(last_timeout/5)}] Error in {task} query")
      else:
        print(f"[{int(last_timeout/5)}] Error in {task} query")
      last_timeout += 5
      time.sleep(last_timeout)
      return query(task, input_text, True, last_timeout)
  except:
    if task.lower() == "pos":
      response = response.json()
    elif task.lower() == "sentiment":
      response = response.json()[0]

    return response

Standard error variables

In [16]:
PRONOUN_NOT_FOUND = "error"
RECEIVER_NOT_FOUND = "error"

## Pronoun identification

In [17]:
def identify_pronoun_type(pronoun: str) -> str:
  # English language pronouns
  first_person_singular_pronouns = ["i", "me", "my", "mine", "myself"]
  second_person_singular_pronouns = ["you", "your", "yours", "yourself"]
  third_person_singular_pronouns = ["he", "she", "it", "him", "her", "his", "its", "hers", "himself", "herself", "itself"]
  first_person_plural_pronouns = ["we", "us", "our", "ours", "ourselves"]
  second_person_plural_pronouns = ["you", "your", "yours", "yourselves"]
  third_person_plural_pronouns = ["they", "them", "their", "theirs", "themselves"]

  if pronoun in first_person_singular_pronouns:
    return "first_singular"
  if pronoun in second_person_singular_pronouns:
    return "second_singular"
  if pronoun in third_person_singular_pronouns:
    return "third_singular"
  if pronoun in first_person_plural_pronouns:
    return "first_plural"
  if pronoun in second_person_plural_pronouns:
    return "second_plural"
  if pronoun in third_person_plural_pronouns:
    return "third_plural"
  
  return PRONOUN_NOT_FOUND

## Search target receiver

### By next speaker on the script

In [18]:
def search_next_speaker(current_speaker: str, dialogue_dict: dict, characters: dict) -> str:
  current_line_id = list(dialogue_dict.keys())[0]

  if dialogue_dict[current_line_id]["next"] == current_speaker:
    return search_next_speaker(current_speaker, {current_line_id + 1: characters[current_speaker].dialogs[current_line_id + 1]}, characters)
  else:
    return dialogue_dict[current_line_id]["next"]

### By context of the dialog

In [19]:
def search_context(speech: str) -> Tuple[str, float]:
  speech_pos = query("pos", speech)

  possible_receivers = dict()
  for token in speech_pos:
    if token["entity_group"] == "PROPN" and token["score"] > 0.9:
      try:
        possible_receivers[token["word"]] += 1
      except:
        possible_receivers.update({token["word"]: 1})

  if len(possible_receivers) == 0: return RECEIVER_NOT_FOUND, 0.0

  max_frequency = 0
  for receiver, frequency in possible_receivers.items():
    if frequency > max_frequency:
      max_frequency = frequency
      possible_receiver = receiver

  return possible_receiver, max_frequency/len(possible_receivers)

## Identify the receiver of an dialog with probability

In [20]:
def identify_speech_receiver(speaker: str, dialogue_dict: dict, possible_characters: dict) -> Tuple[str, float]:
  
  pronouns_used = []
  for line_id, tokens in dialogue_dict.items():
    dialog_pos = query("pos", tokens["dialog"])

    for token in dialog_pos:
      if token["entity_group"] == "PRON" and token["score"] > 0.9:
        pronouns_used.append(token)

  if len(pronouns_used) == 0:
    return RECEIVER_NOT_FOUND, 0.0
    
  possible_receivers = list()
  for pronoun in pronouns_used:
    pronoun_type = identify_pronoun_type(pronoun["word"])
    
    if pronoun_type == PRONOUN_NOT_FOUND or pronoun_type == "first_singular": continue

    if pronoun_type in ["second_singular", "first_plural", "second_plural"]:
      speech_receiver = search_next_speaker(speaker, dialogue_dict, possible_characters)
      receiver_probability = 1.0

    elif pronoun_type in ["third_singular", "third_plural"]:
      speech_receiver, receiver_probability = search_context(tokens["dialog"])

    if len(possible_receivers) == 0:
      possible_receivers.append((speech_receiver, [receiver_probability]))
    else:
      if receiver_probability > statistics.mean(possible_receivers[0][1]):
        if speech_receiver == possible_receivers[0][0]:
          possible_receivers[0][1].append(receiver_probability)
        possible_receivers.insert(0, (speech_receiver, [receiver_probability]))
      else:
        if speech_receiver == possible_receivers[0][0]:
          possible_receivers[0][1].append(receiver_probability)
        else:
          possible_receivers.append((speech_receiver, [receiver_probability]))
  
  if len(possible_receivers) == 0:
    return RECEIVER_NOT_FOUND, 0.0

  return possible_receivers[0][0], statistics.mean(possible_receivers[0][1])

## Iterate over character lines

In [30]:
character_count = 0
for character in character_dict.values():
  print(f"[{character_count}] Current character: {character}")

  dialog_count = 0
  for line_id, speech in character.dialogs.items():
    print(f"  [{dialog_count}] {character} speaking to", end=" ")
    receiver_character, score = identify_speech_receiver(character.name, {line_id: speech}, character_dict)

    if receiver_character == RECEIVER_NOT_FOUND: 
      print("no one\n")
      continue

    print(f"{receiver_character}", end=" ")

    speech_sentiment = query("sentiment", speech["dialog"])

    print(f"| Relation = {speech_sentiment[0]['label']} ({speech_sentiment[0]['score']})")

    try:
      character.relation[receiver_character][speech_sentiment[0]["label"]].append(speech_sentiment[0]["score"] * score)
    except: 
      continue
    dialog_count += 1

  character_count += 1

[0] Current character: threepio
  [0] threepio speaking to 
[0] Error in pos query
[1] Error in pos query
[2] Error in pos query
[3] Error in pos query
luke | Relation = negative (0.9314931631088257)
  [1] threepio speaking to luke | Relation = negative (0.9071100354194641)
  [2] threepio speaking to 

  [2] threepio speaking to 

  [2] threepio speaking to 

  [2] threepio speaking to imperial officer | Relation = neutral (0.8058019280433655)
  [3] threepio speaking to imperial officer | Relation = neutral (0.5241219997406006)
  [4] threepio speaking to imperial officer | Relation = negative (0.8452939391136169)
  [5] threepio speaking to imperial officer | Relation = neutral (0.9428818821907043)
  [6] threepio speaking to chief pilot | Relation = negative (0.8365911841392517)
  [7] threepio speaking to chief pilot | Relation = negative (0.926473081111908)
  [8] threepio speaking to chief pilot | Relation = negative (0.8706051707267761)
  [9] threepio speaking to 

  [9] threepio spea

# Results

## Show character relations

In [48]:
for id, character in enumerate(character_dict.values()):
  print(f"[{id}] {character}: ")

  for related, relation_dict in character.relation.items():
    if len(relation_dict["positive"]) == 0 and len(relation_dict["neutral"]) ==0 and len(relation_dict["negative"]) == 0: continue
    print(f"  Relation with {related}:")
    for label, score in relation_dict.items():
      if len(score) == 0: continue
      print(f"    {label}: {statistics.mean(score)} from {len(score)} inputs")  

[0] threepio: 
  Relation with luke:
    positive: 0.7390720024704933 from 8 inputs
    neutral: 0.7726514358674327 from 31 inputs
    negative: 0.7165604695981863 from 49 inputs
  Relation with imperial officer:
    neutral: 0.7576018969217936 from 9 inputs
    negative: 0.8452939391136169 from 2 inputs
  Relation with trooper:
    neutral: 0.4743080735206604 from 2 inputs
  Relation with chief pilot:
    negative: 0.8778898318608602 from 9 inputs
  Relation with woman:
    neutral: 0.7383700807889303 from 3 inputs
  Relation with leia:
    neutral: 0.7891287406285604 from 3 inputs
  Relation with first trooper:
    positive: 0.7746626337369283 from 3 inputs
  Relation with beru:
    negative: 0.7039090692996979 from 6 inputs
  Relation with owen:
    positive: 0.6198409199714661 from 3 inputs
  Relation with aunt beru:
    neutral: 0.6055477857589722 from 3 inputs
  Relation with ben:
    neutral: 0.6210874120394388 from 6 inputs
    negative: 0.5932134985923767 from 2 inputs
  Relat