<a href="https://colab.research.google.com/github/Guimol/Star-Wars-Characters-Relations/blob/main/Star_Wars_Characters_Relations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###### Change page's CSS to be more visually appealing

In [1]:
from IPython.display import HTML, display

def set_css():
  """Display a small stylesheet that makes ``<pre>`` blocks wrap their text."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))


# Hook the stylesheet onto the 'pre_run_cell' event so it is displayed again
# ahead of each cell run.
get_ipython().events.register('pre_run_cell', set_css)

# Importing Libraries

In [2]:
from io import open
import requests

In [3]:
import re

# Obtaining the files
* Path for the data file (external link): [Star Wars Movie Scripts](https://www.kaggle.com/datasets/xvivancos/star-wars-movie-scripts)
* GitHub repository: [Star Wars Characters Relations](https://github.com/Guimol/Star-Wars-Characters-Relations)

In [4]:
movie_files = {
  "movieIV": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeIV.txt",
  "movieV": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeV.txt",
  "movieVI": "https://raw.githubusercontent.com/Guimol/Star-Wars-Characters-Relations/main/datasets/SW_EpisodeVI.txt",
}

# Initializing movie dictionaries: title -> {"raw": [lines of the script file]}
movies = dict()

# Creating local files for the corpus and loading their lines
for title, link in movie_files.items():
  local_path = title + '.txt'

  # Access a link; the timeout keeps an unreachable host from hanging the notebook
  r = requests.get(link, allow_redirects=True, timeout=30)

  # Fail loudly on an HTTP error instead of storing an error page as a script
  r.raise_for_status()

  # Store the downloaded content locally; the context manager flushes and
  # closes the file before it is read back below
  with open(local_path, 'wb') as script_file:
    script_file.write(r.content)

  # Fill the dictionary with data obtained in the local file
  with open(local_path, 'r') as script_file:
    movies[title] = {"raw": script_file.readlines()}

# Text Preprocessing

Making the dialogs lower case

In [5]:
# Store a lower-cased copy of every raw line next to the original lines
for title, movie in movies.items():
  movie['lower'] = list(map(str.lower, movie['raw']))

# Characters Identification

Character Class, stores all information regarding a character:
* Name
* Dialogs
* Connections

In [17]:
class Character:
  """A named character together with its dialogs and its relations to others.

  Two characters are equal when their names are equal; a character also
  compares equal to a plain value equal to its name, and hashes like its name.
  """

  def __init__(self, name: str):
    """Create a character called ``name`` with no dialogs and no relations."""
    self.name = name
    # line_id -> {'dialog': <text>, 'next': <value set by point_next_character, else None>}
    self.dialogs = dict()
    # key passed to init_relation -> {'positive': <int>, 'negative': <int>}
    self.relation = dict()

  def __str__(self):
    """Return the character's name."""
    return f"{self.name}"

  def __repr__(self):
    """Return a debugging representation that includes the name."""
    return f"Class Character(name={self.name})"

  def __eq__(self, comparison):
    """Compare by name against another Character, or against a raw value."""
    if isinstance(comparison, Character):
      return self.name == comparison.name
    else:
      return self.name == comparison

  def __hash__(self):
    """Hash by name, consistent with ``__eq__``."""
    return hash(self.name)

  def init_relation(self, character: str):
    """Create (or reset) the relation entry for ``character`` with zeroed counters."""
    # Key is spelled 'negative' (the original 'negattive' was a typo)
    self.relation[character] = {'positive': 0, 'negative': 0}

  def add_line(self, line_id: int, line: str):
    """Store ``line`` under ``line_id``; its 'next' entry starts as None."""
    self.dialogs[line_id] = {'dialog': line, 'next': None}

  def point_next_character(self, line_id: int, next_character: str):
    """Set the 'next' entry of the stored dialog ``line_id`` to ``next_character``.

    Raises:
      KeyError: if no dialog was stored under ``line_id``.
    """
    self.dialogs[line_id]['next'] = next_character

  def clear_dialogs(self):
    """Remove every stored dialog."""
    self.dialogs.clear()

Iterate over a movie and add each dialog to the corresponding Character object

In [None]:
character_dict = dict()

# Speaker and line id of the most recently parsed dialog, used to link it to
# the dialog that follows. None until the first dialog has been parsed, so no
# assumption is made about which physical line the first dialog sits on.
previous_character = None
previous_line_id = None

for line in movies['movieIV']['lower']:
  # RegEx to match text: "text" "other text" "third text" -> [text, other text, third text]
  text = re.split(r'"(.*?)"', line.strip())

  # Remove unwanted strings obtained by RegEx
  text = [x for x in text if x not in ('', ' ')]

  # Only lines in the pattern: "<line_number>" "<character_name>" "<dialog>"
  # are dialogs; anything shorter is skipped
  if len(text) < 3:
    continue

  # Remove " from the preprocessed text
  text = [x.replace('"', '') for x in text]

  line_id = int(text[0])

  # Extracts current character's name
  character_name = text[1]

  # Joins the remainder text together (a no-op when there are exactly 3 parts)
  dialog = ''.join(text[2:])

  # Points the previous dialog to the current character. The previous line id
  # is tracked explicitly rather than computed as line_id - 1, so a gap in
  # the numbering cannot cause a KeyError.
  if previous_character is not None:
    previous_character.point_next_character(previous_line_id, character_name)

  # Checks if current character is new on the dict
  if character_name not in character_dict:
    character_dict[character_name] = Character(character_name)

  # Adds current line to character class
  character_dict[character_name].add_line(line_id, dialog)

  # Remembers this dialog for the next iteration
  previous_character = character_dict[character_name]
  previous_line_id = line_id

# Initialize relations

For each character, initialize a dict recording whether its relation with every other character is positive or negative

In [None]:
# `character_list` was never defined in the notebook; build it from the
# characters collected in `character_dict` so this cell runs on a fresh kernel.
character_list = list(character_dict.values())

# Give every character a zeroed relation entry for every other character
for character in character_list:
  for relation_character in character_list:
    if character != relation_character:
      character.init_relation(relation_character.name)

In [None]:
# Show which characters the first collected character has a relation entry for.
# Reads from `character_dict` directly, so the cell does not depend on a
# `character_list` variable existing in the kernel.
next(iter(character_dict.values())).relation.keys()