# Data & Data Reader 
In this submission we demonstrate the dimensions of our data and provide a Data Reader that pre-processes the raw data into formats that are suitable for machine learning models. The submission comprises the following sections:<br><br>
*[1. Data Overview](#1.-Data-Overview)*<br>
&emsp;&emsp;*[1.1. Data Preprocess](#1.1.-Data-Preprocess)*<br>
&emsp;&emsp;*[1.2. Export Data](#1.2.-Export-Data)*<br>
*[2. Data Reader](#2.-Data-Reader)*<br>

## 1. Data Overview
In this section, we will summarize and visualize the dimensions of our corpus.

In [None]:
import pandas as pd
import numpy as np
import re
import os
import nltk
from operator import itemgetter
from difflib import SequenceMatcher
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from google.colab import drive
# Mount Google Drive (forcing a fresh mount) and make the shared project folder the
# working directory, so the relative paths used later in this notebook
# ("./corpus_raw/", "corpus_cleaned.csv") resolve inside it.
drive.mount('/content/drive/', force_remount=True)
os.chdir('/content/drive/.shortcut-targets-by-id/1H3iXw_9Ag494kAPY43CMgachbSqG3PRn/CSE 842 Project/')

Mounted at /content/drive/


## 1.1. Data Preprocess
First, we need to preprocess the data in order to obtain a more accurate data summary. This includes extracting sentences from the .srt files and removing invalid elements such as HTML tags, Unicode symbols, etc.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')
# os.chdir('/content/drive/My Drive/CSE 842 Project')

In [None]:
def similar(a, b):
    """Return the difflib similarity ratio of the sequences ``a`` and ``b``.

    Thin wrapper around ``difflib.SequenceMatcher`` with no junk function;
    1.0 means the two sequences are identical, 0.0 that they share nothing.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

In [None]:
min_leng = 20  # NOTE(review): not referenced by add_timestamp; the cut-off used below is 10
min_percent = 0.6  # minimum similarity ratio for a trailer line to match a movie line
movie_timestamps = {}  # global dictionary for storing script timestamps


def add_timestamp(path):
    """Attach movie timestamps to the lines of a trailer transcript.

    The movie's "<seconds> <text>" rows are looked up in the global
    ``movie_timestamps`` under the trailer's file name (the text before the
    first dot), so the movie script must have been registered there first.
    Every trailer line is normalised (bracketed tags removed, lower-cased,
    non-word characters stripped) and compared with each normalised movie
    line through ``similar``. The line receives the timestamp of the first
    movie line whose ratio reaches ``min_percent``. Trailer lines with fewer
    than 10 normalised characters, or without any match, are dropped.

    The aligned lines are also written (UTF-8) to
    ``<grandparent of path>/trailer_time/<movie name>.txt``.

    Args:
        path: "/"-separated path of the UTF-8 encoded trailer transcript.

    Returns:
        List of "<seconds> <original trailer line>" strings in trailer order.

    Raises:
        KeyError: if no movie script is registered for this trailer's name.
    """
    movie_name = path.split("/")[-1].split(".")[0]
    subtitles = movie_timestamps[movie_name]
    output_path = f"{'/'.join(path.split('/')[:-2])}/trailer_time/{movie_name}.txt"

    non_word = re.compile(r"\W+")

    # Split each movie row into its leading timestamp and a normalised sentence.
    movie_lines = []
    for row in subtitles:
        time, *words = row.split(" ")
        movie_lines.append((time, non_word.sub("", "".join(words)).lower()))

    with open(path, encoding="utf8") as trailer_file:
        raw_text = trailer_file.read()
    original_lines = raw_text.split("\n")
    # "." does not match newlines, so removing "[...]" tags keeps the line count
    # and the cleaned lines stay index-aligned with the original ones.
    cleaned_lines = [
        non_word.sub("", line)
        for line in re.sub(r"\[.+?\]", "", raw_text).lower().split("\n")
    ]
    # Normalised line -> original line (the last occurrence wins on duplicates).
    mapper = {
        cleaned: original
        for cleaned, original in zip(cleaned_lines, original_lines)
        if cleaned
    }

    def first_matching_time(sentence):
        """Return the timestamp of the first sufficiently similar movie line, or None."""
        if len(sentence) < 10:  # too short to be matched reliably
            return None
        for time, movie_sentence in movie_lines:
            percent = similar(movie_sentence, sentence) if movie_sentence else 0
            if percent >= min_percent:
                # Only the first match is ever used, so stop searching here.
                return time
        return None

    output_lines = []
    for sentence in cleaned_lines:
        if not sentence:
            continue
        time = first_matching_time(sentence)
        if time is not None:
            output_lines.append(f"{time} {mapper[sentence]}\n")
    output = "".join(output_lines)

    # Write with the same encoding the transcript was read with; the platform
    # default may be unable to encode some characters.
    with open(output_path, "w", encoding="utf8") as file:
        file.write(output)
    print(output_path, "...Done")

    return output.splitlines()
    
    

In [None]:
from pathlib import Path

In [None]:
def data_preprocess():
    """Build the combined movie/trailer corpus table from ``./corpus_raw/``.

    Each sub-folder of ``./corpus_raw/`` is treated as a label and is expected
    to contain ``movie_corpus/`` and ``trailer_corpus/`` folders; entries
    lacking either folder are reported and skipped. Every file is parsed with
    ``sentence_parition``. Movie files are parsed before trailer files because
    the trailer alignment relies on the movie rows registered while the
    movies are parsed.

    Movies and trailers are joined on the file name with ".srt"/".txt"
    removed (inner join), so only titles present in both folders are kept.

    Returns:
        DataFrame with the columns ``label``, ``movie``, ``corpus_movie`` and
        ``corpus_trailer``; an empty DataFrame if no label folder was found.
    """
    corpus_raw_dir = "./corpus_raw/"
    extension = re.compile(r"\.srt|\.txt")
    frames = []
    for label in os.listdir(corpus_raw_dir):
        # Allocate the movie & trailer subtitle folders
        movie_dir = f"{corpus_raw_dir}{label}/movie_corpus/"
        trailer_dir = f"{corpus_raw_dir}{label}/trailer_corpus/"
        if not (os.path.isdir(movie_dir) and os.path.isdir(trailer_dir)):
            # Stray files (or incomplete label folders) must not abort the run.
            print(f"Skipping {label}: movie_corpus/ or trailer_corpus/ folder not found")
            continue

        # List each folder once so all derived columns share one file order.
        movie_files = os.listdir(movie_dir)
        trailer_files = os.listdir(trailer_dir)

        print(f"Movie Dir {len(movie_files)}")
        for file in movie_files:
            print(Path(file).stem)
        print(f"Trailer Dir {len(trailer_files)}")

        # Fetch data (movies first: trailers are aligned against them)
        cur_movie_df = pd.DataFrame()
        cur_movie_df["label"] = [label] * len(movie_files)
        cur_movie_df["movie"] = [extension.sub("", filename) for filename in movie_files]
        cur_movie_df["corpus_movie"] = [
            sentence_parition(f"{movie_dir}{filename}") for filename in movie_files
        ]
        cur_trailer_df = pd.DataFrame()
        cur_trailer_df["movie"] = [extension.sub("", filename) for filename in trailer_files]
        cur_trailer_df["corpus_trailer"] = [
            sentence_parition(f"{trailer_dir}{filename}", min_sent_length=4)
            for filename in trailer_files
        ]

        frames.append(cur_movie_df.merge(cur_trailer_df, how='inner', on='movie'))

    # Concatenate once at the end instead of growing a frame inside the loop.
    if frames:
        movie_df = pd.concat(frames).reset_index(drop=True)
    else:
        movie_df = pd.DataFrame()

    display(movie_df.head())
    display(movie_df.tail(), movie_df.shape)
    return movie_df


def sentence_parition(filepath, min_sent_length=None):
    """Split a subtitle/transcript file into a list of timestamped sentences.

    ``.srt`` files (movie scripts) are parsed cue by cue. Each cue becomes
    "<seconds since the first cue> <text>", with subtitle tags removed and
    line breaks inside a cue replaced by spaces. Cues whose text is blank
    are dropped. The result is registered in the global ``movie_timestamps``
    under the file name (the text before the first dot).

    Any other file is treated as a trailer transcript and delegated to
    ``add_timestamp``.

    Args:
        filepath: "/"-separated path of the file to parse.
        min_sent_length: if truthy, keep only ``.srt`` sentences with more than
            this many ``word_tokenize`` tokens (the seconds prefix counts as
            a token).

    Returns:
        List of "<seconds> <text>" strings.
    """
    if filepath[-4:] != ".srt":  # trailer scripts
        # NOTE(review): min_sent_length is not applied on this path although
        # callers pass it for trailers - confirm whether that is intended.
        return add_timestamp(filepath)  # adding timestamp to trailer

    # movie script files
    with open(filepath, encoding="latin") as file:
        context = file.read()
    context = re.sub(r"\[.*?\]|\(.*?\)|.*?: |\<.*?\>", "", context)  # Remove all subtitle tags (if any)
    # Pad with a blank line so the final cue is matched even when the file
    # does not end with one.
    cues = re.findall(r"\n(\d+[\d:,]+) --> (.*?)\n(.*?)\n\n", context + "\n\n", re.S)

    sentences = []
    if cues:  # an .srt without any cue yields an empty list instead of an IndexError
        start = pd.to_datetime(cues[0][0].split(",")[0])
        for begin, _end, text in cues:
            text = text.replace("\n", " ")  # There should be no newlines in a sentence
            if not text.strip():
                # Drop blank cues before the seconds prefix is added; afterwards
                # the string is never empty and the filter would have no effect.
                continue
            sec = (pd.to_datetime(begin.split(",")[0]) - start).seconds
            sentences.append(f"{sec} {text}")

    if min_sent_length:  # if user specifies minimum sentence length, apply it
        sentences = [
            sentence for sentence in sentences
            if len(word_tokenize(sentence)) > min_sent_length
        ]

    movie_name = filepath.split("/")[-1].split(".")[0]
    print(movie_name)
    movie_timestamps[movie_name] = sentences
    return sentences
              
# Run the full preprocessing pass over ./corpus_raw/ and keep the resulting corpus table.
dataframe = data_preprocess()

Movie Dir 30
Avengers Infinity War
Ant-Man and The Wasp
Ant-Man
Avengers Age of Ultron
Avengers Endgame
Black Panther
Captain America Civil War
Captain Marvel
Iron Man 3
Iron Man
Deadpool 2
Captain America The First Avenger
Guardians of the Galaxy Vol. 2
Iron Man 2
Deadpool
Spider-Man Homecoming
Guardians of the Galaxy
The Avengers
Spider-Man Far From Home
Black Widow
Logan
The Incredible Hulk
Doctor Strange
Captain America The Winter Soldier
X-Men Apocalypse
Thor
Thor The Dark World
Thor Ragnarok
X-Men Dark Phoenix
X-Men Days of Future Past
Trailer Dir 30
Avengers Infinity War
Ant-Man and The Wasp
Ant-Man
Avengers Age of Ultron
Avengers Endgame
Black Panther
Captain America Civil War
Captain Marvel
Iron Man 3
Iron Man
Deadpool 2
Captain America The First Avenger
Guardians of the Galaxy Vol
Iron Man 2
Deadpool
Spider-Man Homecoming
Guardians of the Galaxy
The Avengers
Spider-Man Far From Home
Black Widow
Logan
The Incredible Hulk
Doctor Strange
Captain America The Winter Soldier
X-Men 

Unnamed: 0,label,movie,corpus_movie,corpus_trailer
0,Marvel,Avengers Infinity War,"[0 This is the Asgardian, 2 refugee vessel Sta...","[777 There was an idea., 1179 To bring togethe..."
1,Marvel,Ant-Man and The Wasp,"[0 I still think about the night, 2 your mothe...","[4095 So, how long have you been Ant-Man again..."
2,Marvel,Ant-Man,"[0 Stark!, 2 He doesn't seem happy., 3 Hello, ...","[649 imagine a soldier the size of an insect, ..."
3,Marvel,Avengers Age of Ultron,"[0 Report to your stations immediately., 2 Thi...","[2192 Everyone screaming..., 1888 You want to ..."
4,Marvel,Avengers Endgame,"[0 Okay, hold on, don't shoot., 4 - You see wh...","[2553 TONY STARK: It seems like a thousand, 90..."


Unnamed: 0,label,movie,corpus_movie,corpus_trailer
44,DC,The Dark Knight,"[0 , 17 Three of a kind, let's do this. That's...","[5257 Where do we begin., 1346 A year ago, the..."
45,DC,Watchmen,"[0 HAVE A GOOD TIME, 13 Wrong, as usual., 15 T...","[1865 delightful you know why you're here, 619..."
46,DC,The Dark Knight Rises,"[0 I knew Harvey Dent., 4 I was his friend., 8...","[5203 (O, say can you see, By the dawn's early..."
47,DC,Wonder Woman 1984,"[0 Some days, my childhood feels so very far ...","[1578 my life hasn't been what you probably, 1..."
48,DC,Wonder Woman,"[0 I used to want to save the world., 4 This b...","[522 The gods gave us many gifts., 525 One day..."


(49, 4)

## 1.2. Export Data
Once we have cleaned up the data, we can store it in a local file for future use.

In [None]:
# Write the corpus table to CSV without the row index. The list-valued columns
# are stored as the string representation of each list.
dataframe.to_csv("corpus_cleaned.csv", index=False)

## 2. Data Reader
It is also necessary to write a function that reads the data we just cleaned. In this section, we write a data reader for future use. Since we have not yet decided which model to use, we will not train on the data in this submission.

In [None]:
def get_corpus(filepath, parse_lists=True):
    """Load the cleaned corpus CSV and show a short preview of it.

    The ``corpus_movie`` and ``corpus_trailer`` columns are stored in the CSV
    as the string representation of Python lists. With ``parse_lists`` they
    are decoded back into real lists; otherwise they stay raw strings.

    Args:
        filepath: path of the CSV file to read.
        parse_lists: decode the list-valued columns (default). Pass ``False``
            to keep the raw strings exactly as stored in the file.

    Returns:
        DataFrame with the contents of the CSV file.

    Raises:
        ValueError, SyntaxError: if ``parse_lists`` is set and a cell of a
            list column does not hold a valid Python literal.
    """
    import ast  # local import: only needed to decode the list columns

    movie_df = pd.read_csv(filepath)
    if parse_lists:
        for column in ("corpus_movie", "corpus_trailer"):
            if column in movie_df.columns:
                # literal_eval only accepts literals, unlike eval; non-string
                # cells (e.g. missing values) are left untouched.
                movie_df[column] = movie_df[column].apply(
                    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
                )

    # ``display`` only exists inside IPython/Jupyter; fall back to ``print`` so
    # the reader also works from a plain Python script.
    try:
        show = display
    except NameError:
        show = print
    show(movie_df.head())
    show(movie_df.tail(), movie_df.shape)
    return movie_df

# Reload the exported corpus from disk to check that the reader works.
dataframe = get_corpus("./corpus_cleaned.csv")
print("Successfully loaded the corpus data!")

Unnamed: 0,label,movie,corpus_movie,corpus_trailer
0,Marvel,Avengers Infinity War,"['0 This is the Asgardian', '2 refugee vessel ...","['777 There was an idea.', '1179 To bring toge..."
1,Marvel,Ant-Man and The Wasp,"['0 I still think about the night', '2 your mo...","['4095 So, how long have you been Ant-Man agai..."
2,Marvel,Ant-Man,"['0 Stark!', ""2 He doesn't seem happy."", '3 He...",['649 imagine a soldier the size of an insect'...
3,Marvel,Avengers Age of Ultron,"['0 Report to your stations immediately.', '2 ...","['2192 Everyone screaming...', '1888 You want ..."
4,Marvel,Avengers Endgame,"[""0 Okay, hold on, don't shoot."", ""4 - You see...","['2553 TONY STARK: It seems like a thousand', ..."


Unnamed: 0,label,movie,corpus_movie,corpus_trailer
44,DC,The Dark Knight,"['0 ', ""17 Three of a kind, let's do this. Tha...","['5257 Where do we begin.', ""1346 A year ago, ..."
45,DC,Watchmen,"['0 HAVE A GOOD TIME', '13 Wrong, as usual.', ...","[""1865 delightful you know why you're here"", ""..."
46,DC,The Dark Knight Rises,"['0 I knew Harvey Dent.', '4 I was his friend....","[""5203 (O, say can you see, By the dawn's earl..."
47,DC,Wonder Woman 1984,"['0 Some days, my childhood feels so very far...","[""1578 my life hasn't been what you probably"",..."
48,DC,Wonder Woman,"['0 I used to want to save the world.', '4 Thi...","['522 The gods gave us many gifts.', ""525 One ..."


(49, 4)

Successfully loaded the corpus data!


In [None]:
# Inspect the trailer corpus entry of the first movie in the loaded table.
dataframe["corpus_trailer"][0]

'[\'777 There was an idea.\', \'1179 To bring together...\', \'3586 To see if we could become...\', \'4100 Something more.\', "100 You will know what it\'s like to lose.", "104 To feel so desperately that you\'re right...", \'296 ...all the same.\', \'133 Destiny still arrives.\', \'5646 Evacuate the city.\', \'5646 Engage all defenses.\', \'5650 And get this man a shield.\', \'1204 Who the hell are you guys?\']'