# NST Preprocessing Experiment (NPE)
## Unprocessed Pipeline

This pipeline removes only the subtitle metadata from the data. Its output serves as the baseline for the NST Preprocessing Experiment.

In [1]:
# Imports
from tabulate import tabulate
import random

from src.utils.file_management import (
    load_subtitles_from_nst_sample,
    stats_file_exists,
    write_subtitles_file,
    write_stats_file,
    load_stats_file,
    write_pipeline_file)

from src.utils.tables_and_plots import create_stats_table

from textPrep.preprocessing_pipeline import (
    Preprocess,
    RemoveSubtitleMetadata)

from textPrep.preprocessing_pipeline.NextGen import NextGen

# Selects which NST sample is loaded; also embedded in the names of the files this notebook writes
NST_SAMPLE_SIZE = 1000

In [2]:
# Load the NST sample selected by NST_SAMPLE_SIZE, before any preprocessing.
# Used below as a dict: keys are program ids, values are each program's subtitle tokens.
subtitled_programs = load_subtitles_from_nst_sample(NST_SAMPLE_SIZE)

In [5]:
# Pick one program to inspect before and after preprocessing.
# A dedicated, seeded generator keeps the pick identical across "Restart & Run All"
# (as long as the sample is loaded in the same order) without touching the global
# `random` state.
example_rng = random.Random(42)
example_program_id = example_rng.choice(list(subtitled_programs))
# Number of leading tokens to display for the example program
example_program_slice = 10
subtitled_programs[example_program_id][:example_program_slice]

['{\\an1}Dette',
 'er',
 'byen',
 'med',
 'et',
 'ufortjent',
 'dårlig',
 'rykte.',
 '{\\an1}De',
 'fleste']

In [6]:
# Baseline stats for the sample with the subtitle metadata still in place.
#
# These stats need a cache file of their own. They must not share the name
# npe_<size>_unprocessed: that file receives the metadata-removed stats written at the
# end of this notebook, and loading it here would make the "with metadata" table show
# the metadata-removed numbers on every re-run.

folder_path_unprocessed = r"nst_preprocessing_experiment/preprocessed_data/"
file_name_unprocessed = f"npe_{NST_SAMPLE_SIZE}_with_metadata"

if stats_file_exists(folder_path_unprocessed, file_name_unprocessed):
    # load_stats_file returns a mapping; tabulate is given (stat, value) pairs
    table_stats_unprocessed = list(load_stats_file(folder_path_unprocessed, file_name_unprocessed).items())
else:
    table_stats_unprocessed = create_stats_table(list(subtitled_programs.values()))
    # Store the result so the cache check above can hit on the next run
    write_stats_file(folder_path_unprocessed, file_name_unprocessed, table_stats_unprocessed)
print(tabulate(table_stats_unprocessed, headers=["Stat", "Value"]))

Stat                              Value
---------------------  ----------------
dataset_size              999
vocab_size             157221
total_tokens                1.82958e+06
avg_token_freq             11.637
avg_token_per_doc        1831.41
avg_stopwords_per_doc     881.47


In [7]:
# Create the textPrep pipeline object; its rules are attached in a later cell
pipeline = Preprocess()

In [8]:
# Rule object whose remove_subtitle_metadata method is the only rule this notebook applies.
# Judging by the example outputs, it strips technical tags such as "{\an1}" from the tokens.
rsm = RemoveSubtitleMetadata()

In [9]:
# Register the metadata-removal rule as the pipeline's only document method.
# The entry pairs the callable with str(rsm), so the pipeline details can be saved as text.
pipeline.document_methods = [(rsm.remove_subtitle_metadata, str(rsm))]

In [10]:
# Create the NextGen runner that applies the pipeline to the data
runner = NextGen()

In [14]:
# Run the pipeline over every program's subtitles.
# NOTE(review): ngram_min_freq=-1 presumably turns n-gram creation off — confirm against textPrep.
preprocessed_data = runner.full_preprocess(
    list(subtitled_programs.values()),
    pipeline,
    ngram_min_freq=-1,
)

# Re-attach the program ids (same order as the values passed in) and re-tokenise on
# whitespace: joining and splitting drops tokens the rule left empty and breaks up any
# token that still contains whitespace.
subtitled_programs_preprocessed = {
    program_id: " ".join(subtitles).split()
    for program_id, subtitles in zip(subtitled_programs.keys(), preprocessed_data)
}

In [15]:
# Same program and slice as the raw example, now after preprocessing
subtitled_programs_preprocessed[example_program_id][:example_program_slice]

['Dette',
 'er',
 'byen',
 'med',
 'et',
 'ufortjent',
 'dårlig',
 'rykte.',
 'De',
 'fleste']

In [17]:
# Stats for the sample once the pipeline has run

folder_path = r"nst_preprocessing_experiment/preprocessed_data/"
file_name = f"npe_{NST_SAMPLE_SIZE}_unprocessed"

# A stats file saved earlier takes precedence; the stats are only computed when none exists.
# NOTE(review): an existing file is not invalidated when the sample or pipeline changes —
# confirm that reusing it is intended.
if not stats_file_exists(folder_path, file_name):
    table_stats_preprocessed = create_stats_table(list(subtitled_programs_preprocessed.values()))
else:
    table_stats_preprocessed = list(load_stats_file(folder_path, file_name).items())

In [18]:
# Show the stats before and after metadata removal, separated by a blank line
print(
    tabulate(table_stats_unprocessed, headers=["Stat (with metadata)", "Value"]),
    tabulate(table_stats_preprocessed, headers=["Stat (metadata removed)", "Value"]),
    sep="\n\n",
)

Stat (with metadata)               Value
----------------------  ----------------
dataset_size               999
vocab_size              157221
total_tokens                 1.82958e+06
avg_token_freq              11.637
avg_token_per_doc         1831.41
avg_stopwords_per_doc      881.47

Stat (metadata removed)               Value
-------------------------  ----------------
dataset_size                  999
vocab_size                 149975
total_tokens                    1.82517e+06
avg_token_freq                 12.1698
avg_token_per_doc            1826.99
avg_stopwords_per_doc         880.987


In [19]:
# Persist the three outputs of this run under the same folder and file name:
# the preprocessed subtitles, their stats, and the pipeline rules that produced them.

for write_artifact, artifact in (
    (write_subtitles_file, subtitled_programs_preprocessed),
    (write_stats_file, table_stats_preprocessed),
    (write_pipeline_file, pipeline.document_methods),
):
    write_artifact(folder_path, file_name, artifact)