# NST Preprocessing Experiment (NPE)
## Basic Pipeline

A pipeline of simple preprocessing rules that produces lower-case, purely alphabetical, stopword-free data for use in the NST Preprocessing Experiment.

In [1]:
# Imports
from tabulate import tabulate
import random

from src.utils.file_management import (
    load_subtitles_from_nst_sample,
    stats_file_exists,
    load_subtitles,
    write_subtitles_file,
    write_stats_file,
    load_stats_file,
    write_pipeline_file)

from src.utils.tables_and_plots import create_stats_table

from textPrep.preprocessing_pipeline import (
    Preprocess,
    RemoveSubtitleMetadata,
    RemovePunctuation,
    RemoveNumbers,
    Capitalization,
    RemoveStopWords)

from textPrep.preprocessing_pipeline.NextGen import NextGen

# Size of the NST sample to load. The value is also embedded in the names of
# the files this notebook reads and writes (f"npe_{NST_SAMPLE_SIZE}_...").
NST_SAMPLE_SIZE = 1000

In [2]:
# Load the unprocessed NST sample that the basic preprocessing pipeline runs on.
# Later cells use the result as a mapping (.keys() / .values());
# presumably program id -> subtitle text — TODO confirm against the loader.
subtitled_programs = load_subtitles_from_nst_sample(NST_SAMPLE_SIZE)

In [3]:
# Load the file produced by the raw pipeline so its output can be compared
# with the output of this (basic) pipeline.
# folder_path is reused further down as the destination for the files this notebook writes.
folder_path = r"nst_preprocessing_experiment/preprocessed_data/"
file_name_previous = f"npe_{NST_SAMPLE_SIZE}_raw"

subtitled_programs_previous_pipeline = load_subtitles(folder_path, file_name_previous)

In [4]:
# Pick one example program to inspect before and after preprocessing.
# A dedicated, seeded generator keeps the chosen example stable across
# "Restart & Run All" without touching the global `random` state
# (assuming the loader returns the programs in a stable order).
EXAMPLE_SEED = 42
example_program_slice = 10  # number of leading items of the example to display

# Only consider programs present in both datasets, so the lookup below
# cannot raise KeyError if the raw pipeline output lacks a program.
example_candidates = [
    program_id
    for program_id in subtitled_programs
    if program_id in subtitled_programs_previous_pipeline
]
example_program_id = random.Random(EXAMPLE_SEED).choice(example_candidates)
subtitled_programs_previous_pipeline[example_program_id][:example_program_slice]

['Politiet',
 'stengte',
 'mobilnett',
 'pga',
 'aksjon',
 'mot',
 'fly',
 'som',
 'ble',
 'bombetruet']

In [5]:
# Stats for the raw pipeline's output: reuse the stats file when one exists,
# otherwise build the table from the documents loaded above.
if stats_file_exists(folder_path, file_name_previous):
    previous_stats = load_stats_file(folder_path, file_name_previous)
    table_stats_previous = list(previous_stats.items())
else:
    previous_documents = list(subtitled_programs_previous_pipeline.values())
    table_stats_previous = create_stats_table(previous_documents)

print(tabulate(table_stats_previous, headers=["Stat (raw)", "Value"]))

Stat (raw)                       Value
---------------------  ---------------
dataset_size             999
vocab_size             95757
total_tokens               1.77714e+06
avg_token_freq            18.5589
avg_token_per_doc       1778.92
avg_stopwords_per_doc    971.927


In [6]:
# Initialize the textPrep pipeline; its `document_methods` list is filled in
# further down, once the individual rules have been created.
pipeline = Preprocess()

In [7]:
# Rule: remove technical metadata from the subtitles
# (registered in the pipeline via rsm.remove_subtitle_metadata).
rsm = RemoveSubtitleMetadata()

In [8]:
# Rule: remove punctuation, i.e. everything but alphanumerical characters.
# Twitter handling and hashtag preservation are switched off —
# presumably because subtitle text contains neither; TODO confirm.
rp = RemovePunctuation(is_twitter=False, keep_hashtags=False)

In [9]:
# Rule: remove all numbers (registered in the pipeline via rm.remove_numbers).
rm = RemoveNumbers()

In [10]:
# Rule: lower-case the text (the pipeline uses the rule's `lowercase` method).
cap = Capitalization()

In [11]:
# Rule: remove stopwords.
# NOTE(review): is_news=False presumably selects the non-news stopword
# handling in textPrep — confirm against the library.
rsw = RemoveStopWords(is_news=False)

In [12]:
# Register the rules in the order they should run. Each pipeline entry pairs
# the callable with the stringified rule, which makes it easy to save the
# pipeline details alongside the data.
rule_steps = (
    (rsm, rsm.remove_subtitle_metadata),
    (rp, rp.remove_punctuation),
    (rm, rm.remove_numbers),
    (cap, cap.lowercase),
    (rsw, rsw.remove_stopwords),
)
pipeline.document_methods = [(method, str(rule)) for rule, method in rule_steps]

In [13]:
# Initialize the pipeline runner (NextGen); its full_preprocess method is
# what applies the configured pipeline to the documents.
runner = NextGen()

In [14]:
# Run the configured rules over the unprocessed documents, then re-attach
# the program ids to the results.
# NOTE(review): zip pairs by position and stops at the shorter input, so this
# assumes full_preprocess returns one result per input document, in input
# order — confirm.
program_ids = subtitled_programs.keys()
raw_documents = list(subtitled_programs.values())
preprocessed_data = runner.full_preprocess(raw_documents, pipeline, ngram_min_freq=-1)
subtitled_programs_preprocessed = dict(zip(program_ids, preprocessed_data))

In [15]:
# Same example program and slice length as inspected earlier, now after the basic pipeline.
subtitled_programs_preprocessed[example_program_id][:example_program_slice]

['politiet',
 'stengte',
 'mobilnett',
 'pga',
 'aksjon',
 'fly',
 'bombetruet',
 'nedgang',
 'uttak',
 'fedrekvote']

In [16]:
# Check the stats for the dataset after running the pipeline

file_name_preprocessed = f"npe_{NST_SAMPLE_SIZE}_basic"

# Always derive the stats from the data produced in this run. Loading an
# existing stats file here would show (and later re-write) numbers from an
# earlier run, which no longer match the data once the rules or the sample
# have changed. It also keeps the value handed to the stats writer in the
# format create_stats_table produces.
table_stats_preprocessed = create_stats_table(list(subtitled_programs_preprocessed.values()))

# Print the raw stats again so both tables can be compared side by side.
print(tabulate(table_stats_previous, headers=["Stat (raw)", "Value"]))
print()
print(tabulate(table_stats_preprocessed, headers=["Stat (basic)", "Value"]))

Stat (raw)                       Value
---------------------  ---------------
dataset_size             999
vocab_size             95757
total_tokens               1.77714e+06
avg_token_freq            18.5589
avg_token_per_doc       1778.92
avg_stopwords_per_doc    971.927

Stat (basic)                 Value
---------------------  -----------
dataset_size              999
vocab_size              81649
total_tokens           604211
avg_token_freq              7.4001
avg_token_per_doc         604.816
avg_stopwords_per_doc       0


In [17]:
# Persist the results of this pipeline: the preprocessed subtitles, their
# stats table and the description of the rules that produced them. All three
# files share the same folder and base file name.
for write_file, payload in (
    (write_subtitles_file, subtitled_programs_preprocessed),
    (write_stats_file, table_stats_preprocessed),
    (write_pipeline_file, pipeline.document_methods),
):
    write_file(folder_path, file_name_preprocessed, payload)