# NST Preprocessing Experiment (NPE)
## Lemmatized Pipeline

Pipeline with intermediate preprocessing rules that reduces the subtitles to lemmatized words only, for use in the NST Preprocessing Experiment.

In [1]:
# Imports
from tabulate import tabulate
import random

from src.utils.file_management import (
    load_subtitles_from_nst_sample,
    stats_file_exists,
    load_subtitles,
    subtitles_file_exists,
    write_subtitles_file,
    write_stats_file,
    load_stats_file,
    write_pipeline_file)

from src.utils.tables_and_plots import create_stats_table

from textPrep.preprocessing_pipeline import (
    Preprocess,
    RemoveSubtitleMetadata,
    RemovePunctuation,
    RemoveNumbers,
    Capitalization,
    RemoveStopWords,
    Lemmatize)

from textPrep.preprocessing_pipeline.NextGen import NextGen

# Size of the NST sample to load; it is also embedded in the names of the files read and written below
NST_SAMPLE_SIZE = 1000

In [2]:
# Load the unprocessed NST sample; these raw subtitles are the input that this pipeline preprocesses
subtitled_programs = load_subtitles_from_nst_sample(NST_SAMPLE_SIZE)

In [3]:
# Locate and load the output written by the previous (basic) pipeline
file_name_previous = f"npe_{NST_SAMPLE_SIZE}_basic"
folder_path = "nst_preprocessing_experiment/preprocessed_data/"

subtitled_programs_previous_pipeline = load_subtitles(
    folder_path,
    file_name_previous,
)

In [4]:
# Pick a random program and preview its first tokens as produced by the previous pipeline.
# NOTE(review): the id is drawn from the raw sample's keys but used to index the previous
# pipeline's output -- confirm both dictionaries share the same program ids.
example_program_slice = 10
example_program_id = random.choice(list(subtitled_programs))
subtitled_programs_previous_pipeline[example_program_id][:example_program_slice]

['dårlig',
 'svømme',
 'gode',
 'seilere',
 'liker',
 'svømme',
 'oppå',
 'vannet',
 'forsøk',
 'bunns']

In [5]:
# Stats for the data produced by the previous pipeline:
# read them from the stats file when it exists, otherwise compute them from the loaded subtitles

table_stats_previous = (
    list(load_stats_file(folder_path, file_name_previous).items())
    if stats_file_exists(folder_path, file_name_previous)
    else create_stats_table(list(subtitled_programs_previous_pipeline.values()))
)
print(tabulate(table_stats_previous, headers=["Stat (basic)", "Value"]))

Stat (basic)                 Value
---------------------  -----------
dataset_size              999
vocab_size              81649
total_tokens           604211
avg_token_freq              7.4001
avg_token_per_doc         604.816
avg_stopwords_per_doc       0


In [6]:
# Initialize the textPrep pipeline object; the preprocessing rules are attached to it further down
pipeline = Preprocess()

In [7]:
# Rule for removing technical metadata from subtitles
rsm = RemoveSubtitleMetadata()

In [8]:
# Rule for lemmatizing tokens, configured as non-English with the spaCy pipeline 'nb_core_news_lg'
lm = Lemmatize(spacy_language_pipeline='nb_core_news_lg', is_english=False)

In [9]:
# Rule for removing everything but alphanumerical characters (Twitter mode off, hashtags not kept)
rp = RemovePunctuation(keep_hashtags=False, is_twitter=False)

In [10]:
# Rule for removing all numbers
rm = RemoveNumbers()

In [11]:
# Rule for lower-casing text (its `lowercase` method is the one registered on the pipeline)
cap = Capitalization()

In [12]:
# Rule for removing stopwords, created with the news option switched off
rsw = RemoveStopWords(is_news=False)

In [13]:
# Register the rules on the pipeline in execution order.
# Each entry pairs the rule's method with the stringified rule, which makes it easy to save the pipeline details.
pipeline.document_methods = [
    (method, str(rule))
    for method, rule in (
        (rsm.remove_subtitle_metadata, rsm),
        (lm.lemmatize_document, lm),
        (rp.remove_punctuation, rp),
        (rm.remove_numbers, rm),
        (cap.lowercase, cap),
        (rsw.remove_stopwords, rsw),
    )
]

In [14]:
# Initialize the pipeline runner; its `full_preprocess` method is what applies the pipeline to the documents
runner = NextGen()

In [15]:
# Preprocess the data with the given rules, or reuse the result if it has already been written to file

file_name_preprocessed = f"npe_{NST_SAMPLE_SIZE}_lemmatized"

if subtitles_file_exists(folder_path, file_name_preprocessed):
    subtitled_programs_preprocessed = load_subtitles(folder_path, file_name_preprocessed)
else:
    preprocessed_data = runner.full_preprocess(list(subtitled_programs.values()), pipeline, ngram_min_freq=-1)
    subtitled_programs_preprocessed = {}
    for program_id, subtitles in zip(subtitled_programs, preprocessed_data):
        # Replace every occurrence of "gåre" with a space and re-tokenize on whitespace.
        # NOTE(review): this is a substring replacement, so a token that merely contains
        # "gåre" is split as well -- confirm that this is intended.
        subtitled_programs_preprocessed[program_id] = " ".join(subtitles).replace("gåre", " ").split()

In [19]:
# Preview the same example program and slice as before, now taken from the lemmatized output
subtitled_programs_preprocessed[example_program_id][:example_program_slice]

['dårlig',
 'svømme',
 'seiler',
 'svømme',
 'oppå',
 'forsøk',
 'bunns',
 'vest',
 'utfordring',
 'utroligst']

In [17]:
# Stats for the dataset after running this pipeline:
# read them from the stats file when it exists, otherwise compute them from the preprocessed subtitles

table_stats_preprocessed = (
    list(load_stats_file(folder_path, file_name_preprocessed).items())
    if stats_file_exists(folder_path, file_name_preprocessed)
    else create_stats_table(list(subtitled_programs_preprocessed.values()))
)

# Print the previous and the new stats one after the other for comparison
print(tabulate(table_stats_previous, headers=["Stat (basic)", "Value"]))
print()
print(tabulate(table_stats_preprocessed, headers=["Stat (lemmatized)", "Value"]))

Stat (basic)                 Value
---------------------  -----------
dataset_size              999
vocab_size              81649
total_tokens           604211
avg_token_freq              7.4001
avg_token_per_doc         604.816
avg_stopwords_per_doc       0

Stat (lemmatized)             Value
---------------------  ------------
dataset_size              999
vocab_size              64426
total_tokens           610326
avg_token_freq              9.47329
avg_token_per_doc         610.937
avg_stopwords_per_doc       0


In [18]:
# Write the preprocessed subtitles, their stats and the pipeline rules to file, all under the same base name
output_target = (folder_path, file_name_preprocessed)

write_subtitles_file(*output_target, subtitled_programs_preprocessed)
write_stats_file(*output_target, table_stats_preprocessed)
write_pipeline_file(*output_target, pipeline.document_methods)