# NST Preprocessing Experiment (NPE)
## POS: Keep only nouns Pipeline

Pipeline with intermediate preprocessing rules that keeps only the tokens tagged with the part-of-speech (POS) tag "NOUN" or "PROPN" and removes all other tokens. It is one of the pipelines used in the NST Preprocessing Experiment.

In [1]:
# Imports
from tabulate import tabulate
import random

from src.utils.file_management import (
    load_subtitles_from_nst_sample,
    stats_file_exists,
    load_subtitles,
    subtitles_file_exists,
    write_subtitles_file,
    write_stats_file,
    load_stats_file,
    write_pipeline_file)

from src.utils.tables_and_plots import create_stats_table

from textPrep.preprocessing_pipeline import (
    Preprocess,
    RemoveSubtitleMetadata,
    RemovePunctuation,
    RemoveNumbers,
    Capitalization,
    RemoveStopWords,
    Lemmatize,
    PartOfSpeech)

from textPrep.settings.common import word_tf_df

from textPrep.preprocessing_pipeline.NextGen import NextGen

# Size of the NST sample to load. The value is also embedded in the names of
# the files this notebook reads and writes (f"npe_{NST_SAMPLE_SIZE}_...").
NST_SAMPLE_SIZE = 1000

In [2]:
# Load the unprocessed NST sample; its values are the documents that this
# pipeline preprocesses and its keys are the program ids.
subtitled_programs = load_subtitles_from_nst_sample(NST_SAMPLE_SIZE)

In [3]:
# Load the file produced by the previous pipeline (POS: verbs) so that its
# output and stats can be compared with the output of this pipeline.
# `folder_path` is reused for every file this notebook reads and writes.
folder_path = r"nst_preprocessing_experiment/preprocessed_data/"
file_name_previous = f"npe_{NST_SAMPLE_SIZE}_pos_verb"

subtitled_programs_previous_pipeline = load_subtitles(folder_path, file_name_previous)

In [4]:
# Pick a random program from the loaded sample and preview the first
# `example_program_slice` items of its output from the previous pipeline.
# NOTE(review): no random seed is set, so a different example is shown on
# every run. The id is drawn from `subtitled_programs`, so it is assumed to
# also be a key of `subtitled_programs_previous_pipeline` — TODO confirm.
example_program_id = random.choice(list(subtitled_programs.keys()))
example_program_slice = 10
subtitled_programs_previous_pipeline[example_program_id][:example_program_slice]

['bilen',
 'utelukkende',
 'opptatt',
 'miljøet',
 'grunnene',
 'tesla',
 'misunnelse',
 'motoren',
 'responsen',
 'glipp']

In [5]:
# Stats for the data produced by the previous pipeline: reuse the saved stats
# file when one exists, otherwise build the table with create_stats_table.
table_stats_previous = (
    list(load_stats_file(folder_path, file_name_previous).items())
    if stats_file_exists(folder_path, file_name_previous)
    else create_stats_table(list(subtitled_programs_previous_pipeline.values()))
)

print(tabulate(table_stats_previous, headers=["Stat (POS-verbs)", "Value"]))

Stat (POS-verbs)             Value
---------------------  -----------
dataset_size              999
vocab_size              19984
total_tokens           394057
avg_token_freq             19.7186
avg_token_per_doc         394.451
avg_stopwords_per_doc       0


In [6]:
# Initialize the textPrep pipeline; its `document_methods` rule list is
# assigned further down, once all rules have been created.
pipeline = Preprocess()

In [7]:
# Rule for removing technical metadata from subtitles
# (registered in the pipeline through `rsm.remove_subtitle_metadata`)
rsm = RemoveSubtitleMetadata()

In [8]:
# Rule for keeping only tokens whose U-POS tag is "NOUN" or "PROPN".
# The tag tuple is handed to `pos.keep_pos` as the "pos" extra argument when
# the rule is added to the pipeline.
UPOS_TAGS_NOUNS = ("NOUN", "PROPN")
pos = PartOfSpeech(is_english=False, spacy_language_pipeline='nb_core_news_lg')

In [9]:
# Rule for lemmatizing tokens; uses the same non-English spaCy pipeline
# ('nb_core_news_lg', spaCy's large Norwegian Bokmål model) as the POS rule
lm = Lemmatize(is_english=False, spacy_language_pipeline='nb_core_news_lg')

In [10]:
# Rule for removing punctuation, i.e. everything but alphanumerical characters
# (Twitter handling and hashtag preservation are both switched off)
rp = RemovePunctuation(is_twitter=False, keep_hashtags=False)

In [11]:
# Rule for removing all numbers
# (registered in the pipeline through `rm.remove_numbers`)
rm = RemoveNumbers()

In [12]:
# Rule for lower-casing text
# (registered in the pipeline through `cap.lowercase`)
cap = Capitalization()

In [13]:
# Rule for removing stopwords; it is the last entry in the pipeline's
# `document_methods` list
rsw = RemoveStopWords(is_news=False)

In [14]:
# Register the rules on the pipeline. Every entry is a tuple of
# (rule method, str(rule)), optionally followed by a dict of extra arguments
# for that method. The stringified rule makes it easy to save the pipeline
# details afterwards.
# NOTE(review): the rules are presumably applied in list order — TODO confirm.
pipeline.document_methods = [
    (rsm.remove_subtitle_metadata, str(rsm)),
    (rp.remove_punctuation, str(rp)),
    (rm.remove_numbers, str(rm)),
    (pos.keep_pos, str(pos), {"pos": UPOS_TAGS_NOUNS}),
    (lm.lemmatize_document, str(lm)),
    (cap.lowercase, str(cap)),
    (rsw.remove_stopwords, str(rsw)),
]

In [15]:
# Initialize the pipeline runner; it runs the pipeline over the documents
# (`full_preprocess`) and provides the TF-IDF and frequency filters used below
runner = NextGen()

In [16]:
# Preprocess the data with the given rules, or reuse the result of an earlier
# run when the preprocessed subtitles file already exists.

file_name_preprocessed = f"npe_{NST_SAMPLE_SIZE}_pos_only_noun"

if subtitles_file_exists(folder_path, file_name_preprocessed):
    subtitled_programs_preprocessed = load_subtitles(folder_path, file_name_preprocessed)
else:
    # Run every document of the sample through the pipeline rules.
    preprocessed_data = runner.full_preprocess(
        list(subtitled_programs.values()),
        pipeline,
        ngram_min_freq=-1,
    )

    # The word frequencies of the preprocessed documents feed the TF-IDF filter.
    frequencies = word_tf_df({}, preprocessed_data)

    tfidf_preprocessed_data = runner.filter_by_tfidf(
        dataset=preprocessed_data,
        freq=frequencies,
        threshold=0.5,
    )
    fully_preprocessed_data = runner.filter_by_frequency(
        dataset=tfidf_preprocessed_data,
        max_freq=1000,
    )

    # Pair each program id with its preprocessed document. Joining and
    # re-splitting on whitespace leaves only whitespace-free tokens.
    # NOTE(review): zip() stops at the shorter input, so this assumes the
    # runner returns exactly one document per program, in the same order as
    # `subtitled_programs` — TODO confirm.
    subtitled_programs_preprocessed = {
        program_id: " ".join(subtitles).split()
        for program_id, subtitles in zip(subtitled_programs.keys(), fully_preprocessed_data)
    }

In [17]:
# Preview the same example program after this pipeline, for comparison with
# the preview of the previous pipeline's output shown earlier
subtitled_programs_preprocessed[example_program_id][:example_program_slice]

['bile',
 'miljø',
 'grunne',
 'tesla',
 'misunnelse',
 'motor',
 'vibrasjon',
 'respons',
 'glipp',
 'elbil']

In [18]:
# Stats for the dataset after running this pipeline: reuse the saved stats
# file when one exists, otherwise build the table with create_stats_table.
table_stats_preprocessed = (
    list(load_stats_file(folder_path, file_name_preprocessed).items())
    if stats_file_exists(folder_path, file_name_preprocessed)
    else create_stats_table(list(subtitled_programs_preprocessed.values()))
)

# Print the previous pipeline's stats first, then a blank line, then the
# stats of this pipeline, so the two tables can be compared.
print(tabulate(table_stats_previous, headers=["Stat (POS - verbs)", "Value"]), end="\n\n")
print(tabulate(table_stats_preprocessed, headers=["Stat (POS - Only nouns)", "Value"]))

Stat (POS - verbs)           Value
---------------------  -----------
dataset_size              999
vocab_size              19984
total_tokens           394057
avg_token_freq             19.7186
avg_token_per_doc         394.451
avg_stopwords_per_doc       0

Stat (POS - Only nouns)          Value
-------------------------  -----------
dataset_size                  999
vocab_size                  13753
total_tokens               288623
avg_token_freq                 20.9862
avg_token_per_doc             288.912
avg_stopwords_per_doc           0


In [19]:
# Build the (method, description) pairs that are written to the pipeline file.
# Entries that carry extra arguments are reduced to two elements, with the
# extra arguments folded into the description string; two-element entries are
# kept unchanged.
methods = [
    (entry[0], f"{entry[1]}, extra_args={entry[2:]}") if len(entry) > 2 else entry
    for entry in pipeline.document_methods
]

In [20]:
# Write preprocessed data to file: the preprocessed subtitles, the stats table
# and the pipeline method descriptions (`methods`), all stored in
# `folder_path` under `file_name_preprocessed`.
# NOTE(review): these writes run unconditionally, i.e. also when the subtitles
# and stats were loaded from already existing files earlier in the notebook.

write_subtitles_file(folder_path, file_name_preprocessed, subtitled_programs_preprocessed)
write_stats_file(folder_path, file_name_preprocessed, table_stats_preprocessed)
write_pipeline_file(folder_path, file_name_preprocessed, methods)