# NST Preprocessing Experiment (NPE)
## Lemmatized Pipeline

Pipeline that applies intermediate preprocessing rules to the lemmatized data, keeping only lemmatized words whose TF-IDF score is high enough (and whose collection frequency is not too high) to be used in the NST Preprocessing Experiment.

In [1]:
# Imports
from tabulate import tabulate
import random

from src.utils.file_management import (
    subtitles_file_exists,
    load_subtitles,
    write_subtitles_file,
    stats_file_exists,
    write_stats_file,
    load_stats_file,
    write_pipeline_file)

from src.utils.tables_and_plots import create_stats_table

from textPrep.settings.common import word_tf_df

from textPrep.preprocessing_pipeline.NextGen import NextGen

# Size of the NST sample to work on. The value is interpolated into the names
# of the files this notebook reads and writes (e.g. "npe_1000_lemmatized").
NST_SAMPLE_SIZE = 1_000

In [2]:
# Input of this notebook: the output file of the previous (lemmatization) pipeline.
# The folder and file name are kept in variables because later cells reuse them.
folder_path = r"nst_preprocessing_experiment/preprocessed_data/"
file_name_previous = f"npe_{NST_SAMPLE_SIZE}_lemmatized"

# Later cells use the result like a dict (.keys(), .values(), indexing by program id).
subtitled_programs_previous_pipeline = load_subtitles(
    folder_path,
    file_name_previous,
)

In [3]:
# Sanity check: show the first few tokens of one randomly picked program.
# The pick is unseeded, so a different program may be shown on every run.
example_program_slice = 10
example_program_id = random.choice(list(subtitled_programs_previous_pipeline))
subtitled_programs_previous_pipeline[example_program_id][:example_program_slice]

['truls',
 'pedersen',
 'åtte',
 'åringe',
 'konkurrere',
 'hverandres',
 'idrett',
 'dagers',
 'trening',
 'gjelde']

In [4]:
# Stats for the data produced by the previous pipeline: read them from the
# stats file when one exists, otherwise compute them from the loaded documents.

table_stats_previous = None

table_stats_previous = (
    list(load_stats_file(folder_path, file_name_previous).items())
    if stats_file_exists(folder_path, file_name_previous)
    else create_stats_table(list(subtitled_programs_previous_pipeline.values()))
)
print(tabulate(table_stats_previous, headers=["Stat (lemmatized)", "Value"]))

Stat (lemmatized)             Value
---------------------  ------------
dataset_size              999
vocab_size              64427
total_tokens           613956
avg_token_freq              9.52948
avg_token_per_doc         614.571
avg_stopwords_per_doc       0


In [5]:
# Find term and document frequencies for all tokens
# NOTE(review): the empty dict is word_tf_df's first argument — presumably an
# initial frequency table to fill in; confirm against textPrep.
frequencies = word_tf_df({}, list(subtitled_programs_previous_pipeline.values()))

In [6]:
# Remove tokens with a TF-IDF score below TFIDF_THRESHOLD and tokens with a
# collection frequency above MAX_COLLECTION_FREQUENCY. If a file with the
# result already exists it is loaded instead of recomputing.

file_name_preprocessed = f"npe_{NST_SAMPLE_SIZE}_lem_tfidf"

# Filter settings, named so the values are documented in one place.
TFIDF_THRESHOLD = 0.5
MAX_COLLECTION_FREQUENCY = 1000

subtitled_programs_preprocessed = None

if subtitles_file_exists(folder_path, file_name_preprocessed):
    subtitled_programs_preprocessed = load_subtitles(folder_path, file_name_preprocessed)
else:
    runner = NextGen()
    tfidf_preprocessed_data = runner.filter_by_tfidf(
        dataset=list(subtitled_programs_previous_pipeline.values()),
        freq=frequencies,
        threshold=TFIDF_THRESHOLD)
    fully_preprocessed_data = runner.filter_by_frequency(
        dataset=tfidf_preprocessed_data, max_freq=MAX_COLLECTION_FREQUENCY)

    # Re-attach the program ids. This relies on the filters returning one
    # document per input document, in the same order as .values() above.
    programs_by_id = dict(zip(
        subtitled_programs_previous_pipeline.keys(), fully_preprocessed_data))

    # zip() stops at the shorter input, so a filter that returned fewer
    # documents would silently drop programs. Fail loudly instead.
    if len(programs_by_id) != len(subtitled_programs_previous_pipeline):
        raise ValueError(
            "Filtering returned a different number of documents than there are "
            f"program ids ({len(programs_by_id)} vs "
            f"{len(subtitled_programs_previous_pipeline)}); cannot map documents "
            "back to programs.")

    subtitled_programs_preprocessed = programs_by_id

In [7]:
# Compare the dataset stats before and after the TF-IDF cleaning. The stats for
# the cleaned data come from the stats file when one exists, otherwise they are
# computed from the preprocessed documents.

table_stats_preprocessed = None

table_stats_preprocessed = (
    list(load_stats_file(folder_path, file_name_preprocessed).items())
    if stats_file_exists(folder_path, file_name_preprocessed)
    else create_stats_table(list(subtitled_programs_preprocessed.values()))
)

# Print both tables, separated by a blank line, for side-by-side reading.
print(tabulate(table_stats_previous, headers=["Stat (lemmatized)", "Value"]))
print()
print(tabulate(table_stats_preprocessed, headers=["Stat (lem + TFIDF)", "Value"]))

Stat (lemmatized)             Value
---------------------  ------------
dataset_size              999
vocab_size              64427
total_tokens           613956
avg_token_freq              9.52948
avg_token_per_doc         614.571
avg_stopwords_per_doc       0

Stat (lem + TFIDF)           Value
---------------------  -----------
dataset_size              999
vocab_size              17784
total_tokens           498286
avg_token_freq             28.0188
avg_token_per_doc         498.785
avg_stopwords_per_doc       0


In [8]:
# Write the preprocessed data and its stats table to file, passing the same
# folder_path / file_name_preprocessed pair the cells above use to look for them.
# NOTE(review): both writes also run when the data and stats were loaded from
# existing files — presumably a harmless overwrite; confirm.

write_subtitles_file(folder_path, file_name_preprocessed, subtitled_programs_preprocessed)
write_stats_file(folder_path, file_name_preprocessed, table_stats_preprocessed)