# Full loading and preprocessing of the data

## Loading the data

In [6]:
import os
import bz2
import pandas as pd

In [4]:
def parse_raw_files(root_path:str, file_path_string:str):
    """
    Read a bz2-compressed text file of labelled lines into a pandas DataFrame.

    Each line is split once at its first space: the part before the space is
    the label (with the ``__label__`` marker removed), the rest is the text.
    Lines that contain no space after stripping surrounding whitespace are
    skipped.

    Args:
        root_path: Directory that contains the compressed file.
        file_path_string: File name (or relative path) inside ``root_path``.

    Returns:
        DataFrame with an integer ``label`` column and a ``text`` column.
    """
    archive_path = os.path.join(root_path, file_path_string)

    with bz2.open(archive_path, 'rt', encoding='utf-8') as handle:
        # Split every stripped line once, at the first space only.
        candidates = (raw.strip().split(' ', 1) for raw in handle)
        rows = [
            (pieces[0].replace('__label__', ''), pieces[1])
            for pieces in candidates
            if len(pieces) == 2
        ]

    frame = pd.DataFrame(rows, columns=['label', 'text'])
    # Labels are read as strings; convert them to integers.
    frame['label'] = frame['label'].astype(int)
    return frame

In [3]:
# Folder that holds the raw data files
path = "../raw_data"

# CSV copies of the raw data, stored in the same folder. They are derived from
# `path` so that changing the folder in one place keeps all paths in sync.
train_path = f"{path}/raw_train_data.csv"
test_path = f"{path}/raw_test_data.csv"

In [8]:
# Load the training data from the CSV copy if it exists; otherwise parse the
# raw .bz2 archive once and store the result as CSV for later runs.
if os.path.exists(train_path):
    train_df = pd.read_csv(train_path)
    print("File found. Data loaded.")
else:
    print("File not found. Performing direct read operation.")
    train_df = parse_raw_files(path, 'train.ft.txt.bz2')
    # Write to the same path that is checked above so the copy is found next time.
    train_df.to_csv(train_path, index=False)

# Preview the first rows. A notebook only renders a cell's last top-level
# expression, so the preview has to sit outside the if/else to be shown.
train_df.head()

File found. Data loaded.


## Cleaning function

In [9]:
import string
import re
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [10]:
def clean_text(text):
    """
    Normalise one raw text string before vectorization.

    Steps, in order: strip surrounding whitespace, lowercase, remove
    standalone digit runs, remove every character in ``string.punctuation``,
    tokenize with ``word_tokenize`` and lemmatize each token with
    ``WordNetLemmatizer``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    normalized = text.strip().lower()

    # Drop digit runs delimited by word boundaries (e.g. "2020");
    # digits glued to letters, such as "mp3", are kept.
    without_numbers = re.sub(r'\b\d+\b', '', normalized)

    # Delete every punctuation character listed in string.punctuation.
    punctuation_class = rf"[{re.escape(string.punctuation)}]"
    without_punctuation = re.sub(punctuation_class, '', without_numbers)

    tokens = word_tokenize(without_punctuation)
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

## TF-IDF Vectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Instantiate a TF-IDF vectorizer over unigrams and bigrams whose vocabulary
# is capped at 10,000 features
vectorizer = TfidfVectorizer(max_features=10_000, ngram_range=(1, 2))

## Pipeline

In [52]:
def preprocess_series(X):
    """
    Apply ``clean_text`` to every element of ``X``.

    Returns a plain list of cleaned strings, in the same order as the input.
    """
    return list(map(clean_text, X))

In [53]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [54]:
# Wrap the cleaning function in a FunctionTransformer so it can be used as a
# pipeline step
transformer = FunctionTransformer(preprocess_series)

In [58]:
# Build the pipeline: the texts are cleaned first, then vectorized
preproc_pipe = Pipeline(
    steps=[
        ('cleaning', transformer),
        ('vectorizer', vectorizer),
    ]
)
preproc_pipe

0,1,2
,steps,"[('cleaning', ...), ('vectorizer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function pre...t 0x143ad9f30>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


### Testing the pipeline

In [56]:
# Draw a 1% sample of the training data for quick experiments. The fixed seed
# makes the sample, and everything computed from it, reproducible across runs.
train_df_small = train_df.sample(frac=0.01, random_state=42)

In [57]:
# Quick check: fit the pipeline on the small sample and transform it in one step
X_small_processed = preproc_pipe.fit_transform(train_df_small['text'])

In [None]:
# Fit the pipeline on the text column of the full training set.
# This step takes around 70 minutes on the full dataset.
preproc_pipe.fit(train_df['text'])

0,1,2
,steps,"[('cleaning', ...), ('vectorizer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,func,<function pre...t 0x143ad9f30>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [68]:
# Transform the small sample with the fitted pipeline (no refitting here)
X_small_processed = preproc_pipe.transform(train_df_small['text'])

In [None]:
import pickle

In [None]:
# Export the pipeline as a pickle file.
# NOTE: the 'cleaning' step wraps a plain function, and pickle stores functions
# by reference (module + name), not by value. preprocess_series and clean_text
# therefore have to be defined or importable wherever this file is loaded.
pipeline_path = "../preprocessing_pipelines/preproc_pipeline_ml.pkl"
# Create the target folder if it is missing so the export does not fail with
# FileNotFoundError.
os.makedirs(os.path.dirname(pipeline_path), exist_ok=True)
with open(pipeline_path, "wb") as file:
    pickle.dump(preproc_pipe, file)

In [69]:
# Display the transformed sample (the output of the pipeline's vectorizer step)
X_small_processed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2638276 stored elements and shape (36000, 10000)>

### Testing if the pipe was saved correctly

In [70]:
# Reload the exported pipeline from disk.
# Only unpickle files from a trusted source: pickle.load can execute arbitrary code.
with open("../preprocessing_pipelines/preproc_pipeline_ml.pkl", "rb") as file:
    preproc_pipeline_loaded = pickle.load(file)

In [71]:
# Transform the same sample with the reloaded pipeline for comparison
X_small_processed_2 = preproc_pipeline_loaded.transform(train_df_small['text'])

In [72]:
# Quick sanity check: the sums of both matrices should match if the reloaded
# pipeline produces the same features (this is not an element-wise comparison)
X_small_processed.sum(), X_small_processed_2.sum()

(np.float64(269500.0850556207), np.float64(269500.0850556207))