# Data Cleaning and Text Standardization.

a. Make the text format uniform (e.g., case normalization; hint: convert all letters to lowercase).
If necessary, clean the comment text (e.g., remove URLs, subreddit references, …).

b. Stop words such as "the" and "a" contribute little to our ML tasks, since they carry
very little information. Take care of these kinds of words.

c. Reduce words to their base or root form using Stemming/Lemmatization. This helps in
reducing inflected words to a common base form. (Hint: Consider using libraries like NLTK
or spaCy for tokenization).


In [None]:
!pip install spacy

!python -m spacy download en_core_web_sm

In [None]:
# import needed python libraries

%matplotlib inline
from tqdm import tqdm
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import html
import spacy
# Load the small English spaCy pipeline once for the whole notebook. The
# "parser", "ner" and "textcat" components are switched off; the cleaning code
# further down only reads per-token stop-word/punctuation/space flags and lemmas.
nlp = spacy.load("en_core_web_sm", disable=["parser","ner","textcat"])
from langdetect import detect

In [None]:
# Read the three input CSV files into DataFrames, in the order they are listed.
df_supervised, df_unsupervised, df_target = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/data_supervised.csv",
        "../data/data_unsupervised.csv",
        "../data/target_supervised.csv",
    )
)

# Quick sanity check: print the (rows, columns) shape of each frame.
print(*(frame.shape for frame in (df_supervised, df_unsupervised, df_target)))

a. Make the text format uniform (e.g., case normalization; hint: convert all letters to lowercase). If necessary, clean the comment text (e.g., remove URLs, subreddit references, …).



In [None]:
# Matches http(s) URLs, bare "www." links and reddit references (r/sub, u/user,
# with or without a leading slash). The (?<!\w) guard stops the reddit part
# from firing inside ordinary text such as "or/and" or "your/their".
remove_pattern = r'https?://\S+|www\.\S+|(?<!\w)/?[ru]/\w+'


def normalize_body(text_series: pd.Series) -> pd.Series:
    """Return a normalized copy of a column of raw comment text.

    Steps, in order: replace missing values with '', cast to ``str``, decode
    HTML entities, lowercase, replace every ``remove_pattern`` match with a
    space, collapse whitespace runs to one space, strip both ends.

    Args:
        text_series: Series holding the raw text; may contain NaN.

    Returns:
        A new Series of cleaned strings with the same index as the input.
    """
    return (
        text_series
        .fillna('')                                     # missing values -> empty string
        .astype(str)                                    # make sure every cell is a str
        # Decode entities BEFORE lowercasing: characters produced by decoding
        # (e.g. "&#65;" -> "A") must also be lowercased, and named entities are
        # case-sensitive, so lowercasing first could change what they decode to.
        .map(html.unescape)
        .str.lower()                                    # case normalization (task a.)
        .str.replace(remove_pattern, ' ', regex=True)   # drop URLs and r/, u/ references
        .str.replace(r'\s+', ' ', regex=True)           # collapse whitespace runs
        .str.strip()                                    # trim leading/trailing spaces
    )


# Apply the same normalization to both datasets.
df_supervised['body_normalized'] = normalize_body(df_supervised['body'])
df_unsupervised['body_normalized'] = normalize_body(df_unsupervised['body'])


In [None]:
# Sanity check: show the first rows of the raw text next to its normalized form.
df_supervised.loc[:, ["body", "body_normalized"]].head()

b. Stop words such as "the" and "a" contribute little to our ML tasks, since they carry very little information. Take care of these kinds of words.

c. Reduce words to their base or root form using Stemming/Lemmatization. This helps in reducing inflected words to a common base form. (Hint: Consider using libraries like NLTK or spaCy for tokenization).

In [None]:
def process_text_full(text_series, batch_size=2000):
    """Lemmatize texts with spaCy, dropping stop words, punctuation and whitespace.

    Args:
        text_series: Sized iterable of strings (it is passed both to ``len``
            and to ``nlp.pipe``).
        batch_size: Batch size forwarded to ``nlp.pipe``.

    Returns:
        A list with one string per input text: the lemmas of the kept tokens,
        joined by single spaces.
    """
    n_docs = len(text_series)

    # Stream the texts through the module-level spaCy pipeline; tqdm wraps the
    # stream so a progress bar is shown while documents are consumed.
    parsed_docs = tqdm(
        nlp.pipe(text_series, batch_size=batch_size),
        total=n_docs,
        desc="Processing",
    )

    return [
        " ".join(
            tok.lemma_  # (c) keep the lemma of every surviving token
            for tok in parsed
            # (b) skip stop words, punctuation and whitespace tokens
            if not (tok.is_stop or tok.is_punct or tok.is_space)
        )
        for parsed in parsed_docs
    ]

# Clean each dataset in turn and write it to CSV right after it is computed
# (supervised first, then unsupervised).
for banner, frame, out_path in (
    ("Elaboration of SUPERVISED dataset (smaller)...", df_supervised, "./clean_supervised.csv"),
    ("Elaboration of UNSUPERVISED  dataset (bigger)...", df_unsupervised, "./clean_unsupervised.csv"),
):
    print(banner)
    # `frame` is the same object as the named DataFrame, so the new column is
    # added to df_supervised / df_unsupervised themselves.
    frame['body_clean'] = process_text_full(frame['body_normalized'].astype(str))
    frame.to_csv(out_path, index=False)