In [13]:
# Install the project's dependencies; %pip installs into the environment of the running kernel.
%pip install -r ../requirements.txt

Collecting shellingham>=1.3.0 (from typer<1.0.0,>=0.3.0->spacy->-r ../requirements.txt (line 9))
  Using cached shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)
Using cached shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)
Installing collected packages: shellingham
Successfully installed shellingham-1.5.4
Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
import re
from sklearn.pipeline import Pipeline # Pipeline applies a list of transforms. You can also add an estimator at the end, so it will be completely encapsulated.
from sklearn.preprocessing import FunctionTransformer # FunctionTransformer allows to apply an arbitrary function to the data, so we can use it in the pipeline
import unicodedata
import spacy
from typing import List

In [6]:
# Common path prefix of the train_drcat CSV files; a suffix such as "01.csv" is appended when reading.
path = "../llm-proper-data/train_drcat_"

In [9]:
# Read the four train_drcat CSV parts, in order, into one DataFrame per part.
drcat_01, drcat_02, drcat_03, drcat_04 = (
    pd.read_csv(path + suffix)
    for suffix in ("01.csv", "02.csv", "03.csv", "04.csv")
)

In [10]:
# Preview the first rows of the first part to inspect its columns.
drcat_01.head()

Unnamed: 0,text,label,source,fold
0,There are alot reasons to keep our the despise...,0,persuade_corpus,2
1,Driving smart cars that drive by themself has ...,0,persuade_corpus,4
2,"Dear Principal,\n\nI believe that students at ...",0,persuade_corpus,0
3,"Dear Principal,\n\nCommunity service should no...",0,persuade_corpus,0
4,My argument for the development of the driverl...,0,persuade_corpus,3


In [23]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [24]:
def remove_excessive_spaces(text: str) -> str:
    """
    Collapse every run of whitespace into a single space.

    Any whitespace character counts (spaces, tabs, newlines, ...), so line
    breaks are flattened as well. Leading and trailing whitespace is dropped.

    Args:
        text (str): The input text.

    Returns:
        str: The text with each whitespace run replaced by one space and
            no whitespace at either end.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def remove_repeated_non_word_characters(text: str) -> str:
    """
    Collapse consecutive repeats of the same non-word character into one.

    Only identical neighbours are merged ("!!!" becomes "!"), while mixed
    sequences such as "!?" are left untouched. Whitespace is a non-word
    character too, so repeated spaces or newlines are merged as well.
    The result is stripped of leading and trailing whitespace.

    Args:
        text (str): The input text.

    Returns:
        str: The text with each run of an identical non-word character
            reduced to a single occurrence.
    """
    repeated_symbol = re.compile(r'(\W)\1+')
    deduplicated = repeated_symbol.sub(r'\1', text)
    return deduplicated.strip()

def remove_first_line_from_text(text: str) -> str:
    """
    Drop everything up to and including the first newline.

    When the text contains no newline there is no first line to cut, so the
    text is returned as is (apart from stripping). A text that starts with a
    newline only loses that empty first line.

    Args:
        text (str): The input text.

    Returns:
        str: The remaining text, stripped of leading and trailing whitespace.
    """
    _first_line, newline, remainder = text.partition('\n')
    if not newline:
        # No line break at all: nothing to remove.
        return text.strip()
    return remainder.strip()

def remove_last_line_from_text(text: str) -> str:
    """
    Drop the final newline and everything after it.

    A text without any newline is returned as is (apart from stripping).
    If the text ends with blank lines, those blank lines are what gets
    removed.

    Args:
        text (str): The input text.

    Returns:
        str: The remaining text, stripped of leading and trailing whitespace.
    """
    last_line = re.compile(r'\n.*$')
    without_last_line = last_line.sub('', text)
    return without_last_line.strip()

def fix_isolated_commas_in_text(text: str) -> str:
    """
    Attach punctuation to the preceding word by removing the space before it.

    Despite the name, this covers all of . , : ; ! ? and not only commas.
    Exactly one space is removed per punctuation mark, so if several spaces
    precede the mark the others remain. The result is stripped.

    Args:
        text (str): The input text.

    Returns:
        str: The text without the single space in front of each
            punctuation mark.
    """
    return re.sub(r' ([.,:;!?])', r'\1', text).strip()

def keep_words_longer_than(text: str, min_length: int = 2) -> str:
    """
    Keep only the whitespace-separated words that are strictly longer than a threshold.

    Args:
        text (str): The input text.
        min_length (int, optional): Length threshold. A word is kept only when
            it has more than this many characters, so with the default of 2
            every word of one or two characters is discarded.

    Returns:
        str: The surviving words, in their original order, joined by single spaces.
    """
    kept_words = []
    for word in text.split():
        if len(word) > min_length:
            kept_words.append(word)
    return ' '.join(kept_words)

def keep_only_alphabet_characters(text: str) -> str:
    """
    Replace every character that is not an ASCII letter with a space.

    Digits, punctuation, whitespace and non-ASCII letters are each turned
    into one space (not deleted), so the result can contain runs of
    spaces. Leading and trailing whitespace is stripped.

    Args:
        text (str): The input text.

    Returns:
        str: The text containing only a-z, A-Z and spaces.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    return letters_only.strip()

def remove_accents_from_text(text: str) -> str:
    """
    Strip accents by decomposing the text and keeping only its ASCII part.

    The text is NFKD-normalized, which separates base letters from their
    combining marks, and then every non-ASCII character is discarded.
    Note that this also drops characters that have no ASCII decomposition
    at all (for example typographic dashes or quotes), not just accents.

    Args:
        text (str): The input text.

    Returns:
        str: An ASCII-only version of the text.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

def lemmatize_text_with_spacy(text: str) -> str:
    """
    Replace every token of the text by its lemma using spaCy.

    Relies on the module-level ``nlp_spacy`` pipeline, which must be loaded
    before this function is called.

    Args:
        text (str): The input text.

    Returns:
        str: The lemmas of all tokens, joined by single spaces.
    """
    lemmas = (token.lemma_ for token in nlp_spacy(text))
    return ' '.join(lemmas)


# Text-cleaning pipeline. Each step is named after the function it wraps and the
# steps run in the order listed: line removal has to come before whitespace is
# collapsed, because collapsing whitespace also removes the line breaks.
pipeline_clean_text = Pipeline([
    (step.__name__, FunctionTransformer(step))
    for step in (
        remove_first_line_from_text,
        remove_last_line_from_text,
        remove_excessive_spaces,
        remove_repeated_non_word_characters,
        fix_isolated_commas_in_text,
    )
])

# spaCy pipeline used by lemmatize_text_with_spacy.
nlp_spacy = spacy.load('en_core_web_sm')

In [None]:
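# TODO: adapt this cell so the pipeline is applied to the loaded data. The draft below still
# refers to `df['input_text']`, while the loaded drcat_* frames hold the essay in the `text` column.
# Once adapted, the pipeline can be applied to the data like this: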
# df['input_text_clean'] = df['input_text'].apply(pipeline_clean_text.transform)

# df['input_text_clean_simplified'] = df['input_text_clean'].apply(lemmatize_text_with_spacy)
# df['input_text_clean_simplified'] = df['input_text_clean_simplified'].apply(remove_accents_from_text)
# df['input_text_clean_simplified'] = df['input_text_clean_simplified'].apply(keep_words_longer_than)
# df['input_text_clean_simplified'] = df['input_text_clean_simplified'].str.lower()
# df['input_text_clean_simplified'] = df['input_text_clean_simplified'].apply(keep_only_alphabet_characters)
# df['input_text_clean_simplified'] = df['input_text_clean_simplified'].apply(remove_excessive_spaces)