In [1]:
%pip install -r ../requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [25]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline # Pipeline applies a list of transforms. You can also add an estimator at the end, so it will be completely encapsulated.
from sklearn.preprocessing import FunctionTransformer # FunctionTransformer allows to apply an arbitrary function to the data, so we can use it in the pipeline
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import unicodedata
import spacy
from typing import List

In [2]:
# Common prefix of the intermediate CSV files; the "01.csv" .. "04.csv"
# suffixes are appended when the files are loaded.
path = "../data/inter/train_drcat_"

In [3]:
# Load the four intermediate CSV files, one DataFrame per file, in order.
drcat_01, drcat_02, drcat_03, drcat_04 = (
    pd.read_csv(path + suffix)
    for suffix in ("01.csv", "02.csv", "03.csv", "04.csv")
)

In [4]:
# Preview the first rows of the first dataset to check its columns.
drcat_01.head()

Unnamed: 0,text,label,source,fold
0,There are alot reasons to keep our the despise...,0,persuade_corpus,2
1,Driving smart cars that drive by themself has ...,0,persuade_corpus,4
2,"Dear Principal,\n\nI believe that students at ...",0,persuade_corpus,0
3,"Dear Principal,\n\nCommunity service should no...",0,persuade_corpus,0
4,My argument for the development of the driverl...,0,persuade_corpus,3


## Preprocessing data

In [5]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 656.4 kB/s eta 0:00:20
     - -------------------------------------- 0.6/12.8 MB 4.8 MB/s eta 0:00:03
     ---- ----------------------------------- 1.5/12.8 MB 8.9 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.8 MB 12.4 MB/s eta 0:00:01
     ------------- -------------------------- 4.2/12.8 MB 15.8 MB/s eta 0:00:01
     ------------------- -------------------- 6.3/12.8 MB 20.1 MB/s eta 0:00:01
     --------------------------- ------------ 8.9/12.8 MB 24.7 MB/s eta 0:00:01
     ------------------------------- ------- 10.5/12.8 MB 36.4 MB/s eta 0:00:01
     -----------------------------

In [6]:
def remove_excessive_spaces(text: str) -> str:
    """
    Collapse every run of whitespace into a single space.

    Any sequence of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by one space, then leading and trailing whitespace is trimmed.

    Args:
        text (str): The input text.

    Returns:
        str: The text with whitespace runs collapsed and both ends trimmed.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def remove_repeated_non_word_characters(text: str) -> str:
    """
    Collapse consecutive repeats of the same non-word character into one.

    A non-word character is anything matched by the regex class ``\\W``
    (punctuation, whitespace, symbols). Only runs of the *same* character
    are collapsed; alternating characters such as ``!?!?`` are untouched.

    Args:
        text (str): The input text.

    Returns:
        str: The text with repeated non-word characters reduced to a single
        occurrence and both ends trimmed.
    """
    repeated_symbol = re.compile(r'(\W)\1+')
    deduplicated = repeated_symbol.sub(r'\1', text)
    return deduplicated.strip()

def remove_first_line_from_text(text: str) -> str:
    """
    Drop everything up to and including the first newline.

    Text that contains no newline is left as it is (apart from trimming),
    so a single-line text is never emptied by this step.

    Args:
        text (str): The input text.

    Returns:
        str: The text without its first line, with both ends trimmed.
    """
    # Without re.MULTILINE, '^' anchors only at the very start of the string,
    # so at most one line is removed.
    first_line = re.compile(r'^.*\n')
    remainder = first_line.sub('', text)
    return remainder.strip()

def remove_last_line_from_text(text: str) -> str:
    """
    Drop the final line of the text together with the newline before it.

    Text that contains no newline is left as it is (apart from trimming).

    Args:
        text (str): The input text.

    Returns:
        str: The text without its last line, with both ends trimmed.
    """
    # '.' does not match a newline and '$' anchors at the end of the string,
    # so only the trailing line can match.
    last_line = re.compile(r'\n.*$')
    remainder = last_line.sub('', text)
    return remainder.strip()

def fix_isolated_commas_in_text(text: str) -> str:
    """
    Re-attach punctuation that is separated from the previous word by a space.

    Despite the name, this covers all of ``. , : ; ! ?``: a single space
    directly in front of one of these marks is removed.

    Args:
        text (str): The input text.

    Returns:
        str: The text with the space before punctuation removed and both
        ends trimmed.
    """
    space_before_punctuation = re.compile(r' ([.,:;!?])')
    return space_before_punctuation.sub(r'\1', text).strip()

def keep_words_longer_than(text: str, min_length: int = 2) -> str:
    """
    Keep only the whitespace-separated words that are longer than a threshold.

    Args:
        text (str): The input text.
        min_length (int, optional): Length threshold. A word is kept only if
            it has strictly more characters than this value. Defaults to 2.

    Returns:
        str: The kept words, in their original order, joined by single spaces.
    """
    kept_words = []
    for candidate in text.split():
        if len(candidate) > min_length:
            kept_words.append(candidate)
    return ' '.join(kept_words)

def keep_only_alphabet_characters(text: str) -> str:
    """
    Replace every character that is not an ASCII letter with a space.

    Each non-letter is replaced one-for-one, so runs of non-letters leave
    runs of spaces inside the text; only the ends are trimmed.

    Args:
        text (str): The input text.

    Returns:
        str: The text containing only ``a-z``/``A-Z`` and spaces, with both
        ends trimmed.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    letters_only = non_letter.sub(' ', text)
    return letters_only.strip()

def remove_accents_from_text(text: str) -> str:
    """
    Strip accents by reducing the text to plain ASCII.

    The text is decomposed with Unicode NFKD normalization, which separates
    base letters from their combining marks, and then every character that
    cannot be encoded as ASCII is discarded. Characters with no ASCII
    decomposition are therefore removed entirely.

    Args:
        text (str): The input text.

    Returns:
        str: The ASCII-only version of the text.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

def lemmatize_text_with_spacy(text: str) -> str:
    """
    Replace every token of the text with its spaCy lemma.

    The text is processed with the module-level ``nlp_spacy`` pipeline, which
    must be loaded before this function is called.

    Args:
        text (str): The input text.

    Returns:
        str: The lemmas of all tokens, joined by single spaces.
    """
    lemmas = []
    for token in nlp_spacy(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)


# Text-cleaning pipeline. The steps run in the listed order: the first and
# last lines are removed, then whitespace runs are collapsed, repeated
# non-word characters are reduced, and spaces before punctuation are removed.
pipeline_clean_text = Pipeline(
    steps=[
        ('remove_first_line_from_text',
         FunctionTransformer(func=remove_first_line_from_text)),
        ('remove_last_line_from_text',
         FunctionTransformer(func=remove_last_line_from_text)),
        ('remove_excessive_spaces',
         FunctionTransformer(func=remove_excessive_spaces)),
        ('remove_repeated_non_word_characters',
         FunctionTransformer(func=remove_repeated_non_word_characters)),
        ('fix_isolated_commas_in_text',
         FunctionTransformer(func=fix_isolated_commas_in_text)),
    ]
)

# spaCy model used by lemmatize_text_with_spacy.
nlp_spacy = spacy.load('en_core_web_sm')

In [7]:
# Run the cleaning pipeline on every essay of each dataset and store the
# result in a new 'clean_text' column.
for frame in (drcat_01, drcat_02, drcat_03, drcat_04):
    frame['clean_text'] = frame['text'].apply(pipeline_clean_text.transform)

Merge the four datasets into a single DataFrame

In [8]:
# Stack the four datasets, discard the bookkeeping columns, and remove rows
# that are exact duplicates.
df = (
    pd.concat([drcat_01, drcat_02, drcat_03, drcat_04], axis=0, ignore_index=True)
    .drop(columns=['essay_id', 'source', 'fold'])
    .drop_duplicates()
)

In [11]:
# Class distribution of the merged data.
df.label.value_counts()

label
0    29792
1    19256
Name: count, dtype: int64

In [15]:
# List the columns that remain after the merge.
df.columns

Index(['text', 'label', 'clean_text', 'prompt'], dtype='object')

In [17]:
# Keep only the label and the cleaned text.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
df = df.drop(columns=['text', 'prompt'], errors='ignore')
df

Unnamed: 0,label,clean_text
0,0,Another reason is that the Electiral College r...
1,0,First let's start off with the pros of having ...
2,0,I believe that students at school should not b...
3,0,Community service should not be required by al...
4,0,"First of all, the diverless cars will help ton..."
...,...,...
159394,1,Seeking advice from more than one person when ...
159410,1,"On one hand, this technology could help teache..."
159412,1,The opportunity to travel the world while work...
159413,1,I am writing to you regarding the proposal to ...


## Data splitting

In [29]:
# Features: the cleaned essay texts as an array. Target: the label column.
X = df.clean_text.values
y = df.label

Balancing data

In [33]:
# Randomly undersample the majority class; random_state makes the selection
# reproducible.
sampler = RandomUnderSampler(random_state=42, sampling_strategy='majority')

# The 1-D array of texts is reshaped into a single-column 2-D array before
# resampling.
X_balanced, y_balanced = sampler.fit_resample(X.reshape(-1, 1), y)
y_balanced.value_counts() # Class counts after resampling

label
0    19256
1    19256
Name: count, dtype: int64

In [31]:
# Split the balanced data into train and test sets. No test_size is passed,
# so scikit-learn's default proportion applies; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, random_state=42)

## Weights and Biases

In [34]:
import wandb

import params

In [35]:
!wandb login

wandb: Currently logged in as: pedro_miguel (pedro_miguel-universidade-federal-do-rio-grande-do-norte). Use `wandb login --relogin` to force relogin


In [36]:
# Start a W&B run in the project named by params.WANDB_PROJECT.
wandb.init(project=params.WANDB_PROJECT)

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: pedro_miguel (pedro_miguel-universidade-federal-do-rio-grande-do-norte). Use `wandb login --relogin` to force relogin


In [37]:
# Two artifacts: one of type "raw" for the original archive and one of type
# "dataset" for the intermediate CSV files.
external_raw_data_artifact = wandb.Artifact("daigt_proper_train_dataset", type="raw")
external_dataset_artifact = wandb.Artifact("drcat_data", type="dataset")

In [39]:
# Attach the raw zip archive to the raw-data artifact.
external_raw_data_artifact.add_file('../data/raw/daigt-proper-train-dataset.zip')

# Attach the four intermediate CSV files (01..04) to the dataset artifact.
for i in range(1, 5):
    external_dataset_artifact.add_file(f'../data/inter/train_drcat_0{i}.csv')

In [40]:
# Log both artifacts through the wandb module-level API.
wandb.log_artifact(external_raw_data_artifact)
wandb.log_artifact(external_dataset_artifact)

<Artifact drcat_data>

In [41]:
# Wrap the cleaned, merged DataFrame in a W&B table and log it under the
# key "Cleaned and merged data".
table = wandb.Table(dataframe=df)
wandb.log({"Cleaned and merged data": table})

In [42]:
# Close the W&B run; the captured output below shows an upload still in progress.
wandb.finish()

VBox(children=(Label(value='18.027 MB of 558.906 MB uploaded\r'), FloatProgress(value=0.032253918262510485, ma…