# Task 02: Exploratory Data Analysis (Part 4)

## Identifying "Noise" and Artifacts

### Load Dataset

In [8]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
import re
from collections import Counter 

# Locations of the processed PCL task-1 splits, relative to this notebook.
train_path = Path("../data/processed/pcl_task1_train.csv")
dev_path = Path("../data/processed/pcl_task1_dev.csv")

# Read both splits in one pass: train first, then dev.
train_df, dev_df = (pd.read_csv(csv_path) for csv_path in (train_path, dev_path))

### Duplicates

In [13]:
def check_duplicates(df, name):
    """Print how many repeated values the ``text`` and ``par_id`` columns hold.

    Args:
        df: DataFrame with ``text`` and ``par_id`` columns.
        name: Label printed as the first line of the report (e.g. "TRAIN").

    Returns:
        None. The report is written to stdout and ends with a blank line.
    """
    print(name)
    # (label, column) pairs, reported in this order.
    for label, column in (("duplicate texts:", "text"), ("duplicate par_ids:", "par_id")):
        # duplicated() flags every repeat after the first occurrence.
        print(label, df[column].duplicated().sum())
    print()

# Report within-split duplicates for each split in turn.
for split_name, split_df in (("TRAIN", train_df), ("DEV", dev_df)):
    check_duplicates(split_df, split_name)

# leakage: paragraphs whose exact text occurs in both splits
overlap = set(train_df["text"]) & set(dev_df["text"])
print("train/dev text overlap:", len(overlap))

TRAIN
duplicate texts: 0
duplicate par_ids: 0

DEV
duplicate texts: 0
duplicate par_ids: 0

train/dev text overlap: 0


### Split leakage

In [10]:
# Exact-match leakage check: paragraphs whose text appears verbatim in both splits.
train_texts = set(train_df["text"])
dev_texts = set(dev_df["text"])

overlap = train_texts & dev_texts
print("train/dev text overlap:", len(overlap))

# optional inspect
# Sort before slicing so the displayed sample is identical on every run: set
# iteration order for strings changes between interpreter sessions (hash
# randomisation). key=str keeps the sort safe if a non-string value such as NaN
# ever lands in the overlap.
sorted(overlap, key=str)[:5]

train/dev text overlap: 0


[]

### Special characters/HTML

In [11]:
# Regular expressions used by find_artifacts to flag paragraphs containing
# markup or formatting residue. A paragraph is counted once if ANY pattern
# matches anywhere in its text.
artifact_patterns = [
    r"&\w+;",     # named HTML entities such as &amp; &quot; (numeric ones like &#39; are not matched)
    r"\\n",       # a literal backslash followed by "n", i.e. an escaped newline left in the text
    r"\s{3,}",    # three or more consecutive whitespace characters
    r"http\S+",   # URLs: "http" followed by any run of non-whitespace
    r"\w+\.\w+\.\w+", # domain-like tokens: three word runs joined by dots (also hits e.g. "U.S.A" or "1.2.3")
]

def find_artifacts(texts, n=20, patterns=None):
    """Collect the texts that match at least one artifact pattern.

    Args:
        texts: Iterable of paragraph texts. Entries that are not ``str``
            (e.g. the float NaN pandas uses for an empty CSV cell, or None)
            are skipped instead of raising ``TypeError`` in the regex engine.
        n: Number of matching examples to return; applied as ``hits[:n]``.
        patterns: Optional iterable of regex strings or compiled patterns.
            Defaults to the module-level ``artifact_patterns``.

    Returns:
        Tuple ``(examples, total)``: the first ``n`` matching texts in input
        order, and the total number of matching texts.
    """
    if patterns is None:
        patterns = artifact_patterns
    # Compile once up front rather than resolving each pattern on every text.
    compiled = [re.compile(p) for p in patterns]

    hits = []
    for t in texts:
        if not isinstance(t, str):
            continue  # missing / non-text entry: nothing to scan
        # any() stops at the first matching pattern, so each text counts once.
        if any(rx.search(t) for rx in compiled):
            hits.append(t)
    return hits[:n], len(hits)

# Scan every training paragraph; keep the sample of hits and the overall count.
examples, total = find_artifacts(texts=train_df["text"])
print("artifact count:", total)

# Preview the first few flagged paragraphs.
examples[:5]

artifact count: 91


['People who are homeless , those who were once homeless , those working with the homeless and concerned New Zealanders are being asked to share their experiences and solutions to this growing issue with the Cross-Party Homelessness Inquiry . More&gt;&gt;',
 "The departures from London will barely put a dent in S&amp;P 's overall presence in Europe 's main financial centre . But Peterson warned Britain needed to provide clarity on key post-Brexit regulatory arrangements to ensure there is n't more upheaval .",
 "Another collective sale leads the region 's real estate headlines again today as Asia gets back to work after the western holiday season , with the owners of a housing development hoping to bring in S$355 million to be homeless . Meanwhile , Guangzhou R&amp;F is the latest mainland giant to report encouraging sales numbers and there 's much more if you just read on .",
 'Bank of America \'s biggest competitors do n\'t have specific policies on employment for DACA permit holders

- These artifacts (e.g. HTML entities such as `&amp;` and `&gt;`) will need to be normalised or stripped during data cleanup, before training.

### Outliers (length profiling)

- Most of the length profiling and visualisation is done in the basic statistical profiling notebook (EDA_stat_profiling.ipynb).

In [12]:
# Length features per paragraph: character count and whitespace-separated word count.
# NOTE: "tok_len" counts whitespace-split words, not model tokenizer tokens.
train_df["char_len"] = train_df["text"].str.len()
train_df["tok_len"] = train_df["text"].str.split().str.len()

print(train_df["tok_len"].describe(percentiles=[.01,.05,.95,.99]))

# A cell only renders its final expression automatically, so both tables are
# passed to display() explicitly; otherwise the shortest-paragraph table is
# computed and then discarded.
# inspect extreme short
display(train_df.nsmallest(5, "tok_len")[["tok_len","text"]])

# inspect extreme long
display(train_df.nlargest(5, "tok_len")[["tok_len","text"]])

count    8375.000000
mean       48.675224
std        29.677952
min         1.000000
1%          8.000000
5%         16.000000
50%        42.000000
95%       102.000000
99%       142.000000
max       909.000000
Name: tok_len, dtype: float64


Unnamed: 0,tok_len,text
7267,909,Dr Mayengbam Lalit Singh Recently honourable P...
6695,512,Most are from desperately poor Horn of Africa ...
5862,419,"Mahinda Wijesinghe , the Inspector General of ..."
8031,390,ANNUAL State of Education Report ( ASER ) laun...
285,266,The following is a brief history of the Rajnee...
