In [1]:
import pandas as pd
import numpy as np

# Load the feature table saved as a pickle (described upstream as the same dataset after cleaning).
# pickle can execute arbitrary code on load, so this file must come from a trusted source.
FEATURES_PICKLE = "df_with_features.pickle"
df_features = pd.read_pickle(FEATURES_PICKLE)

In [3]:
# Flag IQR-based outliers for each numeric feature, then rebuild df_features
# as the selected columns plus one boolean "<feature>_outlier" column per feature
# and a combined "outlier" column.
id_text_cols = ["pub_number", "fr", "en"]
numeric_cols = [
    "similarity",
    "len_ratio",
    "verb_ratio",
    "noun_ratio",
    "entity_ratio",
    "clause_ratio",
]
cols = id_text_cols + numeric_cols

numeric_df = df_features[numeric_cols]

# Per-column quartiles and interquartile range.
Q1, Q3 = numeric_df.quantile(0.25), numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
below_fence = numeric_df < lower_fence
above_fence = numeric_df > upper_fence

outlier_mask = below_fence | above_fence
# similarity is flagged on the low side only: values above the upper fence
# are not counted as outliers for this column.
outlier_mask["similarity"] = below_fence["similarity"]

outlier_flags = outlier_mask.add_suffix("_outlier")

# A row is an outlier as soon as any single feature flag is set.
df_features = pd.concat([df_features[cols], outlier_flags], axis=1).assign(
    outlier=outlier_flags.any(axis=1)
)

In [4]:
IQR

similarity      0.117000
len_ratio       0.229047
verb_ratio      0.500000
noun_ratio      0.388889
entity_ratio    1.000000
clause_ratio    0.500000
dtype: float64

In [5]:
# Fraction of rows that remain once every flagged outlier is removed
# (a share between 0 and 1, not a row count).
kept_rows = df_features.loc[~df_features["outlier"]]
len(kept_rows) / len(df_features)

0.722734806168809

In [6]:
# Split the table on the combined flag into two independent copies:
# rows with no flag set (df_clean) and rows with at least one flag set (df_outliers).
is_outlier = df_features["outlier"]
df_clean = df_features.loc[~is_outlier].copy()
df_outliers = df_features.loc[is_outlier].copy()

In [7]:
# Show one randomly drawn outlier row, transposed so each field is on its own line.
# No random_state is passed, so a different row may be drawn on every run.
df_outliers.sample(n=1).transpose()

Unnamed: 0,548710
pub_number,SAR 2014/057
fr,Les montaisons de petits saumons pour la régio...
en,Small salmon returns for Gulf Region in 2012 a...
similarity,0.819
len_ratio,1.149171
verb_ratio,1.5
noun_ratio,1.142857
entity_ratio,8.0
clause_ratio,1.333333
similarity_outlier,False


In [8]:
# Original note: entity_ratio_outlier is probably bad.
# NOTE(review): presumably this means the entity_ratio flag is an unreliable
# criterion for rows like this one — confirm the intended meaning.
example_label = 366969
df_outliers.loc[example_label]

pub_number                                                   RES 2022/027
fr                      De 2018 à 2021, plus de 80 de la biomasse cumu...
en                      From 2018 to 2021, more than 80 of the cumulat...
similarity                                                          0.972
len_ratio                                                        1.010309
verb_ratio                                                            1.0
noun_ratio                                                       1.285714
entity_ratio                                                          4.0
clause_ratio                                                          2.0
similarity_outlier                                                  False
len_ratio_outlier                                                   False
verb_ratio_outlier                                                  False
noun_ratio_outlier                                                  False
entity_ratio_outlier                  

In [9]:
# Original note: this one is bad, but it's at 5.
# NOTE(review): "at 5" presumably refers to this row's entity_ratio value — confirm.
bad_example_label = 498816
df_outliers.loc[bad_example_label]

pub_number                                                   SAR 2010/020
fr                      Sources d incertitude Il existe de l incertitu...
en                      Newfoundland and Labrador Region 2HJ3KLNOPs4R ...
similarity                                                          0.818
len_ratio                                                        1.296875
verb_ratio                                                            1.0
noun_ratio                                                          1.125
entity_ratio                                                          5.0
clause_ratio                                                          1.0
similarity_outlier                                                  False
len_ratio_outlier                                                   False
verb_ratio_outlier                                                  False
noun_ratio_outlier                                                  False
entity_ratio_outlier                  

In [10]:
# Show one randomly drawn row whose entity_ratio flag is set, transposed for readability.
# No random_state is passed, so a different row may be drawn on every run.
entity_flagged = df_outliers.loc[df_outliers["entity_ratio_outlier"]]
entity_flagged.sample(n=1).transpose()

Unnamed: 0,574405
pub_number,SAR 2017/033
fr,Saison Permis TAC (t) Débarquements (t) CPUE (...
en,Season Licenses TAC (t) Landings (t) CPUE (kg ...
similarity,0.781
len_ratio,1.101205
verb_ratio,3.0
noun_ratio,4.636364
entity_ratio,8.666667
clause_ratio,7.5
similarity_outlier,False
