# Generate smaller datasets

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [3]:
# Load the full labelled corpus and preview its first rows
df = pd.read_csv(filepath_or_buffer="../data/data.csv")
df.head(n=5)

Unnamed: 0,dataset,text,logical_fallacies,source
0,1,The World Coal Association disputed the conclu...,false_dilemma,https://www.nytimes.com/2018/10/07/climate/ipc...
1,1,Refusing to approve the document would place t...,appeal_to_emotion,https://www.nytimes.com/2018/10/07/climate/ipc...
2,1,At 3 6 degrees of warming the report predicts ...,faulty_generalization,https://www.nytimes.com/2018/10/07/climate/ipc...
3,1,Scribbler and Beckwith said the anomalies were...,faulty_generalization,https://www.independent.co.uk/news/science/cli...
4,1,Meanwhile Mr Beckwith confirmed the changes wo...,appeal_to_emotion,https://www.independent.co.uk/news/science/cli...


In [4]:
# Non-null entries per column for every fallacy label
rows_by_fallacy = df.groupby("logical_fallacies")
rows_by_fallacy.count()

Unnamed: 0_level_0,dataset,text,source
logical_fallacies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ad_hominem,1063,1063,87
appeal_to_authority,727,727,25
appeal_to_emotion,1619,1619,121
false_dilemma,941,941,45
faulty_generalization,1350,1350,120
none,5418,5418,89


## Generate training set (5000 rows)

In [5]:
# Build a training subset that is as class-balanced as the data allows.
# Each label contributes up to an equal share; whatever is still missing
# afterwards is topped up at random from the rows not yet selected.
TARGET_SIZE = 5000
CLASSES = df['logical_fallacies'].unique()

# Equal share per label (floor division, so a small remainder is expected)
target_per_class = TARGET_SIZE // len(CLASSES)

# Undersample every label to its share; small classes contribute all their rows
rows_per_label = (df[df['logical_fallacies'] == label] for label in CLASSES)
sampled_dfs = [
    label_rows.sample(min(target_per_class, len(label_rows)), random_state=42)
    for label_rows in rows_per_label
]
balanced_df = pd.concat(sampled_dfs)

# Top up to TARGET_SIZE from rows that were not drawn above
remaining = TARGET_SIZE - len(balanced_df)
if remaining > 0:
    already_drawn = df.index.isin(balanced_df.index)
    top_up = df[~already_drawn].sample(remaining, random_state=42)
    balanced_df = pd.concat([balanced_df, top_up])

# Shuffle once more so the labels are interleaved, then renumber the rows
balanced_df = balanced_df.sample(frac=1, random_state=42)
balanced_df = balanced_df.reset_index(drop=True)

# Show the resulting label distribution
print(balanced_df['logical_fallacies'].value_counts())

logical_fallacies
none                     912
faulty_generalization    850
appeal_to_emotion        839
ad_hominem               838
false_dilemma            834
appeal_to_authority      727
Name: count, dtype: int64


In [6]:
# Persist the balanced training subset without the row index
balanced_df.to_csv(path_or_buf="../data/data_training.csv", index=False)

### Balance not only the fallacies, but also the word length

In [7]:
# Length features: characters per text and whitespace-separated tokens per text
df['text_char_length'] = df['text'].map(len)
text_tokens = df['text'].str.split()
df['text_word_length'] = text_tokens.str.len()

In [8]:
# Average word count per fallacy label across the whole corpus
word_length_by_fallacy = df.groupby("logical_fallacies")["text_word_length"]
word_length_by_fallacy.mean()

logical_fallacies
ad_hominem               22.436500
appeal_to_authority      53.354883
appeal_to_emotion        20.291538
false_dilemma            44.621679
faulty_generalization    38.643704
none                     45.946290
Name: text_word_length, dtype: float64

In [9]:
# Summary statistics of the numeric columns, including the new length features
corpus_stats = df.describe()
corpus_stats

Unnamed: 0,dataset,text_char_length,text_word_length
count,11118.0,11118.0,11118.0
mean,4.304731,215.560443,39.448282
std,2.392147,167.186961,30.738583
min,1.0,2.0,1.0
25%,3.0,87.0,16.0
50%,3.0,184.0,33.0
75%,4.0,289.0,54.0
max,9.0,1385.0,257.0


In [10]:
# Keep only texts whose word count lies strictly between 16 and 54 -- the 25%
# and 75% values that df.describe() reports for text_word_length
word_counts = df["text_word_length"]
df_test = df[(word_counts > 16) & (word_counts < 54)]
df_test.describe()

Unnamed: 0,dataset,text_char_length,text_word_length
count,5453.0,5453.0,5453.0
mean,4.157528,185.653402,33.690262
std,2.216364,61.10045,10.807085
min,1.0,62.0,17.0
25%,3.0,132.0,24.0
50%,4.0,184.0,33.0
75%,4.0,238.0,43.0
max,9.0,353.0,53.0


In [11]:
# Average word count per fallacy label after the length filter
filtered_word_length = df_test.groupby("logical_fallacies")["text_word_length"]
filtered_word_length.mean()

logical_fallacies
ad_hominem               30.462428
appeal_to_authority      34.618076
appeal_to_emotion        30.155007
false_dilemma            35.635910
faulty_generalization    31.707353
none                     35.309241
Name: text_word_length, dtype: float64

In [12]:
# Rows left per fallacy label after the length filter
filtered_labels = df_test["logical_fallacies"]
filtered_labels.value_counts()

logical_fallacies
none                     2781
appeal_to_emotion         729
faulty_generalization     680
ad_hominem                519
false_dilemma             401
appeal_to_authority       343
Name: count, dtype: int64

In [13]:
# Pick fallacies randomly, but as balanced as possible, drawing ONLY from the
# length-filtered frame `df_test` so every selected row respects the
# word-length bounds.
# Configuration parameters
length_pool = df_test.copy()

TARGET_SIZE = 2000
CLASSES = length_pool['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling
sampled_dfs = []
for class_name in CLASSES:
    class_df = length_pool[length_pool['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_df))
    sampled_dfs.append(class_df.sample(sample_size, random_state=42))

# Handle remaining samples (integer division leaves a small remainder)
df_balanced_length = pd.concat(sampled_dfs)
remaining = TARGET_SIZE - len(df_balanced_length)

if remaining > 0:
    # Top up from the unused rows of the length-filtered pool. Taking them from
    # the unfiltered `df` would let texts outside the length bounds slip into
    # a dataset that is supposed to be length-balanced.
    extra_samples = length_pool[~length_pool.index.isin(df_balanced_length.index)]
    df_balanced_length = pd.concat([
        df_balanced_length,
        extra_samples.sample(remaining, random_state=42)
    ])

# Final shuffle
df_balanced_length = df_balanced_length.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(df_balanced_length['logical_fallacies'].value_counts())

logical_fallacies
appeal_to_emotion        334
false_dilemma            334
appeal_to_authority      333
none                     333
faulty_generalization    333
ad_hominem               333
Name: count, dtype: int64


In [14]:
# Average word count per fallacy label in the length-balanced sample
balanced_word_length = df_balanced_length.groupby("logical_fallacies")["text_word_length"]
balanced_word_length.mean()

logical_fallacies
ad_hominem               30.276276
appeal_to_authority      34.777778
appeal_to_emotion        30.326347
false_dilemma            35.589820
faulty_generalization    31.135135
none                     36.231231
Name: text_word_length, dtype: float64

In [15]:
# Persist the length-balanced sample. index=False keeps the freshly reset
# row index out of the file, so reloading it does not yield a spurious
# "Unnamed: 0" column.
df_balanced_length.to_csv("../data/data_length_balanced.csv", index=False)

## Generate small dataset (2000 rows)

In [54]:
# Draw a stratified subsample of the balanced training frame.
# train_test_split serves purely as a stratified sampler here: with
# test_size=0.6 the "train" part keeps the remaining 40% of the rows while
# preserving the label proportions of `y`. The "test" part is discarded.
y = balanced_df["logical_fallacies"]
X = balanced_df[["text", "dataset", "source"]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.6, random_state=42, stratify=y)

# Re-attach the labels to the selected rows (aligned on the shared index)
df_balanced_small = pd.concat([X_train, y_train], axis=1)

# index=False: the index only holds shuffled row positions of balanced_df,
# which would otherwise be written as an extra unnamed column.
df_balanced_small.to_csv("../data/data_small.csv", index=False)

# Preview last so the notebook actually renders it
df_balanced_small.head()

## Generate tiny dataset (100 rows)

In [53]:
# Tiny smoke-test dataset: a fixed-seed random draw from the balanced frame
RSEED = 42
number_of_rows = 100

df_tiny = balanced_df.sample(n=number_of_rows, random_state=RSEED)
df_tiny.to_csv(path_or_buf="../data/data_tiny.csv", index=False)