# Generate smaller Datasets

In [24]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [25]:
df = pd.read_csv("../data/data.csv")
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
0,1,The World Coal Association disputed the conclu...,false_dilemma,https://www.nytimes.com/2018/10/07/climate/ipc...
1,1,Refusing to approve the document would place t...,appeal_to_emotion,https://www.nytimes.com/2018/10/07/climate/ipc...
2,1,At 3 6 degrees of warming the report predicts ...,faulty_generalization,https://www.nytimes.com/2018/10/07/climate/ipc...
3,1,Scribbler and Beckwith said the anomalies were...,faulty_generalization,https://www.independent.co.uk/news/science/cli...
4,1,Meanwhile Mr Beckwith confirmed the changes wo...,appeal_to_emotion,https://www.independent.co.uk/news/science/cli...


In [26]:
df.groupby("logical_fallacies").count()

Unnamed: 0_level_0,dataset,text,source
logical_fallacies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ad_hominem,1062,1062,87
appeal_to_authority,726,726,25
appeal_to_emotion,1619,1619,121
false_dilemma,941,941,45
faulty_generalization,1347,1347,120
none,5418,5418,89
slippery_slope,627,627,0


In [27]:
df_ss = df[df['logical_fallacies'] != 'faulty_generalization'] # remove 'faulty_generalization'

In [28]:
df_fg =  df[df['logical_fallacies'] != 'slippery_slope'] # remove 'slippery_slope'

In [32]:
df_ex3 =  df[df['dataset'] != 3] # remove dataset 3
df_ex3 =  df_ex3[df_ex3['logical_fallacies'] != 'slippery_slope'] # remove 'slippery_slope'

## Generate training set (5000 rows)

In [6]:
## With 'slippery_slope' and not 'faulty_generalization'


# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 5000
CLASSES = df_ss['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling
sampled_dfs_ss = []
for class_name in CLASSES:
    class_dfss = df_ss[df_ss['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_dfss))
    sampled_dfs_ss.append(class_dfss.sample(sample_size, random_state=42))

# Handle remaining samples
balanced_df_ss = pd.concat(sampled_dfs_ss)
remaining = TARGET_SIZE - len(balanced_df_ss)

if remaining > 0:
    extra_samples = df_ss[~df_ss.index.isin(balanced_df_ss.index)]
    balanced_df_ss = pd.concat([
        balanced_df_ss,
        extra_samples.sample(remaining, random_state=42)
    ])

# Final shuffle
balanced_df_ss = balanced_df_ss.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df_ss['logical_fallacies'].value_counts())

logical_fallacies
none                   1088
appeal_to_emotion       878
ad_hominem              844
false_dilemma           837
appeal_to_authority     726
slippery_slope          627
Name: count, dtype: int64


In [7]:
## With 'faulty_generalization' and not 'slippery_slope'


# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 5000
CLASSES = df_fg['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling
sampled_dfs_fg = []
for class_name in CLASSES:
    class_df_fg = df_fg[df_fg['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_df_fg))
    sampled_dfs_fg.append(class_df_fg.sample(sample_size, random_state=42))

# Handle remaining samples
balanced_df_fg = pd.concat(sampled_dfs_fg)
remaining = TARGET_SIZE - len(balanced_df_fg)

if remaining > 0:
    extra_samples = df[~df.index.isin(balanced_df_fg.index)]
    balanced_df_fg = pd.concat([
        balanced_df_fg,
        extra_samples.sample(remaining, random_state=42)
    ])

# Final shuffle
balanced_df_fg = balanced_df_fg.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df_fg['logical_fallacies'].value_counts())

logical_fallacies
none                     915
appeal_to_emotion        843
ad_hominem               837
false_dilemma            836
faulty_generalization    835
appeal_to_authority      726
slippery_slope             8
Name: count, dtype: int64


In [8]:
balanced_df_fg.to_csv("../data/data_training_fg.csv", index=False)
balanced_df_ss.to_csv("../data/data_training_ss.csv", index=False)

## Generate small dataset (2000 rows)

In [9]:
# Slipery slope
y = balanced_df_ss["logical_fallacies"]
X = balanced_df_ss[["text", "dataset", "source"]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.6, random_state=42, stratify=y)

df_balanced_ss_small = pd.concat([X_train, y_train], axis=1)
df_balanced_ss_small.head()

df_balanced_ss_small.to_csv("../data/data_ss_small.csv", index=False)

In [10]:
# faulty generalization
y = balanced_df_fg["logical_fallacies"]
X = balanced_df_fg[["text", "dataset", "source"]]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.6, random_state=42, stratify=y)

df_balanced_fg_small = pd.concat([X_train, y_train], axis=1)
df_balanced_fg_small.head()

df_balanced_fg_small.to_csv("../data/data_fg_small.csv", index=False)

In [37]:
## without dataset 3 and with 'faulty_generalization' and without 'slippery slope'

# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 2000
CLASSES = df_ex3['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling
sampled_df_ex3 = []
for class_name in CLASSES:
    class_dfss = df_ex3[df_ex3['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_dfss))
    sampled_df_ex3.append(class_dfss.sample(sample_size, random_state=42))

# Handle remaining samples
balanced_df_ex3_small = pd.concat(sampled_df_ex3)
remaining = TARGET_SIZE - len(balanced_df_ex3_small)

if remaining > 0:
    extra_samples = df_ex3[~df_ex3.index.isin(balanced_df_ex3_small.index)]
    balanced_df_ex3_small = pd.concat([
        balanced_df_ex3_small,
        extra_samples.sample(remaining, random_state=42)
    ])

# Final shuffle
balanced_df_ex3_small = balanced_df_ex3_small.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df_ex3_small['logical_fallacies'].value_counts())

balanced_df_ex3_small.to_csv("../data/data_ex3_small.csv", index=False)

logical_fallacies
none                     358
appeal_to_emotion        350
ad_hominem               342
faulty_generalization    336
false_dilemma            334
appeal_to_authority      280
Name: count, dtype: int64


In [11]:
# what used to be! 

# y = balanced_df["logical_fallacies"]
# X = balanced_df[["text", "dataset", "source"]]

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.6, random_state=42, stratify=y)

# df_balanced_small = pd.concat([X_train, y_train], axis=1)
# df_balanced_small.head()

# df_balanced_small.to_csv("../data/data_small.csv", index=False)

## Generate tiny dataset (100 rows)

In [12]:
# Slipery slope
number_of_rows = 100
RSEED = 42

df_ss_tiny = balanced_df_ss.sample(number_of_rows, axis=0, random_state=RSEED)
df_ss_tiny.to_csv("../data/data_ss_tiny.csv", index=False)

In [13]:
# faulty generalization
number_of_rows = 100
RSEED = 42

df_fg_tiny = balanced_df_fg.sample(number_of_rows, axis=0, random_state=RSEED)
df_fg_tiny.to_csv("../data/data_fg_tiny.csv", index=False)