## Imports

In [1]:
import csv
import json
from builtins import len

import pandas as pd
import requests


## Download file

In [40]:
# Source dataset (raw CSV on GitHub) and the local path it is saved to.
url = 'https://raw.githubusercontent.com/tmakesense/logical-fallacy/main/dataset-fixed/edu_all_fixed.csv'
filename = 'edu_all_fixed.csv'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of saving the error body as the CSV.
response.raise_for_status()
with open(filename, 'wb') as file:
    file.write(response.content)
    

In [41]:
def get_line_count(file_path, encoding='utf-8'):
    """Return the number of physical lines in a text file.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file; bytes that cannot be
            decoded are skipped (``errors='ignore'``) rather than raising.

    Returns:
        The number of lines yielded when iterating over the file. For a CSV
        this counts the header line and every line of a multi-line field.
    """
    total = 0
    with open(file_path, 'r', encoding=encoding, errors='ignore') as handle:
        for _ in handle:
            total += 1
    return total

In [42]:
# Count the physical lines of the downloaded CSV and report the total.
line_count = get_line_count('edu_all_fixed.csv')
print(f"Line count: {line_count}")

Line count: 2226


## Transform file

In [43]:
# Load the downloaded dataset.
df = pd.read_csv('edu_all_fixed.csv')

# Keep every row whose label is not 'miscellaneous', then order the rest by label.
df_filtered = df.loc[df['updated_label'].ne('miscellaneous')]
sorted_df = df_filtered.sort_values('updated_label')
sorted_df.to_csv('edu_all_fixed_transformed.csv', index=False)

# Report the physical line count of the file just written (header line included).
line_count = get_line_count('edu_all_fixed_transformed.csv')
print(f"Line count: {line_count}")

Line count: 2223


In [44]:
df = pd.read_csv('edu_all_fixed_transformed.csv')

# Be explicit about the encoding instead of relying on the platform default.
with open('mapping.json', encoding='utf-8') as file:
    mapping = json.load(file)

# Series.map() turns every label that is absent from the mapping into NaN
# without complaint, so check coverage first. Labels that are already missing
# in the input are skipped by dropna() and stay missing.
unmapped = [label for label in df['updated_label'].dropna().unique() if label not in mapping]
if unmapped:
    raise ValueError(f"Labels missing from mapping.json: {unmapped}")

df['updated_label'] = df['updated_label'].map(mapping)
# csv.QUOTE_ALL is the named form of the literal 1 used before: quote every field.
df.to_csv('edu_all_fixed_transformed_mapped.csv', index=False, quoting=csv.QUOTE_ALL)


In [45]:
def split_and_save_by_category(input_file_path, output_file_path1, output_file_path2, encoding='utf-8'):
    """Split a CSV into two files, dividing each 'updated_label' group in half.

    For every distinct value of the 'updated_label' column, taken in order of
    first appearance, the first ``len(group) // 2`` rows go to the first file
    and the remaining rows go to the second. A group with an odd number of
    rows therefore puts its extra row in the second file. Rows whose label is
    missing form one more group instead of being dropped.

    Args:
        input_file_path: CSV to read; must contain an 'updated_label' column.
        output_file_path1: Destination CSV for the first halves.
        output_file_path2: Destination CSV for the second halves.
        encoding: Text encoding used for reading and for both outputs.

    Both outputs are written without the index and with every field quoted.
    """
    df = pd.read_csv(input_file_path, encoding=encoding)
    labels = df['updated_label']

    first_parts = []
    second_parts = []
    for category in labels.unique():
        # NaN never compares equal to anything, so a missing label needs isna().
        mask = labels.isna() if pd.isna(category) else labels == category
        category_df = df[mask]
        half_size = len(category_df) // 2

        # A single-row group has an empty first half; skip it so that only
        # non-empty frames are ever concatenated.
        if half_size:
            first_parts.append(category_df.iloc[:half_size])
        second_parts.append(category_df.iloc[half_size:])

    # Concatenate once instead of growing a frame inside the loop. With no
    # parts at all, fall back to an empty frame that still carries the columns
    # so the output file gets its header.
    df1 = pd.concat(first_parts) if first_parts else df.iloc[0:0]
    df2 = pd.concat(second_parts) if second_parts else df.iloc[0:0]

    df1.to_csv(output_file_path1, index=False, encoding=encoding, quoting=csv.QUOTE_ALL)
    df2.to_csv(output_file_path2, index=False, encoding=encoding, quoting=csv.QUOTE_ALL)

In [46]:
# Split the mapped dataset into two files, halving every label group.
split_and_save_by_category('edu_all_fixed_transformed_mapped.csv', 'edu_all_fixed_transformed_1st_half.csv', 'edu_all_fixed_transformed_2nd_half.csv')

# Physical line count of the first-half file (header line included).
line_count = get_line_count('edu_all_fixed_transformed_1st_half.csv')
print(f"Line count of 1st half: {line_count}")

# Physical line count of the second-half file (header line included).
line_count = get_line_count('edu_all_fixed_transformed_2nd_half.csv')
print(f"Line count of 2nd half: {line_count}")

Line count of 1st half: 1109
Line count of 2nd half: 1115


In [47]:
# read_csv already consumes the header line as the column names. Passing
# skiprows=1 on top of that would throw the header away and promote the first
# data row of each file to column names, leaving that row out of the check.
df1 = pd.read_csv('edu_all_fixed_transformed_1st_half.csv')
df2 = pd.read_csv('edu_all_fixed_transformed_2nd_half.csv')

# Represent every row as a tuple of strings. Unlike a ','-joined string, a
# tuple keeps field boundaries intact when a field itself contains a comma.
lines1 = list(df1.astype(str).itertuples(index=False, name=None))
lines2 = list(df2.astype(str).itertuples(index=False, name=None))

# The two halves must not share any data row.
all_different = set(lines1).isdisjoint(lines2)

# Print the result
if all_different:
    print("All lines except the first one are different between the two files.")
else:
    print("There are some common lines between the two files.")
    

All lines except the first one are different between the two files.


## Output

In [7]:
# Use distinct names: binding these frames to `input` would shadow the
# built-in input() for the rest of the kernel session.
output_df = pd.read_csv('output_new.csv')
input_df = pd.read_csv('input.csv')

# The replacement text must be exactly one column with one value per input
# row. Without these checks, index alignment would fill missing rows with NaN
# or drop surplus rows without any warning.
if output_df.shape[1] != 1:
    raise ValueError(
        f"output_new.csv must have exactly one column, found {output_df.shape[1]}"
    )
if len(output_df) != len(input_df):
    raise ValueError(
        f"Row count mismatch: output_new.csv has {len(output_df)} rows, "
        f"input.csv has {len(input_df)}"
    )

input_df['source_article'] = output_df.iloc[:, 0]
input_df.to_csv('output_full.csv', index=False)


In [8]:
# Reload the merged file and print how many data rows it holds.
df = pd.read_csv('output_full.csv')
print(len(df))

1108


## Simple sentences

In [2]:
import pandas as pd

In [5]:
# Load the rows that will be filtered by article length.
df = pd.read_csv('input.csv')

def remove_excessive_lines(df, max_words=30):
    """Keep only the rows whose 'source_article' has at most ``max_words`` words.

    Words are whitespace-separated tokens. A missing article counts as zero
    words and is kept; a non-string value is measured on its string form.
    Calling ``.split()`` directly on such values would raise AttributeError.

    Args:
        df: DataFrame with a 'source_article' column.
        max_words: Largest word count a row may have and still be kept.

    Returns:
        The rows of ``df`` that pass the filter, in their original order.
    """
    def _word_count(text):
        # Missing values (None / NaN) have no words at all.
        if pd.isna(text):
            return 0
        return len(str(text).split())

    word_counts = df['source_article'].apply(_word_count)
    df_filtered = df[word_counts <= max_words]
    return df_filtered

# Apply the filter with its default limit of 30 words and save the remaining rows.
df_reduced = remove_excessive_lines(df)
df_reduced.to_csv('output_reduced.csv', index=False)
