In [5]:
import pandas as pd
import os

def clean_text(text):
    """Return ``text`` with every non-ASCII character removed.

    Values that are not ``str`` instances are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    # Encoding with the 'ignore' error handler drops any character that has
    # no ASCII representation; decoding then yields a plain ASCII str.
    ascii_bytes = text.encode('ascii', 'ignore')
    return ascii_bytes.decode('ascii')

def remove_quotes(text):
    """Strip all leading and trailing double-quote characters from ``text``.

    Quotes in the middle of the string are kept. Values that are not ``str``
    instances are returned unchanged.
    """
    return text.strip('"') if isinstance(text, str) else text

def combine_csv_files(folder_path, output_file):
    """Merge the True-labelled rows of every CSV in a folder into one CSV.

    For each ``.csv`` file directly inside ``folder_path`` only the rows whose
    ``Label`` equals ``True`` are kept. The ``Statement`` column of those rows
    is passed through ``clean_text`` and then ``remove_quotes``. Files that
    lack either column are skipped with a printed warning.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.csv`` files.
        output_file: Path of the combined CSV to write (without the index).

    Raises:
        ValueError: If no file in ``folder_path`` could be used.
    """
    required_columns = ('Label', 'Statement')
    output_abspath = os.path.abspath(output_file)
    dfs = []

    # Sorted so the row order of the output does not depend on the arbitrary
    # order in which os.listdir returns directory entries.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        # If an earlier run wrote its output into the input folder, do not
        # feed that output back in (it would duplicate every row).
        if os.path.abspath(file_path) == output_abspath:
            continue

        df = pd.read_csv(file_path)

        # Validate the schema before touching any column; indexing a missing
        # column would otherwise abort the whole run with a KeyError.
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            print(f"Warning: File '{filename}' is missing column(s) {missing}; skipped.")
            continue

        # Keep only rows labelled True. `.copy()` makes the result an
        # independent frame so the assignment below cannot raise
        # SettingWithCopyWarning.
        df = df.loc[df['Label'] == True].copy()  # noqa: E712 (element-wise compare)

        # Drop non-ASCII characters, then surrounding double quotes.
        df['Statement'] = df['Statement'].apply(clean_text).apply(remove_quotes)

        dfs.append(df)

    if not dfs:
        # pd.concat([]) would raise a ValueError that does not say why.
        raise ValueError(
            f"No CSV files with columns {list(required_columns)} found in '{folder_path}'."
        )

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv(output_file, index=False)

    print(f"Combined DataFrame saved to '{output_file}'.")

# Directory that holds the source CSV files (edit to match your data).
folder_path = 'yo'
# Name (and optionally path) of the merged CSV that will be written.
output_file = 'output_file_true10.csv'

combine_csv_files(folder_path=folder_path, output_file=output_file)


Combined DataFrame saved to 'output_file_true10.csv'.


In [1]:
import pandas as pd
import os

def split_csv(input_file, output_folder, chunk_size=400):
    """Split a CSV file into numbered files of at most ``chunk_size`` rows.

    The chunks are written to ``output_folder`` as ``chunk_1.csv``,
    ``chunk_2.csv``, ... without the index column; every chunk repeats the
    header row. An input with no data rows produces no chunk files.

    Args:
        input_file: Path of the CSV file to split.
        output_folder: Directory for the chunk files; created if missing.
        chunk_size: Maximum number of data rows per chunk (default 400).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero size would divide by zero below and a negative one would silently
    # write nothing, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_csv(input_file)

    # Ceiling division: a final partial chunk still gets its own file.
    num_chunks = (len(df) + chunk_size - 1) // chunk_size

    for i in range(num_chunks):
        # .iloc makes it explicit that rows are selected by position.
        chunk = df.iloc[i * chunk_size:(i + 1) * chunk_size]
        chunk.to_csv(os.path.join(output_folder, f'chunk_{i+1}.csv'), index=False)

    print(f'{num_chunks} files have been created in "{output_folder}".')

# CSV file that will be split into chunks.
input_file = 'output_file_true7.csv'

# Directory that receives the chunk files.
output_folder = 'ariel'

# Split using the default chunk size.
split_csv(input_file=input_file, output_folder=output_folder)

5 files have been created in "ariel".


In [6]:
# Combine JSONL files from a folder
import glob
import os


def combine_jsonl_files(input_directory, output_file_path):
    """Concatenate every ``*.jsonl`` file in a directory into one JSONL file.

    Files are processed in sorted path order and copied line by line. A
    newline is appended to any line that lacks one, so the last record of one
    file is never fused with the first record of the next.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.jsonl``
            files; a trailing slash is optional.
        output_file_path: Path of the combined file; overwritten if it exists.
    """
    pattern = os.path.join(input_directory, "*.jsonl")
    output_abspath = os.path.abspath(output_file_path)

    # Resolve the inputs before the output is opened, and leave the output
    # itself out in case it lives in the input directory.
    source_paths = [
        path for path in sorted(glob.glob(pattern))
        if os.path.abspath(path) != output_abspath
    ]

    # Explicit UTF-8 keeps the result independent of the platform default.
    with open(output_file_path, 'w', encoding='utf-8') as combined:
        for source_path in source_paths:
            with open(source_path, 'r', encoding='utf-8') as source:
                for line in source:
                    # Only the final line of a file can lack its newline.
                    combined.write(line if line.endswith('\n') else line + '\n')


# Define the input directory containing JSONL files
input_directory = "datasets_true/"

# Define the output file path for the combined JSONL file
output_file_path = "all_true_data2.jsonl"

combine_jsonl_files(input_directory, output_file_path)
