In [3]:
import pandas as pd
import os

def mergeCsv(output_file, *input_files):
    """
    Concatenate several CSV files row-wise and write the result to one CSV.

    Input paths that do not exist are reported on stdout and skipped. When
    none of the inputs exist, nothing is written.

    Parameters:
        output_file (str): Destination path of the merged CSV file.
        *input_files (str): Paths of the CSV files to merge, in order.

    Returns:
        None
    """
    frames = []

    for path in input_files:
        # Skip (but report) anything that is not on disk
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        frames.append(pd.read_csv(path))

    # Without at least one readable input there is nothing to write
    if not frames:
        print("No valid files to merge.")
        return

    # ignore_index=True renumbers the rows 0..n-1 across all inputs
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output_file, index=False)
    print(f"Merged CSV saved as: {output_file}")


def countRow(input_file):
    """
    Counts the number of data rows in a CSV file and prints the result.

    The count is the length of the DataFrame returned by ``pd.read_csv``.
    If the file does not exist, a "File not found" message is printed
    instead.

    Parameters:
        input_file (str): Path to the input CSV file.

    Returns:
        int | None: Number of rows in the CSV file, or None when the file
        does not exist.
    """
    if not os.path.exists(input_file):
        print(f"File not found: {input_file}")
        return None

    df = pd.read_csv(input_file)
    row_count = len(df)
    print(f"Number of rows in {input_file}: {row_count}")
    # Return the count as documented, so callers can use it programmatically
    return row_count

def checkDuplicate(file_path):
    """
    Report how many rows of a CSV file share their 'Domain' value with
    at least one other row.

    Only a summary line is printed; the duplicate rows themselves are not
    listed.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    table = pd.read_csv(file_path)

    # keep=False flags every member of a duplicate group, not just the repeats
    dup_mask = table.duplicated(subset='Domain', keep=False)
    dup_total = int(dup_mask.sum())

    if dup_total == 0:
        print("No duplicates found based on the 'Domain' column.")
    else:
        print(f"Found {dup_total} duplicate rows based on the 'Domain' column:")

def removeDuplicate(file_path, output_file):
    """
    Write a copy of a CSV file in which each 'Domain' value appears once.

    For every group of rows sharing a 'Domain' value, only the first row
    encountered is kept. A missing input file is reported on stdout and no
    output is written.

    Parameters:
        file_path (str): Path to the input CSV file.
        output_file (str): Path to save the deduplicated CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    # Load, drop later repeats of each 'Domain', and write without the index
    (
        pd.read_csv(file_path)
        .drop_duplicates(subset='Domain', keep='first')
        .to_csv(output_file, index=False)
    )
    print(f"Duplicates removed. Deduplicated file saved as: {output_file}")


def load_and_print_all_columns(file_path):
    """
    Read a CSV file and print its column names as a Python list.

    A missing file is reported on stdout instead.

    Parameters:
        file_path (str): Path to the input CSV file.

    Returns:
        None
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    header = pd.read_csv(file_path).columns
    print(header.tolist())


In [4]:
# Row count, 'Domain' duplicate check and column listing for the merged dataset
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/merged_combined_dedup_final.csv")

Number of rows in ./dataset/merged_combined_dedup_final.csv: 96926
Found 4491 duplicate rows based on the 'Domain' column:
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [2]:
import pandas as pd
import os

# Source CSV and destination for its de-duplicated copy
input_file = "./dataset/merged_combined_dedup_final.csv"
output_file = "./dataset/duplicated3.csv"

if not os.path.exists(input_file):
    # Nothing to process: report the missing path and write no output
    print(f"File not found: {input_file}")
else:
    # Read the CSV, then keep only the first row seen for each 'Domain' value
    df = pd.read_csv(input_file)
    deduplicated_df = df.drop_duplicates(subset='Domain', keep='first')

    # Persist the result without the DataFrame index column
    deduplicated_df.to_csv(output_file, index=False)
    print(f"First occurrences saved to: {output_file}")

First occurrences saved to: ./dataset/duplicated3.csv


In [5]:
# Row count, 'Domain' duplicate check and column listing for the de-duplicated file
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/duplicated3.csv")

Number of rows in ./dataset/duplicated3.csv: 93953
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [6]:
import pandas as pd

# CSV whose 'Label' distribution is reported
file_path = "./dataset/duplicated3.csv"

if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    df = pd.read_csv(file_path)

    # Tally how many rows carry each distinct 'Label' value
    label_counts = df['Label'].value_counts()

    # One line per label, in the order value_counts() returns them
    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")

Label 0: 77451 occurrences
Label 2: 8662 occurrences
Label 1: 6618 occurrences
Label 3: 1222 occurrences


In [7]:
# Read the de-duplicated dataset
file_path = "./dataset/duplicated3.csv"
df = pd.read_csv(file_path)

# Split into separate DataFrames for each label
label0 = df[df['Label'] == 0]
label1 = df[df['Label'] == 1]
label2 = df[df['Label'] == 2]
label3 = df[df['Label'] == 3]

# Save Label1, Label2, Label3 into ./dataset/ -- the same directory the
# input lives in and the location the verification cells read from.
# Writing to the bare working directory would leave those cells looking at
# missing or stale files after a fresh "Restart & Run All".
label1.to_csv("./dataset/Label1.csv", index=False)
label2.to_csv("./dataset/Label2.csv", index=False)
label3.to_csv("./dataset/Label3.csv", index=False)

# Shuffle Label0 (fixed seed for reproducibility) and limit to 9,000 rows
# so the majority class is down-sampled
label0_shuffled = label0.sample(frac=1, random_state=42).reset_index(drop=True)
label0_sampled = label0_shuffled.head(9000)

# Save the sampled Label0 alongside the other per-label files
label0_sampled.to_csv("./dataset/Label0.csv", index=False)

print("Files created successfully!")

Files created successfully!


In [8]:
# Row count, 'Domain' duplicate check and column listing for the Label 0 sample
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/Label0.csv")

Number of rows in ./dataset/Label0.csv: 9000
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [9]:
# Row count, 'Domain' duplicate check and column listing for the Label 1 subset.
# All three helpers are fed the same path so the column listing cannot drift
# to a different file (it previously inspected Label0.csv by mistake).
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/Label1.csv")

Number of rows in ./dataset/Label1.csv: 6618
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [10]:
# Row count, 'Domain' duplicate check and column listing for the Label 2 subset
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/Label2.csv")

Number of rows in ./dataset/Label2.csv: 8662
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']


In [11]:
# Row count, 'Domain' duplicate check and column listing for the Label 3 subset
for _report in (countRow, checkDuplicate, load_and_print_all_columns):
    _report("./dataset/Label3.csv")

Number of rows in ./dataset/Label3.csv: 1222
No duplicates found based on the 'Domain' column.
['Domain', 'Content', 'Label', 'Classification', 'Reason', 'Confidence', 'Thought']
