In [None]:
import pandas as pd
import glob
import os
import torch


# --- Configuration & Hyperparameters ---
# Path of the model directory, the folder scanned for input CSVs, and the
# file the merged dataset is written to.
MODEL_NAME = './local-bert-base-uncased'
DATASET_DIRECTORY = './datasets/'
OUTPUT_FILENAME = 'cleaned_spam_dataset.csv'


# --- Device Setup ---
# Pick CUDA when torch reports it as available, otherwise fall back to CPU,
# then report which device was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

def load_and_standardize_datasets(path, output_path, random_state=None):
    """
    Load every CSV in a directory, merge them into one labelled dataset, and save it.

    Each file is read with latin-1 encoding and reduced to a 'label' and a
    'text' column. Three file names have a fixed column layout (see
    ``known_layouts`` below); any other file is accepted if it has 'v1' and
    'v2' columns. Labels are normalized to 0 (ham) / 1 (spam); rows whose
    label is not recognized, or that contain a missing value, are dropped.
    The per-file frames are concatenated, de-duplicated on the text column,
    shuffled, summarized on stdout, renamed to 'v1'/'v2', and written to
    ``output_path``.

    Args:
        path: Directory searched (non-recursively) for ``*.csv`` files.
        output_path: Destination path of the combined CSV.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            shuffle non-deterministic.

    Returns:
        The combined DataFrame with columns 'v1' (integer label) and
        'v2' (text). It is returned even if writing the CSV failed.

    Raises:
        SystemExit: If the directory holds no CSV files, or no file yielded
            any usable frame.
    """
    # Lowercased file name -> (label column, text column) expected in that file.
    known_layouts = {
        'data-augmented.csv': ('labels', 'text'),
        'sms_spam.csv': ('label', 'sms'),
        'sms_text.csv': ('label', 'data'),
    }
    # '0.0' / '1.0' cover numeric label columns that pandas parsed as float
    # (which happens whenever the column contains an empty cell).
    label_map = {
        'ham': 0, 'spam': 1, '0': 0, '1': 1,
        '0.0': 0, '1.0': 1,
        'normal': 0, 'legitimate': 0
    }

    # Sorted so that the row kept by drop_duplicates() below does not depend
    # on the platform-specific order in which glob returns files.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        print(f"Error: No CSV files were found in the directory '{path}'.")
        raise SystemExit(1)

    print(f"Found {len(all_files)} dataset files to process...")

    df_list = []

    for filename in all_files:
        display_name = os.path.basename(filename)
        try:
            df = pd.read_csv(filename, encoding='latin-1')

            layout = known_layouts.get(display_name.lower())
            if layout is not None:
                label_col, text_col = layout
                if label_col not in df.columns or text_col not in df.columns:
                    print(f"--> Skipping file: '{display_name}'. Required columns '{label_col}' and '{text_col}' not found.")
                    continue
            elif 'v1' in df.columns and 'v2' in df.columns:
                label_col, text_col = 'v1', 'v2'
            else:
                print(f"--> Skipping file: '{display_name}'. Did not match any known file formats.")
                continue

            standard_df = pd.DataFrame()
            # Strip and lowercase so ' Ham' / 'SPAM ' still hit the map;
            # anything not in the map becomes NaN and is dropped next.
            standard_df['label'] = (
                df[label_col].astype(str).str.strip().str.lower().map(label_map)
            )
            standard_df['text'] = df[text_col]
            standard_df.dropna(inplace=True)
            standard_df['label'] = standard_df['label'].astype(int)

            df_list.append(standard_df)
            print(f"--> Successfully loaded and processed '{display_name}', adding {len(standard_df)} rows.")

        except Exception as e:
            # Best effort: one unreadable file should not abort the whole merge.
            print(f"--> Error processing file '{display_name}': {e}")

    if not df_list:
        print("\nError: No data could be loaded from any of the files. Exiting.")
        raise SystemExit(1)

    master_df = pd.concat(df_list, ignore_index=True)
    print(f"\nTotal combined rows: {len(master_df):,}")

    # Keeps the first occurrence of each text; later copies are discarded
    # even if they carry a different label.
    master_df.drop_duplicates(subset=['text'], inplace=True)
    print(f"Rows after removing duplicate text entries: {len(master_df):,}")

    master_df = master_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print("Final dataset shuffled.")

    # Class distribution is computed while the column is still named 'label'.
    print("\n--- Final Class Distribution ---")
    class_counts = master_df['label'].value_counts()
    ham_count = class_counts.get(0, 0)
    spam_count = class_counts.get(1, 0)
    print(f"Ham (0):  {ham_count:,}")
    print(f"Spam (1): {spam_count:,}")
    print("--------------------------------\n")

    # Rename columns to v1 and v2 for the final output file
    master_df.rename(columns={'label': 'v1', 'text': 'v2'}, inplace=True)

    try:
        master_df.to_csv(output_path, index=False)
        print(f"Successfully saved the cleaned dataset to '{output_path}' with columns v1 and v2.")
    except Exception as e:
        # A failed write is reported but does not discard the in-memory result.
        print(f"\nError: Could not save the file. Reason: {e}")
    return master_df


# Build the merged dataset: the call cleans the inputs, prints the class
# counts, writes the CSV, and hands back the resulting DataFrame.
df = load_and_standardize_datasets(
    path=DATASET_DIRECTORY,
    output_path=OUTPUT_FILENAME,
)


# Preview the first ten rows of the returned frame.
print("\n--- Final DataFrame Head (with v1, v2 columns) ---")
print(df.head(n=10))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060
Found 4 dataset files to process...
--> Successfully loaded and processed 'data-augmented.csv', adding 5572 rows.
--> Successfully loaded and processed 'sms_spam.csv', adding 5574 rows.
--> Successfully loaded and processed 'SMS_Text.csv', adding 5574 rows.
--> Successfully loaded and processed 'spam_sms.csv', adding 5572 rows.

Total combined rows: 22,292
Rows after removing duplicate text entries: 16,205
Final dataset shuffled.

--- Final Class Distribution ---
Ham (0):  14,016
Spam (1): 2,189
--------------------------------

Successfully saved the cleaned dataset to 'cleaned_spam_dataset_v1_v2.csv' with columns v1 and v2.

--- Final DataFrame Head (with v1, v2 columns) ---
   v1                                                 v2
0   0  ['Theyre doing it to lots of places. Only hosp...
1   0  Buzz! Hey, my Love ! I think of you and hope y...
2   0  ['NO GIFTS!! You trying to get me to throw mys...
3   0  No d