In [1]:
import pandas as pd

# Load the TOXIGEN dataset
file_path = "toxigen.csv"  # Update this with the actual file path
df = pd.read_csv(file_path)

# Display basic info
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

# Display first few rows
display(df.head())

# Check for missing values (per-column count of null cells)
print("\nMissing Values:\n", df.isnull().sum())

# Display unique values for key columns
# These are the low-cardinality columns that the later cleaning steps rely on.
for col in ["generation_method", "group", "prompt_label"]:
    print(f"\nUnique values in '{col}':\n", df[col].unique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250951 entries, 0 to 250950
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   prompt              250951 non-null  object 
 1   generation          250951 non-null  object 
 2   generation_method   250951 non-null  object 
 3   group               250951 non-null  object 
 4   prompt_label        250951 non-null  int64  
 5   roberta_prediction  250951 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 11.5+ MB
None


Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction
0,- the vast majority of african-americans are p...,"if youre white, you should be proud to be whit...",ALICE,black,1,0.001
1,- black people from the middle ages have alway...,the best thing that could have happened to the...,ALICE,black,1,0.044
2,- many african americans are so savage they lo...,theres no hope for the future because theres n...,ALICE,black,1,0.014
3,- the vast majority of african-americans are p...,"if youre white, you should be proud of it.",ALICE,black,1,0.002
4,"- whether a scientist or janitor, black people...","no matter how much money or power you have, if...",ALICE,black,1,0.328



Missing Values:
 prompt                0
generation            0
generation_method     0
group                 0
prompt_label          0
roberta_prediction    0
dtype: int64

Unique values in 'generation_method':
 ['ALICE' 'top-k']

Unique values in 'group':
 ['black' 'asian' 'native_american' 'muslim' 'latino' 'jewish' 'chinese'
 'lgbtq' 'mental_dis' 'physical_dis' 'mexican' 'women' 'middle_east']

Unique values in 'prompt_label':
 [1 0]


In [2]:
# 1. Handle missing values, then 2. normalize text fields, as one chain.
#    - rows lacking either 'generation' or 'prompt' are discarded first
#    - 'prompt' and 'generation' are trimmed and lower-cased
#    - 'group' is only trimmed (its case is left untouched)
df = df.dropna(subset=["generation", "prompt"]).assign(
    prompt=lambda frame: frame["prompt"].str.strip().str.lower(),
    generation=lambda frame: frame["generation"].str.strip().str.lower(),
    group=lambda frame: frame["group"].str.strip(),
)

# 3. Categorize toxicity levels based on 'roberta_prediction'
# 3. Categorize toxicity levels based on 'roberta_prediction'
def categorize_toxicity(prob):
    """Bucket a toxicity score into a coarse severity label.

    Args:
        prob: Numeric toxicity score. The thresholds below are written for a
            score on a 0-1 scale.

    Returns:
        "High" when prob >= 0.75, "Medium" when prob >= 0.5, "Low" when
        prob >= 0.25, and "Minimal" for anything lower. A missing score
        (None or NaN) yields None so that missing data stays missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing score would fall through every branch and be labelled "Minimal".
    if pd.isna(prob):
        return None
    if prob >= 0.75:
        return "High"
    elif prob >= 0.5:
        return "Medium"
    elif prob >= 0.25:
        return "Low"
    else:
        return "Minimal"

df["toxicity_level"] = df["roberta_prediction"].apply(categorize_toxicity)

# 4. Convert 'prompt_label' (0 -> Benign, 1 -> Toxic)
# Series.map(dict) turns every value that is not a dict key into NaN, so
# running this cell a second time (when the column already holds the names)
# would blank the whole column. Already-converted names are therefore passed
# through unchanged, and any value that is neither a known code nor a known
# name is reported instead of being silently replaced by NaN.
prompt_label_names = {0: "Benign", 1: "Toxic"}
recognized_labels = set(prompt_label_names) | set(prompt_label_names.values())
unexpected_labels = set(df["prompt_label"].dropna().unique()) - recognized_labels
if unexpected_labels:
    raise ValueError(
        f"Unexpected prompt_label values: {sorted(unexpected_labels, key=str)}"
    )
df["prompt_label"] = df["prompt_label"].map(
    lambda label: prompt_label_names.get(label, label)
)

# 5. Deduplicate data (full-row duplicates, evaluated after normalization)
df = df.drop_duplicates()

# 6. Convert categorical fields into a structured format for PostgreSQL
# The category dtype is an in-memory representation; a CSV export stores the
# labels as plain text.
categorical_columns = ["generation_method", "group", "prompt_label", "toxicity_level"]
df = df.astype({column: "category" for column in categorical_columns})

# 7. Display results after transformation
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
display(df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 250763 entries, 0 to 250950
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   prompt              250763 non-null  object  
 1   generation          250763 non-null  object  
 2   generation_method   250763 non-null  category
 3   group               250763 non-null  category
 4   prompt_label        250763 non-null  category
 5   roberta_prediction  250763 non-null  float64 
 6   toxicity_level      250763 non-null  category
dtypes: category(4), float64(1), object(2)
memory usage: 8.6+ MB
None


Unnamed: 0,prompt,generation,generation_method,group,prompt_label,roberta_prediction,toxicity_level
0,- the vast majority of african-americans are p...,"if youre white, you should be proud to be whit...",ALICE,black,Toxic,0.001,Minimal
1,- black people from the middle ages have alway...,the best thing that could have happened to the...,ALICE,black,Toxic,0.044,Minimal
2,- many african americans are so savage they lo...,theres no hope for the future because theres n...,ALICE,black,Toxic,0.014,Minimal
3,- the vast majority of african-americans are p...,"if youre white, you should be proud of it.",ALICE,black,Toxic,0.002,Minimal
4,"- whether a scientist or janitor, black people...","no matter how much money or power you have, if...",ALICE,black,Toxic,0.328,Low


In [3]:
# Persist the cleaned frame as CSV; the DataFrame index is not written out.
cleaned_csv_path = "toxigen_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

# Confirm where the file went.
print(f"Cleaned data saved as '{cleaned_csv_path}'")


Cleaned data saved as 'toxigen_cleaned.csv'
