In [3]:
import pandas as pd
import numpy as np
import random
import string

def inject_errors(df,
                  missing_rate=0.05,
                  duplicate_rate=0.1,
                  case_error_rate=0.05,
                  swap_rate=0.03,
                  typo_rate=0.02,
                  seed=None,
                  uppercase_columns=True):
    """
    Return a corrupted copy of ``df`` with randomly injected data-quality errors.

    The input frame is copied first and is never modified. Corruptions are
    applied in a fixed order: column-name uppercasing, missing values,
    duplicated rows, case flips, column swaps, typos.

    All ``*_rate`` arguments are fractions in the range 0-1 (0.05 means 5%),
    not percentages.

    Parameters:
        df: DataFrame to corrupt.
        missing_rate: fraction of cells to overwrite with NaN. Cells are drawn
            with replacement, so the same cell may be picked more than once.
        duplicate_rate: fraction of rows (sampled with replacement) that are
            appended again at the bottom of the frame.
        case_error_rate: probability that a non-null cell of an object-dtype
            column is selected for a case flip. Selected strings that are
            entirely uppercase become lowercase; any other string becomes
            uppercase. Non-string values are left alone.
        swap_rate: fraction of rows (counted after duplication) in which the
            values of two randomly chosen columns are exchanged. Skipped when
            the frame has fewer than two columns.
        typo_rate: probability that a non-null cell of an object-dtype column
            is selected for a typo. Selected strings longer than two
            characters get one character replaced by a random ASCII letter
            (which may happen to equal the original character).
        seed: random seed for reproducibility. When given, the global
            ``random`` and ``numpy.random`` generators are re-seeded.
        uppercase_columns: convert string column names to uppercase;
            non-string labels are kept as they are.

    Returns:
        Corrupted DataFrame. If at least one row was duplicated the index is
        reset to 0..n-1; otherwise the original index is kept.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    df = df.copy()

    # Uppercase column names. Only str labels have .upper(); integer or other
    # labels are passed through instead of raising AttributeError.
    if uppercase_columns:
        df.columns = [col.upper() if isinstance(col, str) else col
                      for col in df.columns]

    # 1. Add missing values
    # NOTE(review): writing NaN (here) or a swapped value (step 4) into a
    # column of another dtype relies on pandas upcasting that column --
    # confirm this on the pandas version in use.
    n_missing = int(missing_rate * df.size)
    for _ in range(n_missing):
        i = random.randint(0, df.shape[0] - 1)
        j = random.randint(0, df.shape[1] - 1)
        df.iat[i, j] = np.nan

    # 2. Add duplicated rows
    n_dupes = int(duplicate_rate * len(df))
    if n_dupes > 0:
        dupes = df.sample(n=n_dupes, replace=True)
        df = pd.concat([df, dupes], ignore_index=True)

    # 3. Inconsistent case (strings only)
    # Cells are addressed by position: label-based access returns a Series
    # instead of a scalar when the row index contains duplicate labels.
    for col in df.select_dtypes(include='object').columns:
        j = df.columns.get_loc(col)
        for i in range(len(df)):
            val = df.iat[i, j]
            if pd.notnull(val) and random.random() < case_error_rate:
                if isinstance(val, str):
                    df.iat[i, j] = val.lower() if val.isupper() else val.upper()

    # 4. Swap values between columns
    # Two distinct columns are required; with fewer, random.sample would raise.
    if df.shape[1] >= 2:
        for _ in range(int(swap_rate * len(df))):
            row = random.randint(0, len(df) - 1)
            col1, col2 = random.sample(range(df.shape[1]), 2)
            df.iat[row, col1], df.iat[row, col2] = df.iat[row, col2], df.iat[row, col1]

    # 5. Introduce typos (change a character in a string)
    # Object columns are re-detected because the swap step can change dtypes.
    for col in df.select_dtypes(include='object').columns:
        j = df.columns.get_loc(col)
        for i in range(len(df)):
            val = df.iat[i, j]
            if pd.notnull(val) and random.random() < typo_rate:
                if isinstance(val, str) and len(val) > 2:
                    pos = random.randint(0, len(val) - 1)
                    char = random.choice(string.ascii_letters)
                    df.iat[i, j] = val[:pos] + char + val[pos + 1:]

    return df


In [4]:
# Read the clean NIH RePORTER project export from the working directory
df_clean = pd.read_csv(filepath_or_buffer="nih_reporter_projects_test.csv")

# Build the corrupted counterpart; the fixed seed keeps the run repeatable
df_dirty = inject_errors(df=df_clean, seed=42)

# Persist the corrupted frame (without the index column), then preview the first five rows
df_dirty.to_csv(path_or_buf="your_data_dirty.csv", index=False)
df_dirty.head(n=5)

Unnamed: 0,APPL_ID,SUBPROJECT_ID,FISCAL_YEAR,PROJECT_NUM,PROJECT_SERIAL_NUM,ORGANIZATION,AWARD_TYPE,ACTIVITY_CODE,AWARD_AMOUNT,PROJECT_NUM_SPLIT,...,PROJECT_TITLE,PHR_TEXT,SPENDING_CATEGORIES_DESC,ARRA_FUNDED,BUDGET_START,BUDGET_END,CFDA_CODE,FUNDING_MECHANISM,DIRECT_COST_AMT,INDIRECT_COST_AMT
0,10827889.0,9078.0,2025.0,5P01AG052350-08,AG052350,{'org_name': 'UNIVERSITY OF SOUTHERN CALIFORNI...,5,P01,957843.0,"{'appl_type_code': '5', 'activity_code': 'P01'...",...,"Project 2 - Imaging of Brain Connrectivity, St...",Project Narrative:\nNot required per funding o...,,N,2024-04-01T00:00:00,2025-03-31T00:00:00,,Non-SBIR/STTR,704507.0,253336.0
1,10617263.0,,2023.0,5F32DK132864-02,DK132864,"{'org_name': 'DANA-FARBER CANCER INST', 'city'...",5,F32,71792.0,"{'appl_type_code': '5', 'activity_code': 'F32'...",...,Elucidating novel molecular mechanisms of iris...,PROJECT NARRATIVE:\nInactive lifestyle leads t...,Biotechnology; Obesity; Physical Activity; Pre...,N,2023-05-01T00:00:00,2024-C4-30T00:00:00,847.0,"Training, Individual",71792.0,0.0
2,11032318.0,,2025.0,1K0jDK141969-01,DK141969,"{'org_name': 'DANA-FARBER CANCER INST', 'city'...",1,K01,152454.0,"{'appl_type_code': '1', 'activity_code': 'K01'...",...,The mechanistic study on irisin-mediated immun...,PROJECT NARRATIVE\nExercise benefits the body ...,,n,2025-01-01T00:00:00,2025-11-30T00:00:00,847.0,,141589.0,10865.0
3,10824314.0,,2024.0,5F32DK132864-03,DK132864,"{'org_name': 'DANA-FARBER CANCER INST', 'city'...",5,F32,56728.0,"{'appl_type_code': '5', 'activity_code': 'F32'...",...,Elucidating novel molecular mechanisms of iris...,PROJECT NARRATIVE:\nInactive lifestyle leads t...,Biotechnology; Obesity; Physical Activity; Pre...,n,2024-05-01T00:00:00,2024-12-31T00:00:00,847.0,"Training, Individual",56728.0,0.0
4,10646015.0,,2023.0,1r21ai175731-01,AI175731,"{'org_name': 'UNIV OF ARKANSAS FOR MED SCIS', ...",1,R21,229500.0,"{'appl_type_code': '1', 'activity_code': 'R21'...",...,Pyroptotic Macrophages Traps Against Shigella ...,Shigella is a gastrointestinal pathogen that c...,Biodefense and Related Countermeasures; Digest...,N,2023-08-25T00:00:00,2024-07-31T00:00:00,855.0,Non-SBIR/STTR,150000.0,79500.0
