In [1]:
import pandas as pd
import numpy as np

# Read the source dataset from disk.
df = pd.read_csv(filepath_or_buffer="teen_phone_addiction_dataset.csv")

# Every perturbation below is applied to a deep copy, so `df` itself is not modified.
df_noisy = df.copy(deep=True)

# One seeded generator is shared by all noise steps, making the result reproducible.
rng = np.random.default_rng(42)

# 1. Perturb every numeric column (except "ID") with small Gaussian noise
numeric_cols = list(df_noisy.select_dtypes(include=[np.number]).columns)

# Columns whose noisy values are rounded back to whole numbers.
# Rounding does not restore an integer dtype: the column stays float.
whole_number_cols = (
    "Academic_Performance", "Phone_Checks_Per_Day", "Apps_Used_Daily",
    "Social_Interactions", "Anxiety_Level", "Depression_Level", "Self_Esteem",
    "Parental_Control", "Family_Communication",
)

for col in numeric_cols:
    if col == "ID":
        continue  # identifiers are never perturbed

    original = df_noisy[col]
    # Remember the observed range before any noise is added.
    lower, upper = original.min(), original.max()

    # Zero-mean jitter whose standard deviation is 5% of the column's own std.
    jitter = rng.normal(loc=0, scale=original.std() * 0.05, size=len(df_noisy))
    noisy = original + jitter

    # Fixed limits for specific columns.
    if col == "Age":
        noisy = noisy.clip(13, 19)
    if col == "Addiction_Level":
        noisy = noisy.clip(1, 10)
    if col in whole_number_cols:
        noisy = noisy.round()

    # Finally pin the values inside the range seen before the noise was added.
    df_noisy[col] = noisy.clip(lower, upper)

# 2. Categorical noise: overwrite a random 5–10% of rows in every object column
categorical_cols = list(df_noisy.select_dtypes(include=["object"]).columns)
for col in categorical_cols:
    # Replacement candidates are the distinct non-missing values of the column.
    candidates = list(df_noisy[col].dropna().unique())
    # The share of rows to overwrite is drawn per column from U(0.05, 0.1).
    n_changes = int(len(df_noisy) * rng.uniform(0.05, 0.1))
    rows = rng.choice(len(df_noisy), size=n_changes, replace=False)
    # Values are sampled with replacement, so a row may receive the value it already had.
    df_noisy.loc[rows, col] = rng.choice(candidates, size=n_changes, replace=True)

# 3. Inject outliers: scale up a random 1% of rows in each numeric column
outlier_fraction = 0.01  # share of rows turned into outliers
n_outliers = int(len(df_noisy) * outlier_fraction)
for col in numeric_cols:
    if col == "ID":
        continue  # identifiers are left alone
    picked = rng.choice(len(df_noisy), size=n_outliers, replace=False)
    # A single multiplier is drawn per column and applied to all of its picked rows.
    factor = rng.uniform(1.5, 3.0)
    df_noisy.loc[picked, col] = df_noisy.loc[picked, col] * factor

# 4. Add missing values (~3% of rows) to up to five randomly chosen columns.
# "ID" is removed from the candidate pool so identifiers can never be blanked,
# matching the Gaussian-noise and outlier steps, which also skip "ID".
nan_candidates = [c for c in categorical_cols + numeric_cols if c != "ID"]
# Cap the request at the number of available columns: sampling without
# replacement raises ValueError when `size` exceeds the population.
cols_for_nan = rng.choice(nan_candidates, size=min(5, len(nan_candidates)), replace=False)
# Number of rows to blank per column (loop-invariant).
n_missing = int(len(df_noisy) * 0.03)
for col in cols_for_nan:
    nan_indices = rng.choice(len(df_noisy), n_missing, replace=False)
    df_noisy.loc[nan_indices, col] = np.nan

# Persist the perturbed frame as an Excel workbook, omitting the row index.
df_noisy.to_excel(excel_writer="teen_phone_addiction_dataset_noisy.xlsx", index=False)

# Confirmation message for the notebook output.
print("Noisy dataset saved as teen_phone_addiction_dataset_noisy.xlsx")


Noisy dataset saved as teen_phone_addiction_dataset_noisy.xlsx


In [3]:
import pandas as pd

# Reload the noisy dataset from the Excel workbook (rebinds `df`).
df = pd.read_excel(io="teen_phone_addiction_dataset_noisy.xlsx")

# CSV alternative, left disabled:
# df = pd.read_csv("teen_phone_addiction_dataset_noisy.csv")


In [5]:
import pandas as pd

# Load the noisy workbook back into `df` ...
df = pd.read_excel(io="teen_phone_addiction_dataset_noisy.xlsx")

# ... and write the same data out as CSV, omitting the row index.
df.to_csv(path_or_buf="teen_phone_addiction_dataset_noisy.csv", index=False)

In [6]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,3000.0,1500.5,866.169729,1.0,750.75,1500.5,2250.25,3000.0
Age,3000.0,16.094612,2.37014,13.0,14.063936,15.995079,17.92924,33.805617
Daily_Usage_Hours,3000.0,5.10621,2.175749,0.0,3.7184,5.012012,6.400018,21.135083
Sleep_Hours,3000.0,6.542057,1.583042,3.0,5.4872,6.510024,7.536298,18.975768
Academic_Performance,3000.0,75.445682,15.595047,50.0,62.0,75.0,88.0,168.569612
Social_Interactions,3000.0,5.199752,3.454915,0.0,2.0,5.0,8.0,28.972808
Exercise_Hours,3000.0,1.052212,0.74167,0.0,0.448809,1.012452,1.55584,3.988475
Anxiety_Level,3000.0,5.666795,3.045186,1.0,3.0,6.0,8.0,25.35902
Depression_Level,3000.0,5.519772,2.980505,1.0,3.0,5.0,8.0,21.654569
Self_Esteem,2910.0,5.592221,3.018924,1.0,3.0,6.0,8.0,22.135044
