# Synthetic Data Project – End-to-End Pipeline
Covers Questions 2–9: generation, errors, modules, cleaning, visuals, stats, augmentation, models.

## Setup Project Structure

In [None]:

import os, sys, textwrap, traceback, json
from pathlib import Path
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

# Project layout: every artefact directory hangs off a single root folder.
ROOT = Path("./synth_project")
DATA_RAW = ROOT.joinpath("data", "raw")
DATA_PROCESSED = ROOT.joinpath("data", "processed")
MODELS, RESULTS, LOGS, PLOTS, PKG = (
    ROOT / name for name in ("models", "results", "logs", "plots", "synthpkg")
)

# Create the whole tree up front; existing directories are left untouched.
for folder in (DATA_RAW, DATA_PROCESSED, MODELS, RESULTS, LOGS, PLOTS, PKG):
    folder.mkdir(parents=True, exist_ok=True)

print('Project root:', ROOT.resolve())


## Q4 – Create Minimal Package (`synthpkg`)

In [None]:

from pathlib import Path
import textwrap

# __init__.py -- package marker so `synthpkg` can be imported.
Path(PKG, "__init__.py").write_text("# synthpkg package\n")

# stats.py -- NaN-aware mean / median / population std returning plain floats.
# The module source is kept as a dedented literal so it reads like real code.
Path(PKG, "stats.py").write_text(textwrap.dedent("""\
    import numpy as np

    def mean(arr):
        arr = np.asarray(arr, dtype=float)
        return float(np.nanmean(arr))

    def median(arr):
        arr = np.asarray(arr, dtype=float)
        return float(np.nanmedian(arr))

    def std(arr):
        arr = np.asarray(arr, dtype=float)
        return float(np.nanstd(arr, ddof=0))
    """))

# augment.py -- bootstrap-style augmentation: resample rows with replacement,
# then jitter the numeric columns with Gaussian noise.
# The generated augment_df now returns an empty copy for an empty frame instead
# of failing inside rng.integers(low=0, high=0).
Path(PKG, "augment.py").write_text(textwrap.dedent('''\
    import numpy as np
    import pandas as pd

    def augment_df(df, numeric_cols, target_col=None, scale=1.0, noise_frac=0.02, target_size_factor=2.0, random_state=42):
        """Resample df with replacement and add Gaussian noise to numeric_cols.

        The result has ceil(target_size_factor * len(df)) rows. The noise added
        to each numeric column has standard deviation
        noise_frac * scale * (population std of that column in df), floored at
        1e-12. target_col is accepted but not used: the target value travels
        unchanged with each resampled row. An empty df yields an empty copy.
        """
        rng = np.random.default_rng(random_state)
        n_original = len(df)
        if n_original == 0:
            # No rows to sample from; rng.integers(0, 0) would raise ValueError.
            return df.copy()
        n_target = int(np.ceil(target_size_factor * n_original))
        idx = rng.integers(low=0, high=n_original, size=n_target)
        df_aug = df.iloc[idx].copy().reset_index(drop=True)
        for col in numeric_cols:
            col_std = df[col].std(ddof=0)
            noise = rng.normal(loc=0.0, scale=max(1e-12, noise_frac * col_std * scale), size=len(df_aug))
            df_aug[col] = df_aug[col].astype(float) + noise
        return df_aug
    '''))

# visuals.py -- four matplotlib helpers (histogram, bar counts, scatter,
# correlation heatmap); each one draws a figure, saves it at 150 dpi and
# closes it. The source is written out verbatim from a dedented literal.
Path(PKG, "visuals.py").write_text(textwrap.dedent("""\
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    def save_histogram(series, out_path, bins=30, title=None, xlabel=None):
        fig, ax = plt.subplots(figsize=(6,4))
        ax.hist(series.dropna(), bins=bins)
        ax.set_title(title or f'Histogram of {series.name}')
        ax.set_xlabel(xlabel or series.name)
        ax.set_ylabel('Frequency')
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
        plt.close(fig)

    def save_bar_counts(series, out_path, title=None, xlabel=None):
        counts = series.value_counts(dropna=False)
        fig, ax = plt.subplots(figsize=(6,4))
        ax.bar(counts.index.astype(str), counts.values)
        ax.set_title(title or f'Counts of {series.name}')
        ax.set_xlabel(xlabel or series.name)
        ax.set_ylabel('Count')
        ax.tick_params(axis='x', rotation=45)
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
        plt.close(fig)

    def save_scatter(x, y, out_path, title=None, xlabel=None, ylabel=None):
        fig, ax = plt.subplots(figsize=(6,4))
        ax.scatter(x, y, s=10)
        ax.set_title(title or 'Scatter Plot')
        ax.set_xlabel(xlabel or getattr(x, 'name', 'x'))
        ax.set_ylabel(ylabel or getattr(y, 'name', 'y'))
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
        plt.close(fig)

    def save_corr_heatmap(df, out_path, title='Correlation Heatmap'):
        corr = df.corr(numeric_only=True)
        fig, ax = plt.subplots(figsize=(6,5))
        cax = ax.imshow(corr.values, interpolation='nearest')
        ax.set_title(title)
        ax.set_xticks(range(len(corr.columns)))
        ax.set_yticks(range(len(corr.columns)))
        ax.set_xticklabels(corr.columns, rotation=90)
        ax.set_yticklabels(corr.columns)
        fig.colorbar(cax, ax=ax, fraction=0.046, pad=0.04)
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
        plt.close(fig)
    """))

# generator.py -- DataGenerator class plus its two custom exceptions.
#
# The module source below is a regular (non-raw) string literal, so the newline
# that the generated `_log_error` appends to each log line has to be spelled
# `\\n` here. A bare `\n` is expanded while the file is written and leaves an
# unterminated f-string in generator.py, which makes the package unimportable.
Path(PKG, "generator.py").write_text(textwrap.dedent('''\
    import numpy as np
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    class InvalidPathError(Exception):
        """Raised when the parent directory of the output CSV does not exist."""

    class DataGenerationError(Exception):
        """Raised for an invalid n_rows or any failure while building/saving the data."""

    class DataGenerator:
        """Build a synthetic customer table and save it as a CSV file."""

        def __init__(self, out_csv, log_file=None, random_state=42):
            self.out_csv = Path(out_csv)
            self.log_file = Path(log_file) if log_file else None
            self.rng = np.random.default_rng(random_state)

        def _log_error(self, msg):
            """Append a timestamped message to log_file (no-op when it is None)."""
            if self.log_file is not None:
                self.log_file.parent.mkdir(parents=True, exist_ok=True)
                with open(self.log_file, 'a', encoding='utf-8') as f:
                    f.write(f'[{datetime.now().isoformat()}] {msg}\\n')

        def generate(self, n_rows=500, introduce_nans=True):
            """Generate n_rows synthetic rows, write them to out_csv and return the DataFrame.

            Raises InvalidPathError when the output directory is missing and
            DataGenerationError for a non-int / non-positive n_rows or any
            unexpected failure. Each error message is passed to _log_error first.
            """
            out_dir = self.out_csv.parent
            if not out_dir.exists():
                msg = f'Invalid output directory: {out_dir}'
                self._log_error(msg)
                raise InvalidPathError(msg)

            if not isinstance(n_rows, int) or n_rows <= 0:
                msg = f'n_rows must be a positive integer. Got: {n_rows}'
                self._log_error(msg)
                raise DataGenerationError(msg)

            try:
                # Draw order is kept fixed so a given random_state reproduces the same data.
                age = self.rng.integers(18, 80, size=n_rows)
                income = self.rng.normal(70000, 20000, size=n_rows).clip(15000, None)
                account_balance = self.rng.normal(15000, 8000, size=n_rows).clip(0, None)
                visits_last_month = self.rng.poisson(3, size=n_rows)
                avg_session_minutes = self.rng.normal(12, 6, size=n_rows).clip(0.5, None)

                gender = self.rng.choice(['Male', 'Female', 'Other'], size=n_rows, p=[0.48, 0.48, 0.04])
                product_type = self.rng.choice(['Basic', 'Plus', 'Premium'], size=n_rows, p=[0.5, 0.3, 0.2])

                # Purchase probability: a base rate plus small income / engagement / product effects.
                prob_purchase = (
                    0.2
                    + 0.000006*(income-30000)
                    + 0.05*(visits_last_month>2).astype(float)
                    + 0.01*(avg_session_minutes>10).astype(float)
                    + 0.05*(product_type=='Premium').astype(float)
                )
                prob_purchase = np.clip(prob_purchase, 0.01, 0.95)
                purchased = (self.rng.random(n_rows) < prob_purchase).astype(int)

                df = pd.DataFrame({
                    'age': age,
                    'income': income.round(2),
                    'account_balance': account_balance.round(2),
                    'visits_last_month': visits_last_month,
                    'avg_session_minutes': avg_session_minutes.round(2),
                    'gender': gender,
                    'product_type': product_type,
                    'purchased': purchased,
                })

                if introduce_nans:
                    num_cols = ['age','income','account_balance','visits_last_month','avg_session_minutes']
                    # Integer columns cannot hold NaN. Cast up front so the masked
                    # assignment below does not rely on pandas silently upcasting
                    # int64 to float64 (deprecated in recent pandas releases).
                    df = df.astype({col: float for col in num_cols})
                    for col in num_cols:
                        mask_col = (self.rng.random(n_rows) < 0.006)
                        df.loc[mask_col, col] = np.nan
                    cat_mask = self.rng.random(n_rows) < 0.01
                    df.loc[cat_mask, 'gender'] = np.nan

                df.to_csv(self.out_csv, index=False)
                return df

            except Exception as e:
                msg = f'Unexpected error during generation: {repr(e)}'
                self._log_error(msg)
                raise DataGenerationError(msg) from e
    '''))
print('Package created at:', PKG.resolve())


In [None]:

import sys
sys.path.insert(0, str(Path(ROOT).resolve()))
from synthpkg.generator import DataGenerator, InvalidPathError, DataGenerationError
from synthpkg import stats as sp_stats
from synthpkg import augment as sp_aug
from synthpkg import visuals as sp_vis


## Q2 & Q3 – Generate Data + Exception Handling

In [None]:

import traceback

raw_csv = DATA_RAW / "generated_data.csv"
log_file = LOGS / "errors.txt"

# Successful generation
gen = DataGenerator(out_csv=raw_csv, log_file=log_file, random_state=7)
df_raw = gen.generate(n_rows=800, introduce_nans=True)
print("Saved:", raw_csv.resolve())

# Intentional failing generation for screenshot/logging: the parent directory
# of this path is never created, so generate() is expected to refuse it.
invalid_csv = ROOT / "not_a_real_dir" / "generated_data.csv"
gen_bad = DataGenerator(out_csv=invalid_csv, log_file=log_file, random_state=7)

# Bind `trace` before the try so the screenshot step further down always has
# text to render, even if the run unexpectedly succeeds.
trace = "No exception was raised by the intentionally invalid generation run."
try:
    gen_bad.generate(n_rows=200)
except (InvalidPathError, DataGenerationError):
    # Only the generator's own error types count as "expected"; anything else
    # is a genuine bug and is allowed to propagate.
    trace = traceback.format_exc()
    print("Captured error (as expected):\n", trace)
else:
    print("Warning: the invalid-path generation did not raise an error.")
# Save a text screenshot of the traceback
def save_text_as_image(text, out_path, title="Error Handling Screenshot"):
    """Render ``text`` as a monospace image and save it to ``out_path``.

    Args:
        text: Multi-line text (for example a formatted traceback).
        out_path: Destination image path passed to ``Figure.savefig``.
        title: Heading drawn above the text.

    Each input line is wrapped on its own at 110 columns, so the original line
    breaks and leading indentation of the text are kept in the image.
    """
    import textwrap as tw

    # textwrap.fill() on the whole text treats it as one paragraph and replaces
    # every newline with a space, which flattens a traceback into a single
    # blob. Wrapping line by line keeps its structure.
    wrapped = "\n".join(
        tw.fill(line, width=110, subsequent_indent="    ") if line.strip() else ""
        for line in text.splitlines()
    )

    fig, ax = plt.subplots(figsize=(10,6))
    ax.axis('off')
    ax.text(0.01, 0.95, title, fontsize=14, va='top', fontfamily='monospace')
    ax.text(0.01, 0.88, wrapped, fontsize=10, va='top', fontfamily='monospace')
    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.close(fig)

# Render the captured traceback text to a PNG stored in the logs directory,
# then report where the screenshot and the error log live.
error_img = LOGS.joinpath("error_screenshot.png")
save_text_as_image(text=trace, out_path=error_img)
print(
    "Error screenshot saved:",
    error_img.resolve(),
)
print(
    "Error log path:",
    log_file.resolve(),
)


## Q5 – Data Preparation with Pandas

In [None]:

import pandas as pd

# Load the dataset back from the CSV at `raw_csv`; cleaning works on this frame.
df = pd.read_csv(raw_csv)

def save_df_head_as_image(df_, out_path, title="DataFrame Head"):
    """Save the first ten rows of ``df_`` as a table image at ``out_path``.

    The table is drawn on an axis-free 10x3 inch figure under ``title``, uses a
    fixed 8pt font, is stretched vertically by 1.5 and is written at 150 dpi.
    """
    preview_cells = df_.head(10).values

    figure, axes = plt.subplots(figsize=(10, 3))
    axes.axis('off')
    axes.set_title(title)

    tbl = axes.table(
        cellText=preview_cells,
        colLabels=df_.columns,
        loc='center',
    )
    # Fixed font size: automatic sizing is switched off first.
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(8)
    tbl.scale(1, 1.5)

    figure.tight_layout()
    figure.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.close(figure)

# Snapshot of the raw data before any cleaning.
before_img = PLOTS / "before_cleaning.png"
save_df_head_as_image(df, before_img, title="Before Cleaning (head)")

numeric_cols = ['age','income','account_balance','visits_last_month','avg_session_minutes']
categorical_cols = ['gender','product_type']
target_col = 'purchased'

# Impute missing values. The filled column is assigned back rather than using
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary selection,
# triggers a chained-assignment warning on pandas 2.x and leaves `df` unchanged
# once Copy-on-Write is active.
for col in numeric_cols:
    # Median imputation for numeric features.
    df[col] = df[col].fillna(df[col].median())

for col in categorical_cols:
    # Most frequent category; "Unknown" if the column has no non-null value.
    mode_val = df[col].mode(dropna=True)
    fill_value = mode_val.iloc[0] if len(mode_val) > 0 else "Unknown"
    df[col] = df[col].fillna(fill_value)

# One-hot encode the categoricals, dropping the first level of each.
df_clean = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

clean_csv = DATA_PROCESSED / "cleaned_data.csv"
df_clean.to_csv(clean_csv, index=False)

after_img = PLOTS / "after_cleaning.png"
save_df_head_as_image(df_clean, after_img, title="After Cleaning + Encoding (head)")

print("Cleaned CSV:", clean_csv.resolve())


## Q6 – Visualization with Matplotlib

In [None]:

# Four figures written to PLOTS: an age histogram, product-type counts, a
# correlation heatmap of the numeric features and an income/balance scatter.
sp_vis.save_histogram(
    series=df['age'],
    out_path=PLOTS / "hist_age.png",
    bins=20,
    title="Age Distribution",
)
sp_vis.save_bar_counts(
    series=df['product_type'],
    out_path=PLOTS / "bar_product_type.png",
    title="Product Type Counts",
)
sp_vis.save_corr_heatmap(
    df_clean[['age', 'income', 'account_balance', 'visits_last_month', 'avg_session_minutes']],
    PLOTS / "corr_heatmap.png",
)
sp_vis.save_scatter(
    x=df['income'],
    y=df['account_balance'],
    out_path=PLOTS / "scatter_income_balance.png",
    title="Income vs Account Balance",
)

print("Plots saved to:", PLOTS.resolve())


## Q7 – Statistics & Augmentation with NumPy

In [None]:

# Descriptive statistics through the package helpers, in the order
# mean, median, std for each column.
mean_age, median_age, std_age = (
    stat(df['age']) for stat in (sp_stats.mean, sp_stats.median, sp_stats.std)
)
mean_income, median_income, std_income = (
    stat(df['income']) for stat in (sp_stats.mean, sp_stats.median, sp_stats.std)
)

print("Age  -> mean:", mean_age, "median:", median_age, "std:", std_age)
print("Income -> mean:", mean_income, "median:", median_income, "std:", std_income)

# Augment the cleaned frame to twice its size with 2% noise on numeric columns.
numeric_cols_for_noise = ['age', 'income', 'account_balance', 'visits_last_month', 'avg_session_minutes']
df_aug = sp_aug.augment_df(
    df_clean,
    numeric_cols=numeric_cols_for_noise,
    target_col='purchased',
    scale=1.0,
    noise_frac=0.02,
    target_size_factor=2.0,
    random_state=7,
)

aug_csv = DATA_PROCESSED / "augmented_data.csv"
df_aug.to_csv(aug_csv, index=False)
print("Augmented CSV:", aug_csv.resolve())


## Q8 – Model Training (Logistic Regression & Random Forest)

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib
import pandas as pd

def _score_classifier(model, X_test, y_test, dataset_name, model_name):
    """Score a fitted binary classifier on the held-out split; return one metrics row."""
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    return {
        "dataset": dataset_name, "model": model_name,
        "accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 yields the same 0.0 the default produces when a class
        # is never predicted, without emitting an UndefinedMetricWarning.
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba),
    }


def train_and_evaluate(df_ml, dataset_name, target_col='purchased'):
    """Train logistic regression and random forest on ``df_ml`` and score both.

    Args:
        df_ml: Frame holding the feature columns plus ``target_col``.
        dataset_name: Label stored in each metrics row and used in the saved
            model file names.
        target_col: Name of the target column; it is cast to int.

    Returns:
        A list of two dicts (LogisticRegression first, then RandomForest) with
        the keys dataset, model, accuracy, precision, recall, f1 and roc_auc,
        computed on a stratified 20% test split (random_state=7).

    Side effects:
        Dumps both fitted models with joblib into the ``MODELS`` directory as
        ``logreg_<dataset_name>.joblib`` and ``rf_<dataset_name>.joblib``.
    """
    X = df_ml.drop(columns=[target_col])
    y = df_ml[target_col].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y)

    results = []

    # Logistic regression on scaled (not centred) features.
    pipe_lr = Pipeline([("scaler", StandardScaler(with_mean=False)), ("lr", LogisticRegression(max_iter=200, solver='liblinear'))])
    pipe_lr.fit(X_train, y_train)
    results.append(_score_classifier(pipe_lr, X_test, y_test, dataset_name, "LogisticRegression"))
    joblib.dump(pipe_lr, MODELS / f"logreg_{dataset_name}.joblib")

    # Random forest on the raw features.
    rf = RandomForestClassifier(n_estimators=200, random_state=7)
    rf.fit(X_train, y_train)
    results.append(_score_classifier(rf, X_test, y_test, dataset_name, "RandomForest"))
    joblib.dump(rf, MODELS / f"rf_{dataset_name}.joblib")
    return results

import pandas as pd
# Reload both datasets from the processed directory so training uses exactly
# what was written to disk.
df_clean = pd.read_csv(DATA_PROCESSED / "cleaned_data.csv")
df_aug = pd.read_csv(DATA_PROCESSED / "augmented_data.csv")

# NOTE(review): the augmented set is resampled with replacement before the
# train/test split happens inside train_and_evaluate, so near-duplicate rows
# can presumably land on both sides of the split and flatter its scores.
# Confirm before comparing the two datasets' metrics.
metrics_clean = train_and_evaluate(df_clean, dataset_name="cleaned")
metrics_aug = train_and_evaluate(df_aug, dataset_name="augmented")

all_metrics = pd.DataFrame([*metrics_clean, *metrics_aug])
all_metrics.to_csv(RESULTS / "metrics.csv", index=False)

# Best model: highest F1, ties broken by ROC AUC.
best_row = (
    all_metrics
    .sort_values(by=["f1", "roc_auc"], ascending=False)
    .iloc[0]
)
print("Best model:", best_row.to_dict())
print("Metrics saved to:", (RESULTS / "metrics.csv").resolve())


## Q9 – Generate a Short Report (Markdown)

In [None]:

# Short markdown summary pointing at the generated artefacts.
report_md = ROOT / "report.md"
report_md.write_text(
    "# Synthetic Data Project – Report\n\n"
    f"- Outputs saved under `{ROOT.resolve()}`.\n"
    "- OOP, exceptions, modules, cleaning, plots, stats, augmentation, and model training done.\n"
    "- See `results/metrics.csv` for model comparisons; `plots/` for figures; `logs/` for error logs and screenshot.\n",
    # The heading contains a non-ASCII en dash; pin the encoding so the file
    # does not depend on the platform's default locale encoding.
    encoding="utf-8",
)
print("Report saved to:", report_md.resolve())
