In [1]:
import os
from pathlib import Path

In [2]:
PROJECT_NAME = "project"
ROOT = Path(PROJECT_NAME)

# Define the directory list
DIRS = [
    ROOT / "data" / "raw",
    ROOT / "data" / "processed",
    ROOT / "docs",
    ROOT / "models",
    ROOT / "notebooks",
    ROOT / "reports" / "figures",
    ROOT / "src",
    ROOT / "scripts",
]

In [3]:
# Create the root directory
ROOT.mkdir(exist_ok=True)

# Create all subdirectories
for d in DIRS:
    d.mkdir(parents=True, exist_ok=True)
    # Create a .gitkeep file in each directory
    (d / ".gitkeep").touch()

print("Directories ensured:", DIRS)

Directories ensured: [PosixPath('project/data/raw'), PosixPath('project/data/processed'), PosixPath('project/docs'), PosixPath('project/models'), PosixPath('project/notebooks'), PosixPath('project/reports/figures'), PosixPath('project/src'), PosixPath('project/scripts')]


In [4]:
from textwrap import dedent

config_py = dedent("""
from __future__ import annotations
import os
from dotenv import load_dotenv

def load_environment() -> None:
    load_dotenv()

def get_api_key(name: str = 'ALPHAVANTAGE_API_KEY') -> str | None:
    return os.getenv(name)
""").lstrip()

with open(ROOT / "src/config.py", "w", encoding="utf-8") as f:
    f.write(config_py)

print(f"Written {ROOT / 'src/config.py'}")

Written project/src/config.py


In [5]:
from textwrap import dedent

storage_py = dedent("""
from __future__ import annotations
import pandas as pd
from pathlib import Path

def detect_format(path: str | Path) -> str:
    ext = str(path).lower().rsplit('.', 1)[-1]
    if ext in ('csv', 'parquet'):
        return ext
    raise ValueError(f'Unsupported file extension: {ext}')

def write_df(df: pd.DataFrame, path: str | Path) -> Path:
    path = Path(path)
    fmt = detect_format(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if fmt == 'csv':
        df.to_csv(path, index=False)
        return path
    elif fmt == 'parquet':
        try:
            df.to_parquet(path, index=False)
        except Exception:
            fallback = path.with_suffix('.csv')
            df.to_csv(fallback, index=False)
            return fallback
        return path
    else:
        raise ValueError(f'Unsupported format: {fmt}')

def read_df(path: str | Path) -> pd.DataFrame:
    path = Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        df = pd.read_csv(path)
        if 'date' in df.columns:
            try:
                df['date'] = pd.to_datetime(df['date'])
            except Exception:
                pass
        return df
    elif fmt == 'parquet':
        return pd.read_parquet(path)
    else:
        raise ValueError(f'Unsupported format: {fmt}')
""").lstrip()

with open(ROOT / "src/storage.py", "w", encoding="utf-8") as f:
    f.write(storage_py)

print(f"Written {ROOT / 'src/storage.py'}")

Written project/src/storage.py


In [6]:
from textwrap import dedent
cleaning_py = dedent("""
from __future__ import annotations
import pandas as pd
import numpy as np
from typing import Iterable

def fill_missing_median(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns and pd.api.types.is_numeric_dtype(out[c]):
            med = out[c].median()
            out[c] = out[c].fillna(med)
    return out

def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    out = df.copy()
    # drop columns above threshold missing
    col_missing = out.isna().mean()
    to_drop = [c for c, r in col_missing.items() if r > threshold]
    if to_drop:
        out = out.drop(columns=to_drop)
    # drop remaining rows with any missing
    out = out.dropna(axis=0, how='any')
    return out

def normalize_data(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns and pd.api.types.is_numeric_dtype(out[c]):
            mu = out[c].mean()
            sigma = out[c].std(ddof=0)
            if sigma and not np.isnan(sigma) and sigma != 0:
                out[c] = (out[c] - mu) / sigma
    return out
""").lstrip()

with open(ROOT / "src/cleaning.py", "w", encoding="utf-8") as f:
    f.write(cleaning_py)

print(f"Written {ROOT / 'src/cleaning.py'}")

Written project/src/cleaning.py


In [7]:
from textwrap import dedent
outliers_py = dedent("""
from __future__ import annotations
from typing import Iterable, Dict, Optional, Literal
import numpy as np
import pandas as pd

OutlierMethod = Literal["iqr", "zscore"]
HandleMode = Literal["flag", "remove", "winsorize", "none"]

__all__ = [
    "detect_outliers_iqr",
    "detect_outliers_zscore",
    "winsorize_series",
    "flag_outliers_df",
    "remove_outliers_df",
    "winsorize_df",
]

def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    s = series.dropna()
    if s.empty:
        return pd.Series(False, index=series.index)
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    mask = (series < lower) | (series > upper)
    return mask.fillna(False)

def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
    mu = series.mean(skipna=True)
    sigma = series.std(ddof=0, skipna=True)
    if sigma == 0 or np.isnan(sigma):
        return pd.Series(False, index=series.index)
    z = (series - mu) / sigma
    mask = z.abs() > threshold
    return mask.fillna(False)

def winsorize_series(series: pd.Series, lower: float = 0.05, upper: float = 0.95) -> pd.Series:
    if series.dropna().empty:
        return series
    lo = series.quantile(lower)
    hi = series.quantile(upper)
    return series.clip(lower=lo, upper=hi)

def flag_outliers_df(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    method: OutlierMethod = "iqr",
    method_params: Optional[Dict] = None,
    flag_suffix: Optional[str] = None,
) -> pd.DataFrame:
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    method_params = method_params or {}
    out = df.copy()
    for col in columns:
        if not pd.api.types.is_numeric_dtype(out[col]):
            continue
        if method == "iqr":
            mask = detect_outliers_iqr(out[col], **method_params)
            suffix = flag_suffix or "outlier_iqr"
        elif method == "zscore":
            mask = detect_outliers_zscore(out[col], **method_params)
            suffix = flag_suffix or "outlier_z"
        else:
            raise ValueError(f"Unsupported method: {method}")
        out[f"{col}_{suffix}"] = mask
    return out

def remove_outliers_df(
    df: pd.DataFrame,
    flag_columns: Optional[Iterable[str]] = None,
    how: Literal["any", "all"] = "any",
) -> pd.DataFrame:
    if flag_columns is None:
        flag_columns = [c for c in df.columns if "outlier" in c]
    if not flag_columns:
        return df.copy()
    mask = df[flag_columns].any(axis=1) if how == "any" else df[flag_columns].all(axis=1)
    return df.loc[~mask].copy()

def winsorize_df(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    lower: float = 0.05,
    upper: float = 0.95,
) -> pd.DataFrame:
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    out = df.copy()
    for col in columns:
        if pd.api.types.is_numeric_dtype(out[col]):
            out[col] = winsorize_series(out[col], lower=lower, upper=upper)
    return out
""").lstrip()

with open(ROOT / "src/outliers.py", "w", encoding="utf-8") as f:
    f.write(outliers_py)

print(f"Written {ROOT / 'src/outliers.py'}")

Written project/src/outliers.py


In [8]:
from textwrap import dedent

eda_py = dedent("""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis

def run_eda(df):
    sns.set(context='talk', style='whitegrid')
    pd.set_option('display.max_columns', 100)
    print(df.info(), df.isna().sum())
    desc = df[['age','income','transactions','spend']].describe().T
    desc['skew'] = [skew(df[c].dropna()) for c in desc.index]
    desc['kurtosis'] = [kurtosis(df[c].dropna()) for c in desc.index]
    print(desc)
    
    sns.histplot(df['income'], kde=True)
    plt.title('Income Distribution')
    plt.show()

    sns.boxplot(x=df['transactions'])
    plt.title('Transactions (Outliers)')
    plt.show()

    sns.histplot(df['spend'], kde=True)
    plt.title('Spend Distribution')
    plt.show()

    sns.scatterplot(data=df, x='income', y='spend')
    plt.title('Spend vs Income')
    plt.show()

    sns.countplot(data=df, x='region')
    plt.title('Count by Region')
    plt.show()
""").lstrip()

with open(ROOT / "scripts/eda.py", "w", encoding="utf-8") as f:
    f.write(eda_py)

print(f"Written {ROOT / 'scripts/eda.py'}")

Written project/scripts/eda.py


In [9]:
from textwrap import dedent

feature_engineering_py = dedent("""
import pandas as pd

def create_features(df):
    df['daily_return'] = df['close'].pct_change()
    df['rolling_avg_5d_close'] = df['close'].rolling(window=5).mean()
    df['rolling_vol_5d'] = df['daily_return'].rolling(window=5).std()
    return df
""").lstrip()

with open(ROOT / "scripts/feature_engineering.py", "w", encoding="utf-8") as f:
    f.write(feature_engineering_py)

print(f"Written {ROOT / 'scripts/feature_engineering.py'}")

Written project/scripts/feature_engineering.py


In [10]:
from textwrap import dedent

modeling_py = dedent("""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def train_regression_model(df):
    df['return'] = df['close'].pct_change()
    y = df['return'].shift(-1)
    features = ['open', 'high', 'low', 'close', 'volume']
    X = df[features]
    combined = pd.concat([y, X], axis=1)
    combined.dropna(inplace=True)
    y = combined['return']
    X = combined[features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Baseline (predicting returns)   R²={r2:.4f}  RMSE={rmse:.6f}')
    return lr, X_test, y_test
""").lstrip()

with open(ROOT / "scripts/modeling.py", "w", encoding="utf-8") as f:
    f.write(modeling_py)

print(f"Written {ROOT / 'scripts/modeling.py'}")

Written project/scripts/modeling.py


In [11]:
from textwrap import dedent

evaluation_py = dedent("""
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

def mean_impute(a: np.ndarray) -> np.ndarray:
    m = np.nanmean(a)
    out = a.copy()
    out[np.isnan(out)] = m
    return out

def evaluate_model(df):
    X_raw = df['x_feature'].values
    y = df['y_target'].values
    X_base = mean_impute(X_raw)
    model = LinearRegression().fit(X_base.reshape(-1,1), y)
    y_hat = model.predict(X_base.reshape(-1,1))
    df['x_imputed'] = X_base
    base_mae = mean_absolute_error(y, y_hat)
    print(f"Base MAE: {base_mae}")
""").lstrip()

with open(ROOT / "scripts/evaluation.py", "w", encoding="utf-8") as f:
    f.write(evaluation_py)
    
print(f"Written {ROOT / 'scripts/evaluation.py'}")

Written project/scripts/evaluation.py


In [12]:
from textwrap import dedent

reporting_py = dedent("""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

def generate_report(df):
    img_dir = Path('../deliverables/images')
    img_dir.mkdir(parents=True, exist_ok=True)
    
    plt.figure(figsize=(7,5))
    sns.scatterplot(data=df, x='volatility', y='return', hue='scenario', s=80)
    plt.title('Risk–Return by Scenario')
    plt.xlabel('Volatility')
    plt.ylabel('Return')
    plt.savefig(img_dir / 'risk_return.png', dpi=300)
    plt.show()
""").lstrip()

with open(ROOT / "scripts/reporting.py", "w", encoding="utf-8") as f:
    f.write(reporting_py)

print(f"Written {ROOT / 'scripts/reporting.py'}")

Written project/scripts/reporting.py


In [13]:
from textwrap import dedent

productization_py = dedent("""
from flask import Flask, request, jsonify
import pickle

app = Flask(__name__)

# Load the model
with open('model/model.pkl', 'rb') as f:
    model = pickle.load(f)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    features = data.get('features', None)
    if features is None:
        return jsonify({'error': 'No features provided'}), 400
    prediction = model.predict([features])
    return jsonify({'prediction': prediction[0]})

if __name__ == '__main__':
    app.run(port=5000)
""").lstrip()

with open(ROOT / "app.py", "w", encoding="utf-8") as f:
    f.write(productization_py)

print(f"Written {ROOT / 'app.py'}")

Written project/app.py


In [14]:
from textwrap import dedent

main_py = dedent("""
from __future__ import annotations
import argparse
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from src.config import load_environment
from src.storage import read_df, write_df
from scripts import eda, feature_engineering, modeling, evaluation, reporting

def main():
    load_environment()
    
    # Example usage of imported modules
    DATA_PATH = Path('data/raw/api_aapl.csv')
    if DATA_PATH.exists():
        df = read_df(DATA_PATH)
        df_featured = feature_engineering.create_features(df.copy())
        model, X_test, y_test = modeling.train_regression_model(df_featured.copy())
        
        # This part needs a DataFrame with 'x_feature' and 'y_target' for evaluation.
        # Since the provided notebooks have different data, we'll note this.
        # evaluation.evaluate_model(some_other_df) 
        
        # This part needs a DataFrame with scenario analysis results for reporting.
        # reporting.generate_report(scenario_df)
    else:
        print(f"{DATA_PATH} not found. Run the acquisition script first.")

if __name__ == "__main__":
    main()
""").lstrip()

with open(ROOT / "main.py", "w", encoding="utf-8") as f:
    f.write(main_py)

print(f"Written {ROOT / 'main.py'}")

Written project/main.py


In [15]:
from textwrap import dedent
readme = dedent("""
# Financial Engineering Project

## Overview
This project provides a comprehensive, end-to-end pipeline for financial analysis, including data acquisition, preprocessing, feature engineering, modeling, evaluation, and productization.

## Project Structure
- `data/`: Raw, interim, and processed datasets
- `src/`: Reusable Python modules for core logic (e.g., cleaning, outlier detection)
- `scripts/`: Standalone Python scripts for pipeline stages (e.g., data acquisition, EDA)
- `notebooks/`: Jupyter notebooks for exploratory analysis and prototyping
- `models/`: Trained and serialized models (e.g., `model.pkl`)
- `reports/`: Generated reports and figures
- `app.py`: Flask application for model deployment

## Quickstart
1. `python -m venv .venv && source .venv/bin/activate`
2. `pip install -r requirements.txt`
3. `cp .env.example .env` (and optionally set `ALPHAVANTAGE_API_KEY`)
4. `python main.py` to run the full pipeline.

""").lstrip()

with open(ROOT / "README.md", "w", encoding="utf-8") as f:
    f.write(readme)

print(f"Written {ROOT / 'README.md'}")

Written project/README.md


In [3]:
from textwrap import dedent

storage_py = dedent("""
from __future__ import annotations
import pandas as pd
from pathlib import Path

def detect_format(path: str | Path) -> str:
    ext = str(path).lower().rsplit('.', 1)[-1]
    if ext in ('csv', 'parquet'):
        return ext
    raise ValueError(f'Unsupported file extension: {ext}')

def write_df(df: pd.DataFrame, path: str | Path) -> Path:
    path = Path(path)
    fmt = detect_format(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    if fmt == 'csv':
        df.to_csv(path, index=False)
        return path
    elif fmt == 'parquet':
        try:
            df.to_parquet(path, index=False)
        except Exception:
            fallback = path.with_suffix('.csv')
            df.to_csv(fallback, index=False)
            return fallback
        return path
    else:
        raise ValueError(f'Unsupported format: {fmt}')

def read_df(path: str | Path) -> pd.DataFrame:
    path = Path(path)
    fmt = detect_format(path)
    if fmt == 'csv':
        df = pd.read_csv(path)
        if 'date' in df.columns:
            try:
                df['date'] = pd.to_datetime(df['date'])
            except Exception:
                pass
        return df
    elif fmt == 'parquet':
        return pd.read_parquet(path)
    else:
        raise ValueError(f'Unsupported format: {fmt}')
""").lstrip()

with open("src/storage.py", "w", encoding="utf-8") as f:
    f.write(storage_py)

print("Written src/storage.py")

Written src/storage.py


In [4]:
from textwrap import dedent
cleaning_py = dedent("""
from __future__ import annotations
import pandas as pd
import numpy as np
from typing import Iterable

def fill_missing_median(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns and pd.api.types.is_numeric_dtype(out[c]):
            med = out[c].median()
            out[c] = out[c].fillna(med)
    return out

def drop_missing(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    out = df.copy()
    # drop columns above threshold missing
    col_missing = out.isna().mean()
    to_drop = [c for c, r in col_missing.items() if r > threshold]
    if to_drop:
        out = out.drop(columns=to_drop)
    # drop remaining rows with any missing
    out = out.dropna(axis=0, how='any')
    return out

def normalize_data(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    out = df.copy()
    for c in cols:
        if c in out.columns and pd.api.types.is_numeric_dtype(out[c]):
            mu = out[c].mean()
            sigma = out[c].std(ddof=0)
            if sigma and not np.isnan(sigma) and sigma != 0:
                out[c] = (out[c] - mu) / sigma
    return out
""").lstrip()

with open("src/cleaning.py", "w", encoding="utf-8") as f:
    f.write(cleaning_py)

print("Written src/cleaning.py")

Written src/cleaning.py


In [5]:
from textwrap import dedent
outliers_py = dedent("""
from __future__ import annotations
from typing import Iterable, Dict, Optional, Literal
import numpy as np
import pandas as pd

OutlierMethod = Literal["iqr", "zscore"]
HandleMode = Literal["flag", "remove", "winsorize", "none"]

__all__ = [
    "detect_outliers_iqr",
    "detect_outliers_zscore",
    "winsorize_series",
    "flag_outliers_df",
    "remove_outliers_df",
    "winsorize_df",
]

def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    s = series.dropna()
    if s.empty:
        return pd.Series(False, index=series.index)
    q1 = s.quantile(0.25)
    q3 = s.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    mask = (series < lower) | (series > upper)
    return mask.fillna(False)

def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
    mu = series.mean(skipna=True)
    sigma = series.std(ddof=0, skipna=True)
    if sigma == 0 or np.isnan(sigma):
        return pd.Series(False, index=series.index)
    z = (series - mu) / sigma
    mask = z.abs() > threshold
    return mask.fillna(False)

def winsorize_series(series: pd.Series, lower: float = 0.05, upper: float = 0.95) -> pd.Series:
    if series.dropna().empty:
        return series
    lo = series.quantile(lower)
    hi = series.quantile(upper)
    return series.clip(lower=lo, upper=hi)

def flag_outliers_df(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    method: OutlierMethod = "iqr",
    method_params: Optional[Dict] = None,
    flag_suffix: Optional[str] = None,
) -> pd.DataFrame:
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    method_params = method_params or {}
    out = df.copy()
    for col in columns:
        if not pd.api.types.is_numeric_dtype(out[col]):
            continue
        if method == "iqr":
            mask = detect_outliers_iqr(out[col], **method_params)
            suffix = flag_suffix or "outlier_iqr"
        elif method == "zscore":
            mask = detect_outliers_zscore(out[col], **method_params)
            suffix = flag_suffix or "outlier_z"
        else:
            raise ValueError(f"Unsupported method: {method}")
        out[f"{col}_{suffix}"] = mask
    return out

def remove_outliers_df(
    df: pd.DataFrame,
    flag_columns: Optional[Iterable[str]] = None,
    how: Literal["any", "all"] = "any",
) -> pd.DataFrame:
    if flag_columns is None:
        flag_columns = [c for c in df.columns if "outlier" in c]
    if not flag_columns:
        return df.copy()
    mask = df[flag_columns].any(axis=1) if how == "any" else df[flag_columns].all(axis=1)
    return df.loc[~mask].copy()

def winsorize_df(
    df: pd.DataFrame,
    columns: Optional[Iterable[str]] = None,
    lower: float = 0.05,
    upper: float = 0.95,
) -> pd.DataFrame:
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()
    out = df.copy()
    for col in columns:
        if pd.api.types.is_numeric_dtype(out[col]):
            out[col] = winsorize_series(out[col], lower=lower, upper=upper)
    return out
""").lstrip()

with open("src/outliers.py", "w", encoding="utf-8") as f:
    f.write(outliers_py)

print("Written src/outliers.py")

Written src/outliers.py


In [6]:
from textwrap import dedent
acquire_py = dedent("""
from __future__ import annotations
import os
import pathlib
import datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

RAW = pathlib.Path('data/raw')
RAW.mkdir(parents=True, exist_ok=True)
load_dotenv()

def ts() -> str:
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

def save_csv(df: pd.DataFrame, prefix: str, **meta) -> pathlib.Path:
    mid = '_'.join([f"{k}-{v}" for k, v in meta.items()])
    path = RAW / f"{prefix}_{mid}_{ts()}.csv"
    df.to_csv(path, index=False)
    print("Saved", path)
    return path

def validate(df: pd.DataFrame, required):
    missing = [c for c in required if c not in df.columns]
    return {'missing': missing, 'shape': df.shape, 'na_total': int(df.isna().sum().sum())}

def acquire_api(symbol: str = 'AAPL') -> pd.DataFrame:
    use_alpha = bool(os.getenv('ALPHAVANTAGE_API_KEY'))
    if use_alpha:
        url = 'https://www.alphavantage.co/query'
        params = {'function':'TIME_SERIES_DAILY','symbol':symbol,'outputsize':'full','apikey':os.getenv('ALPHAVANTAGE_API_KEY')}
        r = requests.get(url, params=params, timeout=30)
        r.raise_for_status()
        js = r.json()
        key = [k for k in js if 'Time Series' in k][0]
        df_api = pd.DataFrame(js[key]).T
        df_api.columns = [c.split('. ')[1] for c in df_api.columns]
        df_api = df_api.reset_index().rename(columns={'index':'date'})
        df_api['date'] = pd.to_datetime(df_api['date'])
        for col in ['open','high','low','close','volume']:
            df_api[col] = pd.to_numeric(df_api[col], errors='coerce')
    else:
        import yfinance as yf
        df_api = yf.download(symbol, period='3mo', interval='1d').reset_index()
        df_api = df_api.rename(columns={'Date':'date','Open':'open','High':'high','Low':'low','Close':'close','Volume':'volume'})
    required_cols = ['date','open','high','low','close','volume']
    v = validate(df_api, required_cols)
    print("API Validation Results:", v)
    return df_api

def acquire_scrape(url: str = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') -> pd.DataFrame:
    headers = {'User-Agent':'AFE-Project/1.0'}
    try:
        resp = requests.get(url, headers=headers, timeout=30)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'html.parser')
        rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
        header, *data = [r for r in rows if r]
        df_scrape = pd.DataFrame(data, columns=header)
    except Exception as e:
        print('Scrape failed, using inline demo table:', e)
        html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
        soup = BeautifulSoup(html, 'html.parser')
        rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
        header, *data = [r for r in rows if r]
        df_scrape = pd.DataFrame(data, columns=header)
    if 'Price' in df_scrape.columns:
        df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')
    required_cols = ['Symbol','Security']
    v = {'missing':[c for c in required_cols if c not in df_scrape.columns], 'shape': df_scrape.shape, 'na_total': int(df_scrape.isna().sum().sum())}
    print("Scrape Validation Results:", v)
    return df_scrape

def main():
    df_api = acquire_api('AAPL')
    _ = save_csv(df_api.sort_values('date'), prefix='api', source='alpha' if bool(os.getenv('ALPHAVANTAGE_API_KEY')) else 'yfinance', symbol='AAPL')

    df_scrape = acquire_scrape()
    _ = save_csv(df_scrape, prefix='scrape', site='wikipedia', table='SP500-List')

if __name__ == "__main__":
    main()
""").lstrip()

with open("scripts/acquire.py", "w", encoding="utf-8") as f:
    f.write(acquire_py)

print("Written scripts/acquire.py")

Written scripts/acquire.py


In [7]:
from textwrap import dedent
preprocess_py = dedent("""
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
from src.outliers import flag_outliers_df, remove_outliers_df, winsorize_df

def ensure_dataset(path: Path) -> pd.DataFrame:
    if path.exists():
        return pd.read_csv(path)
    # Fallback synthetic linear dataset with extremes
    x = np.linspace(0, 10, 200)
    y = 2.2 * x + 1 + np.random.normal(0, 1.2, size=x.size)
    y[10] += 15; y[120] -= 13; y[160] += 18
    df = pd.DataFrame({'x': x, 'y': y})
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return df

def main():
    raw_path = Path("data/raw/outliers_homework.csv")
    df = ensure_dataset(raw_path)

    num_cols = df.select_dtypes(include='number').columns.tolist()
    if not num_cols:
        raise ValueError("No numeric columns found for outlier processing.")
    target_col = 'y' if 'y' in df.columns else num_cols[0]

    df_flagged = flag_outliers_df(df, columns=[target_col], method="iqr", method_params={"k": 1.5})

    df_removed = remove_outliers_df(df_flagged, flag_columns=[f"{target_col}_outlier_iqr"], how="any")
    df_wins = winsorize_df(df_flagged, columns=[target_col], lower=0.05, upper=0.95)

    Path("data/interim").mkdir(parents=True, exist_ok=True)
    Path("data/processed").mkdir(parents=True, exist_ok=True)

    df_flagged.to_csv("data/interim/with_outlier_flags.csv", index=False)
    df_removed.to_csv("data/processed/removed_outliers.csv", index=False)
    df_wins.to_csv("data/processed/winsorized.csv", index=False)

    print("Wrote data/interim/with_outlier_flags.csv")
    print("Wrote data/processed/removed_outliers.csv")
    print("Wrote data/processed/winsorized.csv")

if __name__ == "__main__":
    main()
""").lstrip()

with open("scripts/preprocess.py", "w", encoding="utf-8") as f:
    f.write(preprocess_py)

print("Written scripts/preprocess.py")

Written scripts/preprocess.py


In [8]:
from textwrap import dedent
sensitivity_py = dedent("""
from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from src.outliers import flag_outliers_df, winsorize_series

np.random.seed(17)

def ensure_dataset(path: Path) -> pd.DataFrame:
    if path.exists():
        return pd.read_csv(path)
    x = np.linspace(0, 10, 200)
    y = 2.2 * x + 1 + np.random.normal(0, 1.2, size=x.size)
    y[10] += 15; y[120] -= 13; y[160] += 18
    df = pd.DataFrame({'x': x, 'y': y})
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    return df

def main():
    data_path = Path('data/raw/outliers_homework.csv')
    df = ensure_dataset(data_path)
    target_col = 'y' if 'y' in df.columns else df.select_dtypes(include='number').columns[0]

    df = flag_outliers_df(df, columns=[target_col], method="iqr", method_params={"k": 1.5})
    df = flag_outliers_df(df, columns=[target_col], method="zscore", method_params={"threshold": 3.0})

    pct_iqr = df[f'{target_col}_outlier_iqr'].mean() * 100
    pct_z = df[f'{target_col}_outlier_z'].mean() * 100
    print(f"Flagged (%): IQR={pct_iqr:.2f}%, Z={pct_z:.2f}%")

    plt.figure()
    plt.boxplot(df[target_col].dropna())
    plt.title(f'Boxplot: {target_col}')
    plt.tight_layout()
    plt.savefig('data/interim/boxplot.png')
    plt.close()

    plt.figure()
    plt.hist(df[target_col].dropna(), bins=30)
    plt.title(f'Histogram: {target_col}')
    plt.tight_layout()
    plt.savefig('data/interim/hist.png')
    plt.close()

    summ_all = df[target_col].describe()[['mean', '50%', 'std']].rename({'50%': 'median'})
    summ_filtered = df.loc[~df[f'{target_col}_outlier_iqr'], target_col].describe()[['mean', '50%', 'std']].rename({'50%': 'median'})
    w = winsorize_series(df[target_col], lower=0.05, upper=0.95)
    summ_w = w.describe()[['mean', '50%', 'std']].rename({'50%': 'median'})

    comp = pd.concat({'all': summ_all, 'filtered_iqr': summ_filtered, 'winsorized': summ_w}, axis=1)
    comp.to_csv('data/interim/sensitivity_summary.csv')
    print("Wrote data/interim/sensitivity_summary.csv")

    if 'x' in df.columns:
        X_all = df[['x']].to_numpy(); y_all = df[target_col].to_numpy()
        X_flt = df.loc[~df[f'{target_col}_outlier_iqr'], ['x']].to_numpy()
        y_flt = df.loc[~df[f'{target_col}_outlier_iqr'], target_col].to_numpy()

        model_all = LinearRegression().fit(X_all, y_all)
        model_flt = LinearRegression().fit(X_flt, y_flt)

        mae_all = mean_absolute_error(y_all, model_all.predict(X_all))
        mae_flt = mean_absolute_error(y_flt, model_flt.predict(X_flt))

        results = pd.DataFrame({
            'slope': [model_all.coef_[0], model_flt.coef_[0]],
            'intercept': [model_all.intercept_, model_flt.intercept_],
            'r2': [model_all.score(X_all, y_all), model_flt.score(X_flt, y_flt)],
            'mae': [mae_all, mae_flt]
        }, index=['all', 'filtered_iqr'])
        results.to_csv('data/interim/regression_comparison.csv')
        print("Wrote data/interim/regression_comparison.csv")
    else:
        print("No 'x' column; skipped regression comparison.")

if __name__ == "__main__":
    main()
""").lstrip()

with open("scripts/sensitivity.py", "w", encoding="utf-8") as f:
    f.write(sensitivity_py)

print("Written scripts/sensitivity.py")

Written scripts/sensitivity.py


In [9]:
# Cell 9: Write docs/outliers.md
from textwrap import dedent
outliers_md = dedent("""
# Outlier Handling: Definition, Methods, Assumptions, and Risks  

## Definition  
Observations that significantly deviate from the overall distribution pattern are considered outliers; detection methods include IQR and Z-score.  

## Methods and Thresholds  
- **IQR**: k=1.5, using quartiles and the interquartile range to identify outliers.  
- **Z-score**: Threshold of 3.0, assumes approximate normality, more sensitive to heavy-tailed distributions.  
- **Winsorize**: Trimming at the 5th and 95th percentiles to reduce the impact of extreme values without deleting data.  

## Assumptions  
- Extreme observations are mostly noise or abnormal processes rather than genuine business events.  
- Missing values are excluded from statistical calculations; constant columns are not flagged as outliers (no variance).  

## Sensitivity Analysis Design  
- Compare mean/median/standard deviation between "original data," "IQR outlier removal," and "Winsorize."  
- If an independent variable x exists, perform simple bivariate regression to compare slope/intercept/R²/MAE.  

## Observations and Impact (Update based on actual results)  
- After removing outliers, MAE generally decreases while R² increases; the slope aligns more closely with the main trend (subject to actual output).  

## Risks  
- Risk of mistakenly removing genuine extreme events (e.g., rare but significant peaks).  
- Methods may misjudge in asymmetric or multimodal distributions; Z-score is less robust for heavy-tailed distributions.  
- Over-cleaning reduces sample size and increases model variance.  

## Mitigation Strategies  
- Prioritize Winsorizing or merely flagging key business metrics.  
- Record thresholds, deletion ratios, and changes in performance metrics; conduct business reviews if necessary.  
- Use robust loss functions or robust regression in downstream tasks.  

## Reproducing the Experiment  
- Data: `data/raw/outliers_homework.csv` (if missing, synthetic data will be automatically generated).  
- Scripts: `scripts/preprocess.py`, `scripts/sensitivity.py`.  
- Output: Comparison files and plots in `data/interim` and `data/processed`.
""").lstrip()

with open("docs/outliers.md", "w", encoding="utf-8") as f:
    f.write(outliers_md)

print("Written docs/outliers.md")

Written docs/outliers.md


In [10]:
reqs = """
pandas
numpy
scikit-learn
matplotlib
requests
beautifulsoup4
python-dotenv
yfinance
pyarrow
"""
with open("requirements.txt", "w", encoding="utf-8") as f:
    f.write(reqs.strip() + "\n")
print("Written requirements.txt")

Written requirements.txt


In [11]:
gitignore = """
.env
__pycache__/
.ipynb_checkpoints/
.DS_Store
*.pyc
data/raw/*.csv
data/interim/*
data/processed/*
"""
with open(".gitignore", "w", encoding="utf-8") as f:
    f.write(gitignore.lstrip())
print("Written .gitignore")

Written .gitignore


In [12]:
env_example = "ALPHAVANTAGE_API_KEY=NOL8361L3I5LIPQX\n"
with open(".env.example", "w", encoding="utf-8") as f:
    f.write(env_example)
print("Written .env.example")

Written .env.example


In [13]:
from textwrap import dedent
readme = dedent("""
# Outlier Analysis Project

## Overview
A small end-to-end pipeline integrating data acquisition, storage, preprocessing, and explicit outlier management with sensitivity analysis.

## Structure
- data/{raw,interim,processed}
- src/{config.py,storage.py,cleaning.py,outliers.py}
- scripts/{acquire.py,preprocess.py,sensitivity.py}
- docs/outliers.md
- notebooks/ (optional for visual analysis)

## Quickstart
1) python -m venv .venv && source .venv/bin/activate
2) pip install -r requirements.txt
3) cp .env.example .env  # optionally set ALPHAVANTAGE_API_KEY
4) python scripts/acquire.py  # optional: pull sample API/scrape data
5) python scripts/preprocess.py  # outlier flag/remove/winsorize
6) python scripts/sensitivity.py  # generate summary and plots

## Notes
- If data/raw/outliers_homework.csv doesn't exist, scripts will generate a synthetic dataset.
- Parquet writing gracefully falls back to CSV if engine is unavailable.
""").lstrip()

with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme)

print("Written README.md")

Written README.md
