In [3]:
# First, ensure you have all necessary libraries installed:
# !pip install numpy pandas scipy scikit-learn matplotlib

import os
import math
import warnings
import json
import textwrap
from typing import List, Tuple

import numpy as np
import pandas as pd

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor

import matplotlib
# In Jupyter, you might want to uncomment one of the following for interactive plots.
# For saving "clean plots", 'Agg' is fine as it doesn't try to render interactively.
# %matplotlib inline
# %matplotlib notebook
matplotlib.use("Agg") # Use 'Agg' for non-interactive backend, suitable for saving figures
import matplotlib.pyplot as plt


# -------------------- Utilities --------------------

TORSION_COLS = ["Phi", "Psi", "Omega"]
REQUIRED_COLS = [
    "Organism", "Codon", "AA_from_cDNA", "AA_from_structure",
    "Phi", "Psi", "Omega", "Codon_Frequency", "Relative_Frequency"
]

def safe_mkdir(path: str):
    os.makedirs(path, exist_ok=True)

def save_fig(path: str, tight=True):
    if tight:
        plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()

def clean_angles(df: pd.DataFrame) -> pd.DataFrame:
    """Ensure torsion angles are floats and within plausible ranges."""
    for c in TORSION_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")
        # Omega often ~180 or 0 (+-), Phi/Psi in [-180, 180]. We won't clip, just drop gross outliers.
    # Drop rows with missing torsions
    before = len(df)
    df = df.dropna(subset=TORSION_COLS)
    after = len(df)
    if after < before:
        print(f"[clean] Dropped {before - after} rows with missing torsion angles.")
    return df

def basic_clean(df: pd.DataFrame) -> pd.DataFrame:
    # Required columns
    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    # Normalize text columns
    df["Organism"] = df["Organism"].astype(str).str.strip()
    df["Codon"] = df["Codon"].astype(str).str.strip().str.upper()
    df["AA_from_cDNA"] = df["AA_from_cDNA"].astype(str).str.strip().str.upper()
    df["AA_from_structure"] = df["AA_from_structure"].astype(str).str.strip().str.upper()
    # Frequencies
    for fcol in ["Codon_Frequency", "Relative_Frequency"]:
        df[fcol] = pd.to_numeric(df[fcol], errors="coerce")
    df = df.dropna(subset=["Codon_Frequency", "Relative_Frequency"])
    # Angles
    df = clean_angles(df)
    # Optional: filter rare codons with tiny counts if present (not provided, but can mitigate noise)
    return df

def eta_squared_from_anova(groups: List[np.ndarray]) -> float:
    """
    Compute eta^2 (effect size) from one-way ANOVA inputs.
    eta^2 = SSB / SST
    """
    all_vals = np.concatenate(groups)
    grand_mean = np.mean(all_vals)
    ss_total = np.sum((all_vals - grand_mean) ** 2)
    ss_between = 0.0
    for g in groups:
        if len(g) == 0:
            continue
        ss_between += len(g) * (np.mean(g) - grand_mean) ** 2
    if ss_total <= 0:
        return np.nan
    return ss_between / ss_total

def anova_or_kruskal_by_category(df: pd.DataFrame, cat_col: str, y: str) -> dict:
    """
    For torsion y and categorical predictor cat_col (e.g., Codon or AA_from_cDNA),
    run ANOVA (if assumptions roughly OK) and Kruskal-Wallis as a robust alternative.
    Return statistics including eta^2.
    """
    data = df[[cat_col, y]].dropna()
    groups = [g[y].values for _, g in data.groupby(cat_col)]
    # ANOVA
    try:
        f_stat, p_val = stats.f_oneway(*groups)
    except Exception:
        f_stat, p_val = np.nan, np.nan
    eta2 = eta_squared_from_anova(groups)

    # Kruskal-Wallis (non-parametric)
    try:
        h_stat, p_kw = stats.kruskal(*groups)
    except Exception:
        h_stat, p_kw = np.nan, np.nan

    return {"y": y, "cat": cat_col, "anova_F": f_stat, "anova_p": p_val, "eta2": eta2, "kruskal_H": h_stat, "kruskal_p": p_kw}

def plot_box_by_category(df: pd.DataFrame, cat_col: str, y: str, out_png: str, top_k: int = 30, min_n: int = 20):
    """
    Plot distributions of y across top-k categories by frequency (to keep plots readable).
    """
    data = df[[cat_col, y]].dropna()
    counts = data[cat_col].value_counts()
    keep = counts[counts >= min_n].index.tolist()[:top_k]
    data = data[data[cat_col].isin(keep)]
    plt.figure(figsize=(12, 5))
    data.boxplot(column=y, by=cat_col, grid=False, rot=90)
    plt.title(f"{y} distribution by {cat_col} (top {len(keep)})")
    plt.suptitle("")
    plt.xlabel(cat_col)
    plt.ylabel(y)
    save_fig(out_png)

def feature_pipeline_codon(df: pd.DataFrame, y_col: str) -> Tuple[Pipeline, np.ndarray, np.ndarray, List[str]]:
    """
    Build a pipeline that uses CODON + ORGANISM + USAGE to predict y_col.
    """
    X = df[["Codon", "Organism", "Relative_Frequency"]].copy()
    y = df[y_col].values

    cat_features = ["Codon", "Organism"]
    num_features = ["Relative_Frequency"]

    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
        ("num", "passthrough", num_features)
    ])

    model = RandomForestRegressor(
        n_estimators=350,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )

    pipe = Pipeline([("pre", pre), ("rf", model)])
    pipe.fit(X, y)

    # Get feature names for importance plot
    ohe = pipe.named_steps["pre"].named_transformers_["cat"]
    cat_names = list(ohe.get_feature_names_out(cat_features))
    feat_names = cat_names + num_features

    return pipe, X.values, y, feat_names

def feature_pipeline_aa_baseline(df: pd.DataFrame, y_col: str) -> Tuple[Pipeline, np.ndarray, np.ndarray, List[str]]:
    """
    AA-only baseline (for reference). Uses only the amino acid from cDNA (one-hot).
    """
    X = df[["AA_from_cDNA"]].copy()
    y = df[y_col].values
    pre = ColumnTransformer([("aa", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["AA_from_cDNA"])])
    model = RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1)
    pipe = Pipeline([("pre", pre), ("rf", model)])
    pipe.fit(X, y)
    aa_names = list(pipe.named_steps["pre"].named_transformers_["aa"].get_feature_names_out(["AA_from_cDNA"]))
    return pipe, X.values, y, aa_names

def evaluate_model(pipe: Pipeline, X_df: pd.DataFrame, y: np.ndarray, out_prefix: str) -> dict:
    X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)
    pipe.fit(X_train, y_train)
    y_hat = pipe.predict(X_test)
    r2 = r2_score(y_test, y_hat)
    mae = mean_absolute_error(y_test, y_hat)

    # Parity plot
    plt.figure(figsize=(5, 5))
    plt.scatter(y_test, y_hat, alpha=0.6)
    lims = [min(y_test.min(), y_hat.min()), max(y_test.max(), y_hat.max())]
    plt.plot(lims, lims, color='red', linestyle='--') # Added a red dashed line for clarity
    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(f"Parity: {os.path.basename(out_prefix).replace('_', ' ')}") # Cleaner title
    save_fig(f"{out_prefix}_parity.png")
    return {"r2": r2, "mae": mae}

def rf_feature_importance(pipe: Pipeline, feat_names: List[str], out_png: str, top_k: int = 25):
    rf = pipe.named_steps["rf"]
    importances = rf.feature_importances_
    idx = np.argsort(importances)[::-1][:top_k]
    names = [feat_names[i] for i in idx]
    vals = importances[idx]
    plt.figure(figsize=(8, max(4, int(0.35*len(names)))))
    plt.barh(range(len(names)), vals[::-1])
    plt.yticks(range(len(names)), names[::-1])
    plt.xlabel("Importance")
    plt.title("Random Forest Feature Importance")
    save_fig(out_png)

def compute_mutual_info(df: pd.DataFrame, y_col: str) -> pd.DataFrame:
    """
    Compute MI for (Codon, Organism, Relative_Frequency) against y_col using one-hot for cats.
    """
    X = df[["Codon", "Organism", "Relative_Frequency"]].copy()
    y = df[y_col].values

    cat_features = ["Codon", "Organism"]
    num_features = ["Relative_Frequency"]

    pre = ColumnTransformer([
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
        ("num", "passthrough", num_features)
    ])
    X_enc = pre.fit_transform(X)
    feat_names = list(pre.named_transformers_["cat"].get_feature_names_out(cat_features)) + num_features
    mi = mutual_info_regression(X_enc, y, discrete_features=False, random_state=42)
    return pd.DataFrame({"feature": feat_names, f"MI({y_col})": mi}).sort_values(f"MI({y_col})", ascending=False)

def write_report(out_dir: str, stats_rows: List[dict], perf_rows: List[dict], mi_tables: dict):
    report_path = os.path.join(out_dir, "REPORT_codon_first.md")
    lines = []
    lines.append("# Codon-first structural signal report\n")
    lines.append("This analysis emphasizes **codon + organism + usage** features to predict torsion angles.\n")
    lines.append("Key idea: amino acids are derived from codons (many-to-one), so collapsing to AA loses information.\n")
    lines.append("\n## 1) Group-wise statistics (ANOVA/Kruskal, eta²)\n")
    stats_df = pd.DataFrame(stats_rows)
    lines.append(stats_df.to_markdown(index=False))
    lines.append("\n\n## 2) Predictive performance (Random Forest)\n")
    perf_df = pd.DataFrame(perf_rows)
    lines.append(perf_df.to_markdown(index=False))
    lines.append("\n\n## 3) Mutual Information (top codon features)\n")
    for y_col, mi_df in mi_tables.items():
        lines.append(f"\n### {y_col}\n")
        lines.append(mi_df.head(30).to_markdown(index=False))
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"[report] Wrote {report_path}")

# -------------------- Main Execution Block for Jupyter --------------------

# --- User-defined parameters for Jupyter Notebook ---
csv_path = "new_codon_torsions_freq.csv" # <--- IMPORTANT: Change this to your CSV file path
output_dir = "codon_analysis_results" # <--- IMPORTANT: Change this to your desired output directory
topk_plot_categories = 30
# ----------------------------------------------------

# Create a dummy class to mimic argparse.Namespace
class Args:
    def __init__(self, csv, out, topk_plot):
        self.csv = csv
        self.out = out
        self.topk_plot = topk_plot

# Instantiate the args object
args = Args(csv=csv_path, out=output_dir, topk_plot=topk_plot_categories)

# Now, the rest of the main logic as provided in the original script
safe_mkdir(args.out)

print("[io] Reading CSV...")
# Example: Create a dummy CSV if you don't have one for testing
# (Uncomment and run this cell once if you need dummy data)
# if not os.path.exists(args.csv):
#     print(f"[{args.csv}] not found. Creating dummy data.")
#     dummy_data = {
#         "ID": range(1000),
#         "Organism": np.random.choice(["Ecoli", "Human", "Yeast"], 1000),
#         "Codon_Index": np.random.randint(1, 300, 1000),
#         "Codon": np.random.choice(["ATG", "TTA", "CGA", "GGC", "AAA"], 1000),
#         "AA_from_cDNA": np.random.choice(["M", "L", "R", "G", "K"], 1000),
#         "AA_from_structure": np.random.choice(["M", "L", "R", "G", "K"], 1000),
#         "Residue_Number": np.random.randint(1, 300, 1000),
#         "Phi": np.random.uniform(-180, 180, 1000),
#         "Psi": np.random.uniform(-180, 180, 1000),
#         "Omega": np.random.uniform(-180, 180, 1000),
#         "Codon_Frequency": np.random.uniform(0.01, 0.5, 1000),
#         "Relative_Frequency": np.random.uniform(0.01, 0.1, 1000),
#     }
#     pd.DataFrame(dummy_data).to_csv(args.csv, index=False)
#     print(f"Dummy data saved to {args.csv}")

df = pd.read_csv(args.csv)
print(f"[io] Rows in: {len(df)}")

print("[clean] Cleaning...")
df = basic_clean(df)
print(f"[clean] Rows after clean: {len(df)}")

# Save cleaned CSV
cleaned_csv = os.path.join(args.out, "cleaned.csv")
df.to_csv(cleaned_csv, index=False)
print(f"[io] Saved cleaned CSV to {cleaned_csv}")

# ------------------ 1) Category-level stats ------------------
stats_rows = []
for y in TORSION_COLS:
    for cat in ["AA_from_cDNA", "Codon"]:
        res = anova_or_kruskal_by_category(df, cat, y)
        stats_rows.append(res)
        # Distribution plots
        out_png = os.path.join(args.out, f"box_{y}_by_{cat}.png")
        plot_box_by_category(df, cat, y, out_png, top_k=args.topk_plot, min_n=20)

stats_df = pd.DataFrame(stats_rows)
stats_csv = os.path.join(args.out, "stats_groupwise.csv")
stats_df.to_csv(stats_csv, index=False)
print(f"[stats] Saved groupwise stats -> {stats_csv}")

# ------------------ 2) Predictive models (codon-first + AA baseline) ------------------
perf_rows = []

for y in TORSION_COLS:
    # Codon-first
    print(f"[model] Training CODON-first for {y}...")
    X_codon = df[["Codon", "Organism", "Relative_Frequency"]].copy()
    y_vals = df[y].values
    codon_pipe = Pipeline([
        ("pre", ColumnTransformer([
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["Codon", "Organism"]),
            ("num", "passthrough", ["Relative_Frequency"])
        ])),
        ("rf", RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1))
    ])
    perf_codon = evaluate_model(codon_pipe, X_codon, y_vals, os.path.join(args.out, f"codon_{y}"))
    # Feature importance
    # We need to refit the full pipeline to get the feature names correctly for importance plotting
    codon_pipe.fit(X_codon, y_vals)
    ohe = codon_pipe.named_steps["pre"].named_transformers_["cat"]
    feat_names_codon = list(ohe.get_feature_names_out(["Codon", "Organism"])) + ["Relative_Frequency"]
    rf_feature_importance(codon_pipe, feat_names_codon, os.path.join(args.out, f"fi_codon_{y}.png"))

    perf_rows.append({"target": y, "model": "CODON+Organism+Usage", **perf_codon})

    # AA-only baseline
    print(f"[model] Training AA-only baseline for {y}...")
    X_aa = df[["AA_from_cDNA"]].copy()
    aa_pipe = Pipeline([
        ("pre", ColumnTransformer([("aa", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["AA_from_cDNA"])])),
        ("rf", RandomForestRegressor(n_estimators=350, random_state=42, n_jobs=-1))
    ])
    perf_aa = evaluate_model(aa_pipe, X_aa, y_vals, os.path.join(args.out, f"aa_{y}"))
    perf_rows.append({"target": y, "model": "AA-only", **perf_aa})

perf_df = pd.DataFrame(perf_rows)
perf_csv = os.path.join(args.out, "model_performance.csv")
perf_df.to_csv(perf_csv, index=False)
print(f"[model] Saved model performance -> {perf_csv}")

# ------------------ 3) Mutual Information for codon features ------------------
mi_tables = {}
for y in TORSION_COLS:
    mi_df = compute_mutual_info(df, y)
    mi_tables[y] = mi_df
    mi_df.to_csv(os.path.join(args.out, f"mi_codon_features_{y}.csv"), index=False)

# ------------------ 4) Report ------------------
write_report(args.out, stats_rows, perf_rows, mi_tables)

# ------------------ 5) Bonus plots ------------------
# Relative_Frequency vs torsions (scatter + simple trend line via LOWESS omitted; just scatter)
for y in TORSION_COLS:
    plt.figure(figsize=(6,4))
    plt.scatter(df["Relative_Frequency"].values, df[y].values, alpha=0.4)
    plt.xlabel("Relative_Frequency")
    plt.ylabel(y)
    plt.title(f"{y} vs Relative_Frequency (codon usage)")
    save_fig(os.path.join(args.out, f"scatter_relfreq_{y}.png"))

print("[done] All outputs saved. Review REPORT_codon_first.md for a summary.")

[io] Reading CSV...
[io] Rows in: 6684
[clean] Cleaning...
[clean] Dropped 49 rows with missing torsion angles.
[clean] Rows after clean: 6632
[io] Saved cleaned CSV to codon_analysis_results/cleaned.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = pd.to_numeric(df[c], errors="coerce")


[stats] Saved groupwise stats -> codon_analysis_results/stats_groupwise.csv
[model] Training CODON-first for Phi...
[model] Training AA-only baseline for Phi...
[model] Training CODON-first for Psi...
[model] Training AA-only baseline for Psi...
[model] Training CODON-first for Omega...
[model] Training AA-only baseline for Omega...
[model] Saved model performance -> codon_analysis_results/model_performance.csv


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.