In [12]:
# ==============================================
# Data Science Assignment – Web3 Trading Team
# Candidate: Hrutik_Adsare
# ==============================================

# ----------------------------------------------
# 1. Setup / Mount Drive
# ----------------------------------------------
from google.colab import drive
drive.mount('/content/drive')

import os, json, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timezone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score
)

# Tree libraries
import lightgbm as lgb
from lightgbm import early_stopping
import xgboost as xgb
import joblib
import seaborn as sns

# ----------------------------------------------
# 2. Paths & Output Structure
# ----------------------------------------------
CANDIDATE = "Hrutik_Adsare"
BASE_DIR = f"/content/drive/MyDrive/ds_{CANDIDATE}"
os.makedirs(BASE_DIR, exist_ok=True)
NOTEBOOKS_DIR = BASE_DIR
CSV_DIR = os.path.join(BASE_DIR, "csv_files")
OUTPUTS_DIR = os.path.join(BASE_DIR, "outputs")
for d in [CSV_DIR, OUTPUTS_DIR]:
    os.makedirs(d, exist_ok=True)

# ----------------------------------------------
# 3. Load Datasets
# ----------------------------------------------
fg = pd.read_csv("/content/drive/MyDrive/fear_greed_index.csv")
tr = pd.read_csv("/content/drive/MyDrive/historical_data.csv")
print("Fear & Greed:", fg.shape)
print("Trades:", tr.shape)

# ----------------------------------------------
# 4. Clean & Feature Engineering (enhanced)
# ----------------------------------------------
# --- Sentiment ---
fg["date"] = pd.to_datetime(fg["date"], errors="coerce")
fg = fg.dropna(subset=["date"]).copy()
fg.columns = [c.strip().lower() for c in fg.columns]
fg_daily = fg.groupby("date", as_index=False).agg({"value": "last", "classification": "last"})

sentiment_map = {"Extreme Fear": -2, "Fear": -1, "Neutral": 0, "Greed": 1, "Extreme Greed": 2}
fg_daily["sentiment_cat_num"] = fg_daily["classification"].map(sentiment_map).fillna(0)

# create lagged & rolling sentiment features
fg_daily = fg_daily.sort_values("date")
fg_daily["value_lag1"] = fg_daily["value"].shift(1).fillna(method="bfill")
fg_daily["value_3d_mean"] = fg_daily["value"].rolling(3, min_periods=1).mean()
fg_daily["value_7d_mean"] = fg_daily["value"].rolling(7, min_periods=1).mean()

# --- Trades ---
df = tr.copy()
df.columns = [c.strip().replace(" ", "_") for c in df.columns]

# timestamp parse
def to_dt_utc(val):
    try:
        x = float(val)
        if x > 1e12:
            x = x / 1000.0
        return datetime.fromtimestamp(x, tz=timezone.utc)
    except Exception:
        try:
            dt = datetime.strptime(str(val), "%d-%m-%Y %H:%M")
            return (dt - pd.Timedelta(hours=5, minutes=30)).replace(tzinfo=timezone.utc)
        except Exception:
            return pd.NaT

ts_candidates = [c for c in df.columns if "timestamp" in c.lower()]
ts_col = ts_candidates[0] if ts_candidates else None
if ts_col is None:
    raise RuntimeError("No timestamp column detected in trades file.")
df["_dt_utc"] = df[ts_col].apply(to_dt_utc)
df = df.dropna(subset=["_dt_utc"]).copy()
df["_date"] = pd.to_datetime(df["_dt_utc"].dt.date)

# numeric conversions
for c in ["Execution_Price", "Size_Tokens", "Size_USD", "Closed_PnL", "Fee", "Start_Position"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# direction
if "Side" in df.columns:
    df["is_buy"] = df["Side"].astype(str).str.upper().eq("BUY").astype(int)
else:
    df["is_buy"] = 0

# per-coin z-score features
for c in ["Execution_Price", "Size_USD"]:
    if c in df.columns:
        grp = df.groupby("Coin")[c]
        df[f"{c}_z"] = (df[c] - grp.transform("mean")) / grp.transform("std").replace(0, np.nan)
        df[f"{c}_z"] = df[f"{c}_z"].fillna(0)
df["log_size_usd"] = np.log1p(df["Size_USD"].clip(lower=0))

# rolling account stats
df = df.sort_values(["Account", "_dt_utc"]).copy()
def rolling_stats(g, window=50):
    pnl = g["Closed_PnL"].fillna(0)
    g["roll50_winrate"] = (pnl > 0).rolling(window, min_periods=5).mean()
    g["roll50_avg_size_usd"] = g["Size_USD"].rolling(window, min_periods=5).mean()
    g["roll50_buy_ratio"] = g["is_buy"].rolling(window, min_periods=5).mean()
    g["roll50_pnl_std"] = pnl.rolling(window, min_periods=5).std().fillna(0)
    g["roll50_pnl_mean"] = pnl.rolling(window, min_periods=5).mean().fillna(0)
    g["inter_trade_sec"] = g["_dt_utc"].diff().dt.total_seconds().fillna(0)
    g["roll50_intertrade_mean"] = g["inter_trade_sec"].rolling(window, min_periods=5).mean().fillna(0)
    return g

df = df.groupby("Account", group_keys=False).apply(rolling_stats)

# merge trades with sentiment
feat = df.merge(fg_daily.rename(columns={"date": "_date"}), on="_date", how="left")
for c in ["value", "classification", "sentiment_cat_num", "value_lag1", "value_3d_mean", "value_7d_mean"]:
    if c in feat.columns:
        feat[c] = feat[c].ffill().bfill()

# interaction features
feat["sentiment_x_size"] = feat["sentiment_cat_num"] * feat["log_size_usd"]
feat["sentiment_x_rollwin"] = feat["sentiment_cat_num"] * feat["roll50_winrate"]

# target
feat["target_profit"] = (feat["Closed_PnL"] > 0).astype(int)

# feature set
feature_cols = [c for c in [
    "Execution_Price_z", "Size_USD_z", "log_size_usd",
    "roll50_winrate", "roll50_avg_size_usd", "roll50_buy_ratio",
    "roll50_pnl_std", "roll50_pnl_mean", "roll50_intertrade_mean",
    "is_buy", "value", "sentiment_cat_num", "value_lag1", "value_3d_mean", "value_7d_mean",
    "sentiment_x_size", "sentiment_x_rollwin"
] if c in feat.columns]

dataset = feat[["_dt_utc", "_date", "Account", "Coin", "Closed_PnL", "target_profit"] + feature_cols].dropna().copy()
print("Engineered dataset:", dataset.shape)
dataset.to_csv(os.path.join(CSV_DIR, "engineered_dataset.csv"), index=False)

# ----------------------------------------------
# 5. Exploratory visuals
# ----------------------------------------------
plt.figure(figsize=(10,4))
fg_daily.set_index("date")["value"].plot(title="Fear & Greed Index Over Time")
plt.ylabel("Sentiment Value")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "sentiment_time.png"))
plt.close()

plt.figure(figsize=(8,4))
sns.histplot(dataset["Closed_PnL"], bins=120)
plt.title("Closed PnL Distribution")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "pnl_distribution.png"))
plt.close()

agg = dataset.groupby("_date").agg({"log_size_usd": "mean", "value": "mean"}).reset_index()
plt.figure(figsize=(6,4))
plt.scatter(agg["value"], agg["log_size_usd"], alpha=0.5, s=8)
plt.xlabel("Sentiment Value")
plt.ylabel("Avg log(Size USD)")
plt.title("Avg Trade Size vs Sentiment")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "trade_size_vs_sentiment.png"))
plt.close()

# ----------------------------------------------
# 6. Time-based split
# ----------------------------------------------
dataset = dataset.sort_values("_dt_utc")
cut = int(len(dataset) * 0.8)
train = dataset.iloc[:cut].copy()
valid = dataset.iloc[cut:].copy()

X_train = train[feature_cols].copy()
y_train = train["target_profit"]
X_valid = valid[feature_cols].copy()
y_valid = valid["target_profit"]

print("Train / Valid sizes:", X_train.shape, X_valid.shape)

# ----------------------------------------------
# 7. Modeling
# ----------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

# LightGBM
lgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "class_weight": "balanced"
}
lgbm = lgb.LGBMClassifier(**lgb_params)
lgbm.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="auc",
    callbacks=[early_stopping(50, verbose=False)]
)

# XGBoost (<2.0 syntax: early_stopping_rounds)
xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "use_label_encoder": False,
    "eval_metric": "auc",
    "n_jobs": -1,
    "random_state": 42,
    "tree_method": "hist",
}
xgbc = xgb.XGBClassifier(**xgb_params)
xgbc.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    #callbacks=[xgb.callback.EarlyStopping(rounds=50, save_best=True)], # Removed callbacks
    verbose=False
)

# RandomForest
rf = RandomForestClassifier(n_estimators=400, class_weight="balanced", random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Logistic Regression
logit = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=500, class_weight="balanced"))])
logit.fit(X_train, y_train)

# ----------------------------------------------
# 8. Evaluate & choose best
# ----------------------------------------------
models = {"lightgbm": lgbm, "xgboost": xgbc, "random_forest": rf, "logistic": logit}

def get_proba(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    return model.decision_function(X)

results = {}
for name, model in models.items():
    proba = get_proba(model, X_valid)
    thresholds = np.linspace(0.01, 0.99, 99)
    best_acc, best_t = -1, 0.5
    for t in thresholds:
        preds = (proba >= t).astype(int)
        acc = accuracy_score(y_valid, preds)
        if acc > best_acc:
            best_acc, best_t = acc, t
    preds_best = (proba >= best_t).astype(int)
    res = {
        "best_threshold": float(best_t),
        "accuracy": float(accuracy_score(y_valid, preds_best)),
        "f1": float(f1_score(y_valid, preds_best)),
        "precision": float(precision_score(y_valid, preds_best)),
        "recall": float(recall_score(y_valid, preds_best)),
        "roc_auc": float(roc_auc_score(y_valid, proba))
    }
    results[name] = res

metrics_df = pd.DataFrame.from_dict(results, orient="index").reset_index().rename(columns={"index": "model"})
metrics_df.to_csv(os.path.join(CSV_DIR, "metrics.csv"), index=False)
print(metrics_df)

# ----------------------------------------------
# 9. Save best model
# ----------------------------------------------
best_model_name = metrics_df.sort_values("accuracy", ascending=False).iloc[0]["model"]
best_model = models[best_model_name]
best_threshold = results[best_model_name]["best_threshold"]
print("Best model:", best_model_name, "threshold:", best_threshold)

joblib.dump(best_model, os.path.join(CSV_DIR, f"model_best_{best_model_name}.pkl"))
joblib.dump(scaler, os.path.join(CSV_DIR, "scaler.pkl"))
with open(os.path.join(CSV_DIR, "best_info.json"), "w") as f:
    json.dump({"best_model": best_model_name, "best_threshold": best_threshold, "metrics": results[best_model_name]}, f, indent=2)

# ----------------------------------------------
# 10. Visuals for the best model
# ----------------------------------------------
best_proba = get_proba(best_model, X_valid)

# ROC
fpr, tpr, _ = roc_curve(y_valid, best_proba)
auc_score = roc_auc_score(y_valid, best_proba)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"ROC (AUC={auc_score:.3f})")
plt.plot([0, 1], [0, 1], "--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Validation ROC Curve")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "roc_curve.png"))
plt.close()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_valid, best_proba)
ap_score = average_precision_score(y_valid, best_proba)
plt.figure(figsize=(6,5))
plt.plot(recall, precision, label=f"PR (AP={ap_score:.3f})")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "precision_recall_curve.png"))
plt.close()

# Confusion matrix
best_preds = (best_proba >= best_threshold).astype(int)
cm = confusion_matrix(y_valid, best_preds)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Loss","Profit"], yticklabels=["Loss","Profit"])
plt.title(f"Confusion Matrix ({best_model_name})")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUTS_DIR, "confusion_matrix.png"))
plt.close()

# Feature importance
def save_feature_importance(model, name):
    if hasattr(model, "feature_importances_"):
        imp = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
        imp.to_csv(os.path.join(CSV_DIR, f"feature_importance_{name}.csv"))
        plt.figure(figsize=(6,6))
        imp.head(20).iloc[::-1].plot(kind="barh")
        plt.title(f"Feature Importance ({name})")
        plt.tight_layout()
        plt.savefig(os.path.join(OUTPUTS_DIR, f"feature_importance_{name}.png"))
        plt.close()

for n in ["lightgbm", "xgboost", "random_forest"]:
    save_feature_importance(models[n], n)

# Classification report
report = classification_report(y_valid, best_preds, output_dict=True)
pd.DataFrame(report).T.to_csv(os.path.join(CSV_DIR, "classification_report_best.csv"))

# ----------------------------------------------
# 11. Final save & summary
# ----------------------------------------------
print("Saved artifacts to:", BASE_DIR)
print("Files in csv_files:", os.listdir(CSV_DIR)[:20])
print("Files in outputs:", os.listdir(OUTPUTS_DIR)[:20])
print("Best model:", best_model_name, "accuracy:", results[best_model_name]["accuracy"])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Fear & Greed: (2644, 4)
Trades: (211224, 16)
Engineered dataset: (211096, 23)
Train / Valid sizes: (168876, 17) (42220, 17)
[LightGBM] [Info] Number of positive: 71139, number of negative: 97737
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006956 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 168876, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
           model  best_threshold  accuracy        f1  precision    recall  \
0       lightgbm            0.54  0.891639  0.852014   0.866049  0.838426   
1        xgboost            0.59  0.