In [4]:
# ===============================================
# 📊 BITCOIN SENTIMENT VS TRADER PERFORMANCE
# Web3 Trading Team - Data Science Assignment
# Assignment Submitter: Khushpreet
# ===============================================

# --- STEP 1: Mount Google Drive (if needed)
from google.colab import drive
# Note: Uncomment the line below if you are loading private files from your mounted Drive
# drive.mount('/content/drive')

# --- STEP 2: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os # Added for directory creation
import nbformat # Added for notebook generation
!pip install reportlab
from reportlab.lib.pagesizes import A4 # Added for PDF generation
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer # Added for PDF generation
from reportlab.lib.styles import getSampleStyleSheet # Added for PDF generation

# --- STEP 3: Create Output Directory Structure (CRITICAL for compliance)
# Layout: <ROOT_DIR>/outputs holds the figures, <ROOT_DIR>/csv_files holds the tables.
ROOT_DIR = 'ds_Khushpreet'
OUTPUTS_DIR = os.path.join(ROOT_DIR, 'outputs')
CSV_DIR = os.path.join(ROOT_DIR, 'csv_files')

# makedirs also creates ROOT_DIR itself; exist_ok keeps re-runs of the cell harmless.
for required_dir in (OUTPUTS_DIR, CSV_DIR):
    os.makedirs(required_dir, exist_ok=True)
print(f"✅ Created root directory: {ROOT_DIR} and subdirectories.")

# --- STEP 4: Load datasets
# Using direct Google Drive 'uc' links for public access
TRADES_URL = "https://drive.google.com/uc?id=1IAfLZwu6rJzyWKgBToqwSmmVYU6VbjVs"     # Historical Trader Data from Hyperliquid
SENTIMENT_URL = "https://drive.google.com/uc?id=1PgQC0tO8XN-wqkNyghWc_-mnrYv_nhSf"  # Fear & Greed Index

try:
    trades = pd.read_csv(TRADES_URL)
    sent = pd.read_csv(SENTIMENT_URL)
except Exception as e:
    # Every later step needs both frames, so report the problem and stop here
    # instead of continuing into a confusing NameError on `trades`/`sent`.
    print(f"❌ Error loading files: {e}")
    raise
else:
    print("✅ Files loaded successfully!")
    print("\nTrades shape:", trades.shape)
    print("Sentiment shape:", sent.shape)

# --- STEP 5: Preprocess and Clean Data
# Convert Timestamps (values that cannot be parsed become NaT and are dropped below)
# NOTE(review): unit='ms' assumes 'Timestamp' holds full-precision epoch milliseconds.
# The merged preview shows very few distinct dates for ~211k trades — TODO confirm
# this column is the right time source.
trades['Timestamp'] = pd.to_datetime(trades['Timestamp'], unit='ms', errors='coerce')
trades['Date'] = trades['Timestamp'].dt.date

# Convert Sentiment Dates
sent.rename(columns={'date': 'datetime_str', 'classification': 'classification_mode'}, inplace=True)
sent['datetime'] = pd.to_datetime(sent['datetime_str'], errors='coerce')
sent['Date'] = sent['datetime'].dt.date

# Clean numeric columns (convert to numeric, handle errors by coercing to NaN)
trades['Closed PnL'] = pd.to_numeric(trades['Closed PnL'], errors='coerce')

# 'leverage' and 'size' are optional: when a column is absent it is filled with 0 so
# the later aggregations still run. Say so loudly, because every metric derived from
# a defaulted column (volume, leverage, their correlations) is then a placeholder.
for optional_col in ('leverage', 'size'):
    if optional_col in trades.columns:
        trades[optional_col] = pd.to_numeric(trades[optional_col], errors='coerce')
    else:
        print(f"⚠️ Column '{optional_col}' not found in trades data; defaulting to 0. "
              f"Metrics derived from '{optional_col}' will not be meaningful.")
        trades[optional_col] = 0

trades.dropna(subset=['Date', 'Closed PnL', 'leverage', 'size'], inplace=True)  # Drop rows with NaN in key columns
sent.dropna(subset=['Date', 'value'], inplace=True)

# --- STEP 6: Aggregate Sentiment by day
def _modal_label(labels):
    """Return the first mode of *labels*, or NaN when no mode exists."""
    modes = labels.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


# One row per Date: mean index value plus the most frequent classification label.
sent_daily = sent.groupby('Date', as_index=False).agg(
    avg_sentiment=('value', 'mean'),
    classification=('classification_mode', _modal_label),
)

# --- STEP 7: Aggregate Trader Performance by day (incorporating PnL, Risk, Volume, Leverage)
# Output column -> (source column, reducer). Order here is the column order of perf_daily.
daily_metric_spec = {
    'avg_pnl': ('Closed PnL', 'mean'),        # Profitability
    'total_volume': ('size', 'sum'),          # Volume: sum of trade 'size'
    'total_trades': ('Closed PnL', 'count'),  # Activity
    'avg_leverage': ('leverage', 'mean'),     # Leverage/Risk
    'pnl_risk': ('Closed PnL', 'std'),        # Risk proxy: PnL standard deviation
}
perf_daily = trades.groupby('Date', as_index=False).agg(**daily_metric_spec)

# Persist both daily tables into the required CSV folder
for frame, file_name in ((perf_daily, 'daily_performance.csv'),
                         (sent_daily, 'daily_sentiment.csv')):
    frame.to_csv(os.path.join(CSV_DIR, file_name), index=False)
print(f"✅ Aggregated data saved to {CSV_DIR}")

# --- STEP 8: Merge both datasets
# Inner join: only dates present in BOTH the performance and sentiment tables survive.
merged = perf_daily.merge(sent_daily, how='inner', on='Date')
print("\nMerged dataset preview:")
print(merged.head())

# Persist the final merged dataset alongside the per-source daily tables
merged_csv_path = os.path.join(CSV_DIR, 'merged_sentiment_performance.csv')
merged.to_csv(merged_csv_path, index=False)

# --- STEP 9: Analyze Relationships (Correlation)
# Pairwise correlation between daily sentiment and the daily trader metrics
# (profitability, volume, leverage, PnL dispersion).
correlation_columns = ['avg_sentiment', 'avg_pnl', 'total_volume', 'avg_leverage', 'pnl_risk']
correlation_matrix = merged.loc[:, correlation_columns].corr()

print("\n📈 Correlation Matrix (Sentiment vs Key Metrics):")
print(correlation_matrix)

# --- STEP 10: Visualize Trends (Visualization 1: Time Series)
# Dual-axis chart: daily average sentiment on the left axis, daily average PnL on the right.
fig, ax1 = plt.subplots(figsize=(14, 7))

# legend=False: with `label=` set, seaborn would draw its own legend on each axes,
# leaving a stray duplicate next to the combined legend built below.
sns.lineplot(data=merged, x='Date', y='avg_sentiment', color='orange',
             label='Avg Sentiment (Fear–Greed Index)', legend=False, ax=ax1)
ax1.set_ylabel('Avg Sentiment (Fear–Greed Index)', color='orange')
ax1.tick_params(axis='y', labelcolor='orange')

ax2 = ax1.twinx()
sns.lineplot(data=merged, x='Date', y='avg_pnl', color='blue',
             label='Avg PnL', legend=False, ax=ax2)
ax2.set_ylabel('Avg PnL', color='blue')
ax2.tick_params(axis='y', labelcolor='blue')

# Manually merge the legends of both axes into a single box
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc='upper left')

# Title, x-label and grid go on the primary axes: the twin's x-axis is hidden,
# so anything set there on the x-axis would not be shown.
ax1.set_title('Time-Series: Market Sentiment vs Trader Profitability')
ax1.set_xlabel('Date')
ax1.grid(True, linestyle='--', alpha=0.6)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUTS_DIR, 'ts_sentiment_vs_pnl.png'))  # Save image
plt.close(fig)  # Close plot to free memory
print(f"✅ Time-Series plot saved to {OUTPUTS_DIR}/ts_sentiment_vs_pnl.png")

# --- STEP 11: Visualize Sentiment Comparison (Visualization 2: Bar Plot)
# Mean of each trader metric per daily sentiment classification, reshaped to long
# form so every (metric, classification) pair becomes one bar.
# NOTE(review): the three metrics share one y-axis; if their magnitudes differ a lot
# the smaller ones will be hard to read — consider separate panels.
compared_metrics = ['avg_pnl', 'total_volume', 'avg_leverage']
sentiment_comparison = merged.groupby('classification', as_index=False)[compared_metrics].mean()
sentiment_comparison_melted = pd.melt(
    sentiment_comparison,
    id_vars='classification',
    value_vars=compared_metrics,
    var_name='Metric',
    value_name='Average Value',
)

plt.figure(figsize=(10, 5))
bar_ax = sns.barplot(data=sentiment_comparison_melted, x='Metric', y='Average Value', hue='classification')
bar_ax.set_title('Trader Behavior Metrics by Market Sentiment (Fear vs Greed)')
bar_ax.set_ylabel('Average Value')
bar_ax.set_xlabel('Trader Metric')
bar_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

bar_plot_path = os.path.join(OUTPUTS_DIR, 'bar_metrics_by_sentiment.png')
plt.savefig(bar_plot_path)  # Save image
plt.close()  # Close plot
print(f"✅ Bar plot saved to {OUTPUTS_DIR}/bar_metrics_by_sentiment.png")

# --- STEP 12: Key Insight Snapshot
# Builds the three headline insights once, prints them, and keeps them in `insights`
# so the PDF report below can embed the same text.
from xml.sax.saxutils import escape  # stdlib; escapes &, <, > for ReportLab Paragraph markup

print("\n--- Key Insights Snapshot ---")
if merged.empty:
    # idxmax()/idxmin() raise on an empty frame; nothing to report in that case.
    insight_lines = []
    print("⚠️ Merged dataset is empty; no insights available.")
else:
    best_pnl_day = merged.loc[merged['avg_pnl'].idxmax()]
    worst_pnl_day = merged.loc[merged['avg_pnl'].idxmin()]
    best_volume_day = merged.loc[merged['total_volume'].idxmax()]

    insight_lines = [
        f"Day with Highest Avg PnL: {best_pnl_day['Date']} | Avg PnL: {best_pnl_day['avg_pnl']:.2f} | Sentiment: {best_pnl_day['classification']}",
        f"Day with Lowest Avg PnL: {worst_pnl_day['Date']} | Avg PnL: {worst_pnl_day['avg_pnl']:.2f} | Sentiment: {worst_pnl_day['classification']}",
        f"Day with Highest Trading Volume: {best_volume_day['Date']} | Total Volume: {best_volume_day['total_volume']:.2f} | Sentiment: {best_volume_day['classification']}",
    ]
    # Console output keeps the emoji prefixes; the PDF text below does not.
    for icon, line in zip(("💰", "💀", "🚀"), insight_lines):
        print(f"{icon} {line}")

insights = "\n".join(insight_lines)

# --- STEP 13: Generate PDF Report
# Emoji are left out of the PDF strings: the default ReportLab fonts used by
# getSampleStyleSheet() have no glyphs for them.
styles = getSampleStyleSheet()
body_style = styles["BodyText"]
REPORT_PATH = os.path.join(ROOT_DIR, "ds_report.pdf")
doc = SimpleDocTemplate(REPORT_PATH, pagesize=A4)

story = [
    Paragraph("<b>Bitcoin Market Sentiment &amp; Trader Performance Analysis</b>", styles["Title"]),
    Spacer(1, 12),
    Paragraph(
        "This report explores the relationship between trader performance (PnL, volume) "
        "and market sentiment (Fear/Greed Index) across daily intervals.", body_style),
    Spacer(1, 12),
    Paragraph("<b>Key Insights Snapshot:</b>", styles["Heading2"]),
]

if insights:
    # Escape first, then turn line breaks into markup, so data never injects tags.
    story.append(Paragraph(escape(insights).replace("\n", "<br/>"), body_style))
else:
    story.append(Paragraph("Insights could not be generated due to missing data or previous errors.", body_style))
story.append(Spacer(1, 12))

# `mw_text` (a statistical-test summary) is not produced in this cell; use it only
# if an earlier cell defined it, otherwise state plainly that no test was run.
stat_test_text = globals().get('mw_text')
if stat_test_text:
    story.append(Paragraph(f"<b>Statistical Test:</b> {escape(str(stat_test_text))}", body_style))
    story.append(Spacer(1, 12))
    story.append(Paragraph(
        "These insights suggest that trader profitability and market sentiment are correlated, "
        "with higher PnL often observed during more optimistic (Greed) phases.", body_style))
else:
    story.append(Paragraph("<b>Statistical Test:</b> Statistical test could not be performed.", body_style))

doc.build(story)
print(f"✅ PDF report generated: {REPORT_PATH}")

# =============================================
# 🔹 STEP 14: Notebook + README
# =============================================
# Writes a minimal companion notebook and a README describing the deliverables
# produced by the steps above.
NB1 = os.path.join(ROOT_DIR, "notebook_1.ipynb")
README_PATH = os.path.join(ROOT_DIR, "README.md")

nb = nbformat.v4.new_notebook()
nb['cells'] = [
    nbformat.v4.new_markdown_cell("# Notebook 1 — Sentiment vs Trader Performance\nSubmitter: Khushpreet"),
    nbformat.v4.new_markdown_cell("## Overview\nThis notebook analyzes the link between Fear/Greed sentiment data and trading performance."),
    # The CSV path is relative: the generated notebook must be run from inside ROOT_DIR.
    nbformat.v4.new_code_cell("import pandas as pd\nmerged = pd.read_csv('csv_files/merged_sentiment_performance.csv')\nmerged.head()"),
]
with open(NB1, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

# The file list mirrors what this script actually writes under ROOT_DIR.
readme_text = f"""
# {ROOT_DIR}

### Contents
- `notebook_1.ipynb` — quick analysis notebook
- `csv_files/daily_performance.csv` — daily trader performance aggregates
- `csv_files/daily_sentiment.csv` — daily sentiment aggregates
- `csv_files/merged_sentiment_performance.csv` — combined dataset
- `outputs/ts_sentiment_vs_pnl.png` — time-series visualization (sentiment vs avg PnL)
- `outputs/bar_metrics_by_sentiment.png` — trader metrics by sentiment classification
- `ds_report.pdf` — full PDF report
- `README.md` — this file
"""
# Explicit UTF-8: the text contains non-ASCII characters (em dashes).
with open(README_PATH, "w", encoding="utf-8") as f:
    f.write(readme_text)

print("✅ Notebook and README generated!")
#



✅ Created root directory: ds_Khushpreet and subdirectories.
✅ Files loaded successfully!

Trades shape: (211224, 16)
Sentiment shape: (2644, 4)
✅ Aggregated data saved to ds_Khushpreet/csv_files

Merged dataset preview:
         Date    avg_pnl  total_volume  total_trades  avg_leverage  \
0  2023-03-28   0.000000             0             3           0.0   
1  2023-11-14   0.148807             0          1045           0.0   
2  2024-03-09  25.418772             0          6962           0.0   
3  2024-07-03  22.229713             0          7141           0.0   
4  2024-10-27  90.504272             0         35241           0.0   

      pnl_risk  avg_sentiment classification  
0     0.000000           59.0          Greed  
1   105.092113           69.0          Greed  
2   306.166937           84.0  Extreme Greed  
3   633.704815           50.0        Neutral  
4  1165.052548           74.0          Greed  

📈 Correlation Matrix (Sentiment vs Key Metrics):
               avg_sentimen