In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cryptocurrency Data Processing Pipeline\n",
    "## Step 1: Data Loading and Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from datetime import datetime\n",
    "import warnings\n",
    "\n",
    "# NOTE(review): this silences every warning for the rest of the session;\n",
    "# consider narrowing the filter to specific warning categories.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Display configuration: show every column and render floats with 8 decimals\n",
    "pd.options.display.max_columns = None\n",
    "pd.options.display.float_format = lambda value: '%.8f' % value\n",
    "\n",
    "print(\"Libraries imported successfully!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw cryptocurrency CSV (path is relative to the working directory)\n",
    "df = pd.read_csv('crypto_data.csv')\n",
    "\n",
    "# Quick overview: dimensions and column names, then a preview of the first rows\n",
    "print(\n",
    "    f\"Dataset shape: {df.shape}\",\n",
    "    f\"\\nColumn names: {df.columns.tolist()}\",\n",
    "    \"\\nFirst few rows:\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data info and missing values\n",
    "print(\"Dataset Information:\")\n",
    "# DataFrame.info() writes its report to stdout itself and returns None, so it\n",
    "# is called directly; wrapping it in print() would append a stray \"None\" line.\n",
    "df.info()\n",
    "\n",
    "# Count of null entries per column\n",
    "print(\"\\nMissing Values:\")\n",
    "print(df.isnull().sum())\n",
    "\n",
    "# Summary statistics, shown as the cell's rich output\n",
    "print(\"\\nBasic Statistics:\")\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data cleaning as a single pipeline:\n",
    "#   1. parse the Date column into datetimes\n",
    "#   2. drop rows that lack a Close or Volume value\n",
    "#   3. keep the first row of every (Symbol, Date) pair\n",
    "#   4. order rows by Symbol then Date and renumber the index\n",
    "df = (\n",
    "    df.assign(Date=pd.to_datetime(df['Date']))\n",
    "    .dropna(subset=['Close', 'Volume'])\n",
    "    .drop_duplicates(subset=['Symbol', 'Date'])\n",
    "    .sort_values(['Symbol', 'Date'])\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Report what is left after cleaning\n",
    "print(f\"Cleaned dataset shape: {df.shape}\")\n",
    "print(f\"Date range: {df['Date'].min()} to {df['Date'].max()}\")\n",
    "print(f\"Unique cryptocurrencies: {df['Symbol'].nunique()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create technical indicators and features\n",
    "def add_technical_indicators(df):\n",
    "    \"\"\"Return a copy of ``df`` with per-row price and candlestick features.\n",
    "\n",
    "    Uses the ``Open``, ``High``, ``Low`` and ``Close`` columns to add\n",
    "    ``Price_Change``, ``Price_Change_Pct``, ``Daily_Range``, ``Volatility``,\n",
    "    ``Avg_Price``, ``Body_Size``, ``Upper_Shadow`` and ``Lower_Shadow``.\n",
    "    The frame passed in is not modified.\n",
    "    \"\"\"\n",
    "    open_price, high, low, close = df['Open'], df['High'], df['Low'], df['Close']\n",
    "\n",
    "    price_change = close - open_price\n",
    "    daily_range = high - low\n",
    "\n",
    "    # Edges of the candle body: the larger / smaller of Open and Close per row\n",
    "    body_top = df[['Open', 'Close']].max(axis=1)\n",
    "    body_bottom = df[['Open', 'Close']].min(axis=1)\n",
    "\n",
    "    return df.assign(\n",
    "        Price_Change=price_change,\n",
    "        Price_Change_Pct=(price_change / open_price) * 100,\n",
    "        Daily_Range=daily_range,\n",
    "        # High-Low range expressed as a percentage of the High\n",
    "        Volatility=(daily_range / high) * 100,\n",
    "        # Midpoint between High and Low\n",
    "        Avg_Price=(high + low) / 2,\n",
    "        Body_Size=price_change.abs(),\n",
    "        Upper_Shadow=high - body_top,\n",
    "        Lower_Shadow=body_bottom - low,\n",
    "    )\n",
    "\n",
    "df_processed = add_technical_indicators(df)\n",
    "print(\"Technical indicators added!\")\n",
    "df_processed.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add moving averages and rolling statistics\n",
    "def add_rolling_features(df, windows=(7, 14, 30)):\n",
    "    \"\"\"Return a copy of ``df`` with per-symbol rolling statistics appended.\n",
    "\n",
    "    For every window size ``w`` in ``windows`` three columns are added, each\n",
    "    computed separately within each ``Symbol`` group, following row order:\n",
    "\n",
    "    * ``MA_{w}``          - rolling mean of ``Close``\n",
    "    * ``Volatility_{w}d`` - rolling standard deviation of ``Close``\n",
    "    * ``Volume_MA_{w}``   - rolling mean of ``Volume``\n",
    "\n",
    "    ``min_periods=1`` is used, so the first rows of a group are computed from\n",
    "    the rows seen so far; the standard deviation of a single row is NaN.\n",
    "\n",
    "    Rolling windows follow row order rather than the ``Date`` values, so rows\n",
    "    are expected to be in chronological order within each symbol already.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain ``Symbol``, ``Close`` and ``Volume`` columns.\n",
    "    windows : iterable of int, default (7, 14, 30)\n",
    "        Window lengths, in rows.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A new frame; ``df`` itself is not modified.\n",
    "    \"\"\"\n",
    "    df_copy = df.copy()\n",
    "\n",
    "    # The grouping does not depend on the window size, so build it once\n",
    "    # instead of regrouping three times for every window.\n",
    "    by_symbol = df_copy.groupby('Symbol')\n",
    "    close_by_symbol = by_symbol['Close']\n",
    "    volume_by_symbol = by_symbol['Volume']\n",
    "\n",
    "    for window in windows:\n",
    "        # Moving average of the closing price\n",
    "        df_copy[f'MA_{window}'] = close_by_symbol.transform(\n",
    "            lambda x: x.rolling(window=window, min_periods=1).mean()\n",
    "        )\n",
    "\n",
    "        # Rolling volatility (standard deviation of the closing price)\n",
    "        df_copy[f'Volatility_{window}d'] = close_by_symbol.transform(\n",
    "            lambda x: x.rolling(window=window, min_periods=1).std()\n",
    "        )\n",
    "\n",
    "        # Rolling volume average\n",
    "        df_copy[f'Volume_MA_{window}'] = volume_by_symbol.transform(\n",
    "            lambda x: x.rolling(window=window, min_periods=1).mean()\n",
    "        )\n",
    "\n",
    "    return df_copy\n",
    "\n",
    "df_processed = add_rolling_features(df_processed)\n",
    "print(\"Rolling features added!\")\n",
    "print(f\"Total features: {df_processed.shape[1]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add lag features\n",
    "def add_lag_features(df, lags=(1, 3, 7)):\n",
    "    \"\"\"Return a copy of ``df`` with per-symbol lagged columns appended.\n",
    "\n",
    "    For every lag ``k`` in ``lags`` adds, in this order, ``Close_Lag_{k}``,\n",
    "    ``Volume_Lag_{k}`` and ``Return_Lag_{k}``: the ``Close``, ``Volume`` and\n",
    "    ``Price_Change_Pct`` values shifted by ``k`` rows within the same\n",
    "    ``Symbol`` group. Rows with no value to shift in are NaN.\n",
    "\n",
    "    Shifting follows row order rather than the ``Date`` values, so rows are\n",
    "    expected to be in chronological order within each symbol already.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain ``Symbol``, ``Close``, ``Volume`` and\n",
    "        ``Price_Change_Pct`` columns (the last one is produced by\n",
    "        ``add_technical_indicators``).\n",
    "    lags : iterable of int, default (1, 3, 7)\n",
    "        Row offsets to shift by.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        A new frame; ``df`` itself is not modified.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    KeyError\n",
    "        If one of the required columns is missing.\n",
    "    \"\"\"\n",
    "    df_copy = df.copy()\n",
    "\n",
    "    # The grouping does not depend on the lag, so build it once up front.\n",
    "    # Keys are the output-name prefixes; insertion order fixes column order.\n",
    "    by_symbol = df_copy.groupby('Symbol')\n",
    "    lag_sources = {\n",
    "        'Close': by_symbol['Close'],\n",
    "        'Volume': by_symbol['Volume'],\n",
    "        'Return': by_symbol['Price_Change_Pct'],\n",
    "    }\n",
    "\n",
    "    for lag in lags:\n",
    "        for prefix, grouped_column in lag_sources.items():\n",
    "            df_copy[f'{prefix}_Lag_{lag}'] = grouped_column.shift(lag)\n",
    "\n",
    "    return df_copy\n",
    "\n",
    "df_processed = add_lag_features(df_processed)\n",
    "print(\"Lag features added!\")\n",
    "df_processed.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add time-based features: calendar components read from the Date column\n",
    "# through the .dt accessor (dayofweek numbers Monday as 0)\n",
    "df_processed = df_processed.assign(\n",
    "    Year=lambda frame: frame['Date'].dt.year,\n",
    "    Month=lambda frame: frame['Date'].dt.month,\n",
    "    Day=lambda frame: frame['Date'].dt.day,\n",
    "    DayOfWeek=lambda frame: frame['Date'].dt.dayofweek,\n",
    "    Quarter=lambda frame: frame['Date'].dt.quarter,\n",
    "    DayOfYear=lambda frame: frame['Date'].dt.dayofyear,\n",
    ")\n",
    "\n",
    "print(\"Time-based features added!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Processed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the full feature table to disk (row index omitted)\n",
    "df_processed.to_csv('crypto_data_processed.csv', index=False)\n",
    "print(\"Processed data saved to 'crypto_data_processed.csv'\")\n",
    "\n",
    "# Per-coin summary: one row per Symbol, aggregating the columns listed below\n",
    "coin_stats = (\n",
    "    df_processed\n",
    "    .groupby('Symbol')\n",
    "    .agg({\n",
    "        'Name': 'first',\n",
    "        'Close': ['mean', 'std', 'min', 'max'],\n",
    "        'Volume': ['mean', 'sum'],\n",
    "        'Volatility': 'mean',\n",
    "        'Price_Change_Pct': ['mean', 'std'],\n",
    "        'Date': ['min', 'max', 'count']\n",
    "    })\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Flatten the two-level column header into names like 'Close_mean'; the\n",
    "# 'Symbol' key column has an empty second level, so the dangling '_' is stripped\n",
    "coin_stats.columns = [\n",
    "    f'{column}_{stat}'.strip('_') for column, stat in coin_stats.columns\n",
    "]\n",
    "coin_stats.to_csv('coin_statistics.csv', index=False)\n",
    "print(\"Coin statistics saved to 'coin_statistics.csv'\")\n",
    "\n",
    "print(\n",
    "    f\"\\nFinal processed dataset shape: {df_processed.shape}\",\n",
    "    f\"Total features: {df_processed.shape[1]}\",\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
