# Cryptocurrency Data Processing Pipeline
## Processing Combined Multi-Crypto Dataset

In [5]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.8f' % x)

print("✅ Libraries imported successfully!")
print(f"Current date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✅ Libraries imported successfully!
Current date: 2025-11-14 17:27:56


## 1. Load Combined Dataset

In [6]:
# Load the combined cryptocurrency data
df = pd.read_csv('crypto_data_combined.csv')

print("=" * 70)
print("DATASET LOADED")
print("=" * 70)
print(f"Dataset shape: {df.shape}")
print(f"Total records: {len(df):,}")
print(f"Unique cryptocurrencies: {df['Symbol'].nunique()}")
print(f"\nColumn names: {df.columns.tolist()}")
print("=" * 70)

print("\nFirst few rows:")
df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: 'crypto_data_combined.csv'

In [3]:
# Data info and missing values
print("\nDataset Information:")
print(df.info())

print("\n" + "=" * 70)
print("MISSING VALUES CHECK")
print("=" * 70)
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("✅ No missing values found!")

print("\n" + "=" * 70)
print("CRYPTOCURRENCY DISTRIBUTION")
print("=" * 70)
print(df['Symbol'].value_counts())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52287 entries, 0 to 52286
Columns: 130 entries, SNo to Volume.24
dtypes: float64(2), object(128)
memory usage: 51.9+ MB
None

MISSING VALUES CHECK
SNo              1
Name             1
Symbol           1
Date             1
High         50142
             ...  
High.24      50142
Low.24       50142
Open.24      50142
Close.24     50142
Volume.24    50142
Length: 130, dtype: int64

CRYPTOCURRENCY DISTRIBUTION
Symbol
ADA      2144
DOGE     2144
ALGO     2144
ATOM     2144
BNB      2144
BTC      2144
ETH      2144
ETC      2144
EOS      2144
XRP      2144
XMR      2144
VET      2144
FIL      2144
LINK     2144
LTC      2144
TRX      2144
THETA    2144
XTZ      2144
XLM      2144
SOL      2044
UNI      1934
DOT      1912
MATIC    1910
AVAX     1881
AAVE     1869
Name: count, dtype: int64


## 2. Data Cleaning

In [4]:
# Convert Date column to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Handle missing values (if any)
initial_rows = len(df)
df = df.dropna(subset=['Close', 'Volume', 'Open', 'High', 'Low'])
dropped_rows = initial_rows - len(df)

# Remove duplicates
initial_rows = len(df)
df = df.drop_duplicates(subset=['Symbol', 'Date'])
duplicate_rows = initial_rows - len(df)

# Sort by Symbol and Date
df = df.sort_values(['Symbol', 'Date']).reset_index(drop=True)

# Remove any rows with zero or negative prices
df = df[(df['Close'] > 0) & (df['Open'] > 0) & (df['High'] > 0) & (df['Low'] > 0)]

print("=" * 70)
print("DATA CLEANING SUMMARY")
print("=" * 70)
print(f"Rows dropped (missing values): {dropped_rows}")
print(f"Duplicate rows removed: {duplicate_rows}")
print(f"Cleaned dataset shape: {df.shape}")
print(f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"Total trading days: {df['Date'].nunique()}")
print(f"Cryptocurrencies: {df['Symbol'].nunique()}")
print("=" * 70)

TypeError: '>' not supported between instances of 'str' and 'int'

## 3. Feature Engineering - Technical Indicators

In [17]:
# Create technical indicators and features
def add_technical_indicators(df):
    """
    Add technical indicators to the dataframe
    """
    df_copy = df.copy()
    
    # Price features
    df_copy['Price_Change'] = df_copy['Close'] - df_copy['Open']
    df_copy['Price_Change_Pct'] = (df_copy['Price_Change'] / df_copy['Open']) * 100
    df_copy['Daily_Range'] = df_copy['High'] - df_copy['Low']
    df_copy['Volatility'] = (df_copy['Daily_Range'] / df_copy['High']) * 100
    
    # Average price
    df_copy['Avg_Price'] = (df_copy['High'] + df_copy['Low'] + df_copy['Close']) / 3
    
    # Typical Price (HLC/3)
    df_copy['Typical_Price'] = (df_copy['High'] + df_copy['Low'] + df_copy['Close']) / 3
    
    # Body size (candle)
    df_copy['Body_Size'] = abs(df_copy['Close'] - df_copy['Open'])
    df_copy['Body_Size_Pct'] = (df_copy['Body_Size'] / df_copy['Open']) * 100
    
    # Upper and Lower shadows
    df_copy['Upper_Shadow'] = df_copy['High'] - df_copy[['Open', 'Close']].max(axis=1)
    df_copy['Lower_Shadow'] = df_copy[['Open', 'Close']].min(axis=1) - df_copy['Low']
    
    # Bullish/Bearish indicator
    df_copy['Is_Bullish'] = (df_copy['Close'] > df_copy['Open']).astype(int)
    
    return df_copy

print("Adding technical indicators...")
df_processed = add_technical_indicators(df)
print(f"✅ Technical indicators added! New shape: {df_processed.shape}")
print(f"New columns: {df_processed.shape[1] - df.shape[1]}")

df_processed.head()

Adding technical indicators...


TypeError: unsupported operand type(s) for -: 'str' and 'str'

## 4. Feature Engineering - Moving Averages

In [None]:
# Add moving averages and rolling statistics
def add_rolling_features(df, windows=[7, 14, 21, 30]):
    """
    Add moving averages and rolling statistics
    Calculated per cryptocurrency (grouped by Symbol)
    """
    df_copy = df.copy()
    
    print(f"Calculating rolling features for windows: {windows}")
    
    for window in windows:
        print(f"  - Processing {window}-day window...")
        
        # Moving averages (SMA)
        df_copy[f'SMA_{window}'] = df_copy.groupby('Symbol')['Close'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Exponential Moving Average (EMA)
        df_copy[f'EMA_{window}'] = df_copy.groupby('Symbol')['Close'].transform(
            lambda x: x.ewm(span=window, adjust=False).mean()
        )
        
        # Rolling volatility (standard deviation)
        df_copy[f'Rolling_Std_{window}'] = df_copy.groupby('Symbol')['Close'].transform(
            lambda x: x.rolling(window=window, min_periods=1).std()
        )
        
        # Rolling volume average
        df_copy[f'Volume_MA_{window}'] = df_copy.groupby('Symbol')['Volume'].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )
        
        # Rolling min and max
        df_copy[f'Rolling_Max_{window}'] = df_copy.groupby('Symbol')['High'].transform(
            lambda x: x.rolling(window=window, min_periods=1).max()
        )
        df_copy[f'Rolling_Min_{window}'] = df_copy.groupby('Symbol')['Low'].transform(
            lambda x: x.rolling(window=window, min_periods=1).min()
        )
    
    return df_copy

df_processed = add_rolling_features(df_processed)
print(f"\n✅ Rolling features added! New shape: {df_processed.shape}")

## 5. Feature Engineering - Lag Features

In [None]:
# Add lag features (previous day values)
def add_lag_features(df, lags=[1, 2, 3, 5, 7]):
    """
    Add lag features (previous values)
    Grouped by Symbol to avoid mixing different cryptocurrencies
    """
    df_copy = df.copy()
    
    print(f"Adding lag features for periods: {lags}")
    
    for lag in lags:
        print(f"  - Creating {lag}-day lag features...")
        
        # Lag close prices
        df_copy[f'Close_Lag_{lag}'] = df_copy.groupby('Symbol')['Close'].shift(lag)
        
        # Lag volume
        df_copy[f'Volume_Lag_{lag}'] = df_copy.groupby('Symbol')['Volume'].shift(lag)
        
        # Lag returns
        df_copy[f'Return_Lag_{lag}'] = df_copy.groupby('Symbol')['Price_Change_Pct'].shift(lag)
        
        # Lag volatility
        df_copy[f'Volatility_Lag_{lag}'] = df_copy.groupby('Symbol')['Volatility'].shift(lag)
    
    return df_copy

df_processed = add_lag_features(df_processed)
print(f"\n✅ Lag features added! New shape: {df_processed.shape}")

## 6. Feature Engineering - Time-Based Features

In [None]:
# Add time-based features
print("Adding time-based features...")

df_processed['Year'] = df_processed['Date'].dt.year
df_processed['Month'] = df_processed['Date'].dt.month
df_processed['Day'] = df_processed['Date'].dt.day
df_processed['DayOfWeek'] = df_processed['Date'].dt.dayofweek  # Monday=0, Sunday=6
df_processed['Quarter'] = df_processed['Date'].dt.quarter
df_processed['DayOfYear'] = df_processed['Date'].dt.dayofyear
df_processed['WeekOfYear'] = df_processed['Date'].dt.isocalendar().week

# Is weekend?
df_processed['Is_Weekend'] = (df_processed['DayOfWeek'] >= 5).astype(int)

# Month start/end
df_processed['Is_Month_Start'] = df_processed['Date'].dt.is_month_start.astype(int)
df_processed['Is_Month_End'] = df_processed['Date'].dt.is_month_end.astype(int)

print(f"✅ Time-based features added! Final shape: {df_processed.shape}")

## 7. Target Variable - Next Day Close Price

In [None]:
# Create target variable: Next day's closing price
print("Creating target variable (Next Day Close Price)...")

df_processed['Next_Day_Close'] = df_processed.groupby('Symbol')['Close'].shift(-1)

# Also create target for price change
df_processed['Next_Day_Price_Change'] = df_processed['Next_Day_Close'] - df_processed['Close']
df_processed['Next_Day_Price_Change_Pct'] = (
    (df_processed['Next_Day_Close'] - df_processed['Close']) / df_processed['Close']
) * 100

# Binary classification target (Up/Down)
df_processed['Next_Day_Direction'] = (df_processed['Next_Day_Close'] > df_processed['Close']).astype(int)

print("✅ Target variables created!")
print("\nTarget variables:")
print("  - Next_Day_Close: Actual next day closing price (Regression)")
print("  - Next_Day_Price_Change_Pct: Percentage change (Regression)")
print("  - Next_Day_Direction: 1=Up, 0=Down (Classification)")

## 8. Final Data Quality Check

In [None]:
print("=" * 70)
print("FINAL PROCESSED DATASET SUMMARY")
print("=" * 70)
print(f"Total records: {len(df_processed):,}")
print(f"Total features: {df_processed.shape[1]}")
print(f"Cryptocurrencies: {df_processed['Symbol'].nunique()}")
print(f"Date range: {df_processed['Date'].min().date()} to {df_processed['Date'].max().date()}")

print("\nMissing values per column:")
missing = df_processed.isnull().sum()
missing_pct = (missing / len(df_processed)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Pct': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)

if len(missing_df) > 0:
    print(missing_df)
else:
    print("✅ No missing values!")

print("\nFeature categories:")
feature_cols = [col for col in df_processed.columns if col not in ['SNo', 'Name', 'Symbol', 'Date']]
print(f"  - Total features: {len(feature_cols)}")
print("=" * 70)

## 9. Save Processed Data

In [None]:
# Save the processed dataset
output_file = 'crypto_data_processed.csv'
df_processed.to_csv(output_file, index=False)
print(f"✅ Processed data saved to: {output_file}")

# Save statistics per coin
print("\nGenerating coin-wise statistics...")
coin_stats = df_processed.groupby('Symbol').agg({
    'Name': 'first',
    'Close': ['count', 'mean', 'std', 'min', 'max'],
    'Volume': ['mean', 'sum'],
    'Volatility': 'mean',
    'Price_Change_Pct': ['mean', 'std'],
    'Date': ['min', 'max']
}).reset_index()

coin_stats.columns = ['_'.join(str(col)).strip('_') for col in coin_stats.columns.values]
coin_stats.to_csv('coin_statistics.csv', index=False)
print("✅ Coin statistics saved to: coin_statistics.csv")

# Save feature list
feature_list = pd.DataFrame({
    'Feature_Name': df_processed.columns.tolist(),
    'Data_Type': df_processed.dtypes.values.astype(str)
})
feature_list.to_csv('feature_list.csv', index=False)
print("✅ Feature list saved to: feature_list.csv")

print("\n" + "=" * 70)
print("DATA PROCESSING COMPLETE!")
print("=" * 70)
print("\nGenerated files:")
print("  1. crypto_data_processed.csv - Full processed dataset")
print("  2. coin_statistics.csv - Summary stats per cryptocurrency")
print("  3. feature_list.csv - List of all features")
print("\nNext step: Run 02_exploratory_data_analysis.ipynb")
print("=" * 70)

In [None]:
# Display sample of processed data
print("\nSample of processed data:")
df_processed[['Date', 'Symbol', 'Close', 'Next_Day_Close', 'Next_Day_Direction', 
              'SMA_7', 'SMA_14', 'Volatility', 'Price_Change_Pct']].head(15)