# Cryptocurrency Data Processing Pipeline
## Step 1: Data Loading and Preprocessing

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Display configuration: show all columns (no truncation) and render floats
# in fixed-point notation with eight decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda value: f'{value:.8f}'

print("Libraries imported successfully!")

In [None]:
# Read the raw cryptocurrency table from the CSV in the working directory
df = pd.read_csv('crypto_data.csv')

# Quick orientation: dimensions, column names, then a preview of the top rows
print(
    f"Dataset shape: {df.shape}",
    f"\nColumn names: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)
df.head(n=5)

In [None]:
# Data info and missing values
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None" line.
df.info()
print("\nMissing Values:")
# Per-column count of null entries
print(df.isnull().sum())
print("\nBasic Statistics:")
# Left as the final expression so the notebook renders the summary table
df.describe()

In [None]:
# Data Cleaning
# Convert the Date column to datetime values (assigned in place on the loaded frame)
df['Date'] = pd.to_datetime(df['Date'])

# One pipeline for the remaining steps:
#   1. discard rows with a missing Close or Volume,
#   2. keep a single row per (Symbol, Date) pair,
#   3. order rows by Symbol, then Date,
#   4. renumber the index from zero.
df = (
    df.dropna(subset=['Close', 'Volume'])
    .drop_duplicates(subset=['Symbol', 'Date'])
    .sort_values(['Symbol', 'Date'])
    .reset_index(drop=True)
)

# Summary of what survived cleaning
print(
    f"Cleaned dataset shape: {df.shape}",
    f"Date range: {df['Date'].min()} to {df['Date'].max()}",
    f"Unique cryptocurrencies: {df['Symbol'].nunique()}",
    sep="\n",
)

## Feature Engineering

In [None]:
# Create technical indicators and features
def add_technical_indicators(df):
    """Return a copy of ``df`` extended with per-row candle features.

    Reads the ``Open``, ``High``, ``Low`` and ``Close`` columns and appends,
    in this order:

    * ``Price_Change``      -- Close minus Open
    * ``Price_Change_Pct``  -- Price_Change as a percentage of Open
    * ``Daily_Range``       -- High minus Low
    * ``Volatility``        -- Daily_Range as a percentage of High
    * ``Avg_Price``         -- midpoint of High and Low
    * ``Body_Size``         -- absolute value of Close minus Open
    * ``Upper_Shadow``      -- High minus the larger of Open/Close
    * ``Lower_Shadow``      -- the smaller of Open/Close minus Low

    The input frame is not modified. Zero values in ``Open`` or ``High`` are
    not guarded against in the percentage columns.
    """
    enriched = df.copy()

    open_px, close_px = enriched['Open'], enriched['Close']
    high_px, low_px = enriched['High'], enriched['Low']
    # Row-wise extremes of the candle body (Open vs. Close)
    body_edges = enriched[['Open', 'Close']]
    body_top = body_edges.max(axis=1)
    body_bottom = body_edges.min(axis=1)

    open_to_close = close_px - open_px
    high_to_low = high_px - low_px

    # Insertion order of this mapping fixes the order of the new columns.
    derived = {
        'Price_Change': open_to_close,
        'Price_Change_Pct': (open_to_close / open_px) * 100,
        'Daily_Range': high_to_low,
        'Volatility': (high_to_low / high_px) * 100,
        'Avg_Price': (high_px + low_px) / 2,
        'Body_Size': abs(close_px - open_px),
        'Upper_Shadow': high_px - body_top,
        'Lower_Shadow': body_bottom - low_px,
    }
    for column_name, values in derived.items():
        enriched[column_name] = values

    return enriched

# Build the working frame by deriving the per-row candle features from the
# cleaned data, then preview its first rows.
df_processed = add_technical_indicators(df)
print("Technical indicators added!")
df_processed.head(n=5)

In [None]:
# Add moving averages and rolling statistics
def add_rolling_features(df, windows=(7, 14, 30)):
    """Return a copy of ``df`` with per-symbol rolling statistics appended.

    For every window size ``w`` in ``windows`` three columns are added:

    * ``MA_{w}``          -- rolling mean of ``Close``
    * ``Volatility_{w}d`` -- rolling standard deviation of ``Close``
    * ``Volume_MA_{w}``   -- rolling mean of ``Volume``

    Windows are counted in rows and computed separately for each ``Symbol``
    in the frame's existing row order; this function does not sort, so the
    caller must supply rows already ordered by date within each symbol.
    ``min_periods=1`` lets the leading rows of each symbol use the shorter
    history available. The first ``Volatility_*`` value of each symbol is
    still NaN because a single observation has no sample standard deviation.

    Args:
        df: Frame containing ``Symbol``, ``Close`` and ``Volume`` columns.
        windows: Iterable of window sizes in rows. The default is a tuple
            rather than a list so no mutable object is shared between calls.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    df_copy = df.copy()

    # Group once up front instead of rebuilding the groupby three times per
    # window. The untouched input is grouped so the groupings never see the
    # columns being appended to ``df_copy`` (both share the same index).
    by_symbol = df.groupby('Symbol')
    close_by_symbol = by_symbol['Close']
    volume_by_symbol = by_symbol['Volume']

    for window in windows:
        # Moving averages
        df_copy[f'MA_{window}'] = close_by_symbol.transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )

        # Rolling volatility
        df_copy[f'Volatility_{window}d'] = close_by_symbol.transform(
            lambda x: x.rolling(window=window, min_periods=1).std()
        )

        # Rolling volume average
        df_copy[f'Volume_MA_{window}'] = volume_by_symbol.transform(
            lambda x: x.rolling(window=window, min_periods=1).mean()
        )

    return df_copy

# Append the rolling-window columns (default window sizes) and report the
# resulting column count.
df_processed = add_rolling_features(df_processed)
print(
    "Rolling features added!",
    f"Total features: {df_processed.shape[1]}",
    sep="\n",
)

In [None]:
# Add lag features
def add_lag_features(df, lags=(1, 3, 7)):
    """Return a copy of ``df`` with per-symbol lagged columns appended.

    For every lag ``k`` in ``lags`` three columns are added, each holding the
    value from ``k`` rows earlier within the same ``Symbol``:

    * ``Close_Lag_{k}``  -- lagged ``Close``
    * ``Volume_Lag_{k}`` -- lagged ``Volume``
    * ``Return_Lag_{k}`` -- lagged ``Price_Change_Pct``

    The shift is positional, so rows must already be ordered by date within
    each symbol; this function does not sort. The first ``k`` rows of every
    symbol have no earlier row to draw from and are NaN.

    Args:
        df: Frame containing ``Symbol``, ``Close``, ``Volume`` and
            ``Price_Change_Pct`` columns.
        lags: Iterable of lag distances in rows. The default is a tuple
            rather than a list so no mutable object is shared between calls.

    Returns:
        A new DataFrame; ``df`` itself is left untouched.
    """
    df_copy = df.copy()

    # Group once up front instead of three times per lag. The untouched input
    # is grouped; it shares its index with ``df_copy``, so results align.
    by_symbol = df.groupby('Symbol')
    close_by_symbol = by_symbol['Close']
    volume_by_symbol = by_symbol['Volume']
    return_by_symbol = by_symbol['Price_Change_Pct']

    for lag in lags:
        df_copy[f'Close_Lag_{lag}'] = close_by_symbol.shift(lag)
        df_copy[f'Volume_Lag_{lag}'] = volume_by_symbol.shift(lag)
        df_copy[f'Return_Lag_{lag}'] = return_by_symbol.shift(lag)

    return df_copy

# Append the lagged columns (default lags), then preview ten rows so the
# leading NaNs introduced by shifting are visible.
df_processed = add_lag_features(df_processed)
print("Lag features added!")
df_processed.head(n=10)

In [None]:
# Add time-based features
# Each (column, attribute) pair copies one calendar component of Date, read
# through the pandas ``.dt`` accessor, into a new column of df_processed.
for feature_name, dt_attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('DayOfWeek', 'dayofweek'),
    ('Quarter', 'quarter'),
    ('DayOfYear', 'dayofyear'),
):
    df_processed[feature_name] = getattr(df_processed['Date'].dt, dt_attribute)

print("Time-based features added!")

## Save Processed Data

In [None]:
# Write the full engineered dataset to disk (row index omitted)
df_processed.to_csv('crypto_data_processed.csv', index=False)
print("Processed data saved to 'crypto_data_processed.csv'")

# Per-coin summary: one row per Symbol, aggregating the columns below
per_coin = df_processed.groupby('Symbol')
coin_stats = per_coin.agg({
    'Name': 'first',
    'Close': ['mean', 'std', 'min', 'max'],
    'Volume': ['mean', 'sum'],
    'Volatility': 'mean',
    'Price_Change_Pct': ['mean', 'std'],
    'Date': ['min', 'max', 'count'],
})
coin_stats = coin_stats.reset_index()

# The aggregation yields two-level column labels such as ('Close', 'mean').
# Flatten each to 'Close_mean'; stripping underscores turns ('Symbol', '')
# into plain 'Symbol'.
coin_stats.columns = [
    '_'.join(label_parts).strip('_') for label_parts in coin_stats.columns.values
]
coin_stats.to_csv('coin_statistics.csv', index=False)
print("Coin statistics saved to 'coin_statistics.csv'")

print(
    f"\nFinal processed dataset shape: {df_processed.shape}",
    f"Total features: {df_processed.shape[1]}",
    sep="\n",
)