# Cryptocurrency Price Prediction - ML Models
## Multiple ML Algorithms for Price Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

In [None]:
# Load processed data
# Read the preprocessed CSV and convert its 'Date' column to datetime values.
df = pd.read_csv('crypto_data_processed.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Quick sanity report: dimensions and the available column names.
column_names = df.columns.tolist()
print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {column_names}")

## 1. Feature Selection and Preparation

In [None]:
# Select features for modeling
# Candidate columns, grouped by the kind of signal they carry. The order of
# this list is the feature order used for every model below.
price_and_candle_features = (
    'Open', 'High', 'Low', 'Volume', 'Marketcap',
    'Price_Change', 'Daily_Range', 'Volatility', 'Avg_Price',
    'Body_Size', 'Upper_Shadow', 'Lower_Shadow',
)
rolling_features = (
    'MA_7', 'MA_14', 'MA_30',
    'Volatility_7d', 'Volatility_14d', 'Volatility_30d',
    'Volume_MA_7', 'Volume_MA_14', 'Volume_MA_30',
)
lag_features = (
    'Close_Lag_1', 'Close_Lag_3', 'Close_Lag_7',
    'Volume_Lag_1', 'Volume_Lag_3', 'Volume_Lag_7',
    'Return_Lag_1', 'Return_Lag_3', 'Return_Lag_7',
)
calendar_features = (
    'Year', 'Month', 'Day', 'DayOfWeek', 'Quarter', 'DayOfYear',
)
candidate_features = (
    price_and_candle_features + rolling_features + lag_features + calendar_features
)

# Filter only existing columns (candidates absent from the CSV are skipped)
available_columns = set(df.columns)
feature_cols = [col for col in candidate_features if col in available_columns]
target_col = 'Close'

# Remove rows with missing values
# 'Symbol' is carried along with the features and target, so rows with a
# missing symbol are dropped as well.
model_columns = [*feature_cols, target_col, 'Symbol']
df_model = df[model_columns].dropna()

print(f"Model dataset shape: {df_model.shape}")
print(f"Features used: {len(feature_cols)}")
print(f"\nFeature list: {feature_cols}")

In [None]:
# Prepare features and target
X, y = df_model[feature_cols], df_model[target_col]

# Split data (80-20 split)
# NOTE(review): train_test_split shuffles rows by default, so test rows end up
# interleaved with training rows. With lag/rolling features this may overstate
# the test scores -- confirm whether a chronological split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report split sizes and compare the target mean across the two splits.
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTarget statistics:")
for split_label, split_target in (('Train', y_train), ('Test', y_test)):
    print(f"  {split_label} mean: ${split_target.mean():,.2f}")

In [None]:
# Feature scaling
# The scaler is fitted on the training split only and then applied unchanged
# to the test split, so no test-set statistics influence the transform.
import os

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler
# joblib.dump does not create missing parent directories, and this is the
# first write into 'models/', so make sure the directory exists beforehand.
os.makedirs('models', exist_ok=True)
joblib.dump(scaler, 'models/scaler.pkl')
print("Feature scaling completed and scaler saved!")

## 2. Model Training and Evaluation

In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and score it on both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training target values.
        y_test: Test target values.
        model_name: Label stored under the ``'Model'`` key of the results.

    Returns:
        A ``(results, model, y_test_pred)`` tuple: ``results`` is a dict with
        the model label plus RMSE, MAE and R² for the train and test splits,
        ``model`` is the same estimator after fitting, and ``y_test_pred`` is
        the prediction made for ``X_test``.
    """
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # (metric suffix, scoring callable); RMSE is the square root of the MSE.
    scorers = (
        ('RMSE', lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
        ('MAE', mean_absolute_error),
        ('R2', r2_score),
    )
    splits = (
        ('Train', y_train, y_train_pred),
        ('Test', y_test, y_test_pred),
    )

    # Keys are emitted metric by metric, train before test, which fixes the
    # column order of the comparison table built from these dicts.
    results = {'Model': model_name}
    for metric_suffix, scorer in scorers:
        for split_label, actual, predicted in splits:
            results[f'{split_label}_{metric_suffix}'] = scorer(actual, predicted)

    return results, model, y_test_pred


print("Evaluation function defined!")

In [None]:
# Dictionary to store models and results
models_dict = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=42),
    'Random Forest': RandomForestRegressor(
        n_estimators=100, max_depth=15, random_state=42, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=100, max_depth=5, random_state=42
    ),
    'XGBoost': XGBRegressor(
        n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=100, max_depth=7, learning_rate=0.1, random_state=42, verbose=-1
    ),
}

# Use scaled data for linear models, original for tree-based.
# 'SVR' is listed here although models_dict has no such entry, so it
# currently matches nothing.
scaled_input_models = ('Linear Regression', 'Ridge', 'Lasso', 'SVR')

all_results = []      # one metrics dict per model, in training order
trained_models = {}   # model name -> fitted estimator
predictions = {}      # model name -> predictions on the test split

print("Training models...\n")
for name, model in models_dict.items():
    print(f"Training {name}...")

    # Pick the matching pair of feature matrices, then evaluate once.
    if name in scaled_input_models:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test

    results, trained_model, y_pred = evaluate_model(
        model, train_features, test_features, y_train, y_test, name
    )

    all_results.append(results)
    trained_models[name] = trained_model
    predictions[name] = y_pred

    print(f"  Test RMSE: ${results['Test_RMSE']:,.2f}, Test R²: {results['Test_R2']:.4f}\n")

print("All models trained!")

In [None]:
# Results comparison
# One row per model, ordered from highest to lowest test R².
results_df = pd.DataFrame(all_results).sort_values('Test_R2', ascending=False)

banner = "=" * 100
print(f"\n{banner}")
print("MODEL PERFORMANCE COMPARISON")
print(banner)
print(results_df.to_string(index=False))
print(banner)

# Save results
comparison_path = 'models/model_comparison.csv'
results_df.to_csv(comparison_path, index=False)
print(f"\nResults saved to '{comparison_path}'")

## 3. Model Visualization

In [None]:
# Performance comparison plots
# 2x2 figure: three bar panels (RMSE, R², MAE) comparing train and test
# scores per model, plus a predicted-vs-actual scatter for the best model.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

x_pos = np.arange(len(results_df))

# (axis, metric suffix in results_df, x-axis label, title, test colour, train colour)
metric_panels = [
    (axes[0, 0], 'RMSE', 'RMSE', 'Model RMSE Comparison', 'steelblue', 'lightcoral'),
    (axes[0, 1], 'R2', 'R² Score', 'Model R² Comparison', 'darkgreen', 'lightgreen'),
    (axes[1, 0], 'MAE', 'MAE', 'Model MAE Comparison', 'darkorange', 'gold'),
]

# The three metric panels share one layout, so draw them from the table above
# instead of repeating the same eight calls per panel.
for ax, metric, xlabel, title, test_color, train_color in metric_panels:
    # The train bars are drawn semi-transparently over the test bars.
    ax.barh(x_pos, results_df[f'Test_{metric}'], color=test_color, alpha=0.7, label='Test')
    ax.barh(x_pos, results_df[f'Train_{metric}'], color=train_color, alpha=0.5, label='Train')
    ax.set_yticks(x_pos)
    ax.set_yticklabels(results_df['Model'])
    ax.set_xlabel(xlabel)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend()
    ax.grid(axis='x', alpha=0.3)

# Best model prediction vs actual
# results_df is sorted by Test_R2 descending, so the first row is the best model.
best_model_name = results_df.iloc[0]['Model']
best_predictions = predictions[best_model_name]
sample_size = min(1000, len(y_test))

# Draw the scatter sample from a locally seeded generator: the figure is then
# reproducible across runs (matching the random_state=42 used for the split
# and the models) and the global NumPy random state is left untouched.
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(y_test), size=sample_size, replace=False)

# sample_idx holds positions, hence .iloc on the Series and plain indexing on
# the prediction array.
axes[1, 1].scatter(y_test.iloc[sample_idx], best_predictions[sample_idx],
                   alpha=0.5, s=20, color='purple')

# Diagonal y = x: points on this line are predicted exactly.
actual_range = [y_test.min(), y_test.max()]
axes[1, 1].plot(actual_range, actual_range,
                'r--', linewidth=2, label='Perfect Prediction')
axes[1, 1].set_xlabel('Actual Price')
axes[1, 1].set_ylabel('Predicted Price')
axes[1, 1].set_title(f'Best Model: {best_model_name}\nPrediction vs Actual',
                     fontsize=14, fontweight='bold')
axes[1, 1].legend()
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Feature Importance (Best Model)

In [None]:
# Feature importance for best tree-based model
best_model = trained_models[best_model_name]

if not hasattr(best_model, 'feature_importances_'):
    # Estimators without a feature_importances_ attribute are only reported.
    print(f"{best_model_name} does not provide feature importance.")
else:
    # Pair every feature name with its importance, most important first.
    feature_importance = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    # Plot top 20 features
    top_features = feature_importance.head(20)
    plt.figure(figsize=(12, 8))
    plt.barh(top_features['Feature'], top_features['Importance'], color='teal')
    plt.xlabel('Importance', fontsize=12)
    plt.title(f'Top 20 Feature Importance - {best_model_name}',
              fontsize=14, fontweight='bold')
    # Flip the y-axis so the most important feature is at the top.
    plt.gca().invert_yaxis()
    plt.grid(axis='x', alpha=0.3)
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10).to_string(index=False))

    # Save feature importance (the full ranking, not just the plotted rows)
    feature_importance.to_csv('models/feature_importance.csv', index=False)

## 5. Save Best Model

In [None]:
# Save the best model
# File names use the model label lower-cased with spaces turned into underscores.
best_slug = best_model_name.replace(" ", "_").lower()
best_model_path = f'models/best_model_{best_slug}.pkl'
joblib.dump(best_model, best_model_path)

# Save all models (the best model is written a second time under its own name)
for name, model in trained_models.items():
    model_path = 'models/' + name.replace(" ", "_").lower() + '_model.pkl'
    joblib.dump(model, model_path)

# First row of results_df, i.e. the best model when sorted by test R².
best_row = results_df.iloc[0]

print(f"\nBest model ({best_model_name}) saved to: {best_model_path}")
print("All models saved to 'models/' directory")
print("\nBest Model Performance:")
print(f"  Test R²: {best_row['Test_R2']:.4f}")
print(f"  Test RMSE: ${best_row['Test_RMSE']:,.2f}")
print(f"  Test MAE: ${best_row['Test_MAE']:,.2f}")

## 6. Model Residual Analysis

In [None]:
# Residual analysis for best model
from scipy import stats

# Residual = actual - predicted, computed on the test split.
residuals = y_test - best_predictions

# Styling shared by several panels.
zero_line_style = dict(color='red', linestyle='--', linewidth=2)
panel_title_style = dict(fontsize=12, fontweight='bold')

fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(ax_hist, ax_vs_pred), (ax_qq, ax_by_index) = axes

# Residuals distribution
ax_hist.hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='navy')
ax_hist.axvline(0, **zero_line_style)
ax_hist.set_xlabel('Residuals')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Residuals Distribution', **panel_title_style)

# Residuals vs Predicted
ax_vs_pred.scatter(best_predictions, residuals, alpha=0.5, s=20, color='darkgreen')
ax_vs_pred.axhline(0, **zero_line_style)
ax_vs_pred.set_xlabel('Predicted Values')
ax_vs_pred.set_ylabel('Residuals')
ax_vs_pred.set_title('Residuals vs Predicted', **panel_title_style)
ax_vs_pred.grid(alpha=0.3)

# Q-Q plot against a normal distribution
stats.probplot(residuals, dist="norm", plot=ax_qq)
ax_qq.set_title('Q-Q Plot', **panel_title_style)

# Residuals over index (position within the test split, not the original row label)
positions = range(len(residuals))
ax_by_index.scatter(positions, residuals, alpha=0.5, s=20, color='purple')
ax_by_index.axhline(0, **zero_line_style)
ax_by_index.set_xlabel('Index')
ax_by_index.set_ylabel('Residuals')
ax_by_index.set_title('Residuals Over Index', **panel_title_style)
ax_by_index.grid(alpha=0.3)

plt.suptitle(f'Residual Analysis - {best_model_name}',
             fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('residual_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Summary statistics of the residuals, six decimal places each.
print(f"\nResidual Statistics:")
residual_summary = (
    ('Mean', residuals.mean()),
    ('Std', residuals.std()),
    ('Min', residuals.min()),
    ('Max', residuals.max()),
)
for stat_label, stat_value in residual_summary:
    print(f"  {stat_label}: {stat_value:.6f}")