# Dogecoin Price Prediction

## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load

## Data Preprocessing

In [2]:
# Load the data from the CSV file
doge_data = pd.read_csv('../data/dogecoin_data.csv')

# Ensure the 'Date' column is present and set it as the index
doge_data.reset_index(inplace=True)

# Feature engineering: create additional features
doge_data['Date'] = pd.to_datetime(doge_data['Date'])
doge_data['Day'] = doge_data['Date'].dt.day
doge_data['Month'] = doge_data['Date'].dt.month
doge_data['Year'] = doge_data['Date'].dt.year
doge_data['DayOfWeek'] = doge_data['Date'].dt.dayofweek
doge_data.set_index('Date', inplace=True)
doge_data.drop(columns=['index'], inplace=True)
doge_data.ffill(inplace=True)

# Verify that columns are unique
if doge_data.columns.duplicated().any():
    raise ValueError(f"Duplicate columns found: {doge_data.columns[doge_data.columns.duplicated()]}")

# Print the first few rows of the DataFrame to verify the data
print(doge_data.head())

               Close      High       Low      Open    Volume  Day  Month  \
Date                                                                       
2020-01-01  0.001812  0.001829  0.001802  0.001808  45619467    1      1   
2020-01-02  0.001798  0.001889  0.001775  0.001813  58247425    2      1   
2020-01-03  0.001922  0.001951  0.001782  0.001797  56113646    3      1   
2020-01-04  0.002008  0.002231  0.001837  0.001921  84437147    4      1   
2020-01-05  0.002168  0.002231  0.001897  0.002007  47158934    5      1   

            Year  DayOfWeek  
Date                         
2020-01-01  2020          2  
2020-01-02  2020          3  
2020-01-03  2020          4  
2020-01-04  2020          5  
2020-01-05  2020          6  


### Define Columns Transformer and Pipeline

In [3]:
# Define the columns transformer
transformer = ColumnTransformer([
    ('date', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='date')),
    ('high', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='high')),
    ('low', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='low')),
    ('open', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='open')),
    ('volume', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='volume')),
    ('day', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='day')),
    ('month', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='month')),
    ('year', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='year')),
    ('dayofweek', Pipeline([
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', StandardScaler())
    ]), make_column_selector(pattern='dayofweek')),
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(categories='auto', drop='first', handle_unknown='ignore'))
    ]), make_column_selector(dtype_include='category'))
], remainder='passthrough', verbose_feature_names_out=True, sparse_threshold=0)

# Define the steps for the pipeline
steps = [
    ('transformer', transformer),
    ('regressor', RandomForestRegressor())
]

# Define the RandomForest pipeline
rf_pipeline = Pipeline(steps=steps)

## Split Data

In [4]:
X = doge_data.drop(columns=['Close'], axis=1)
y = doge_data['Close']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Hyperparameter Tuning

In [5]:
# Define the seeds for reproducibility
cl_seeds = (42, 123, 643, 1337, 543, 1, 99, 885, 23, 77)

rf_param_grid = {
    'regressor__random_state': cl_seeds,
    'regressor__n_estimators': [200, 300, 400],
    'regressor__max_depth': [None, 2, 5, 10, 20, 50],
    'regressor__max_features': ['sqrt'],
    'regressor__bootstrap': [True],
}

model_list = [('rf', rf_pipeline)]
model_params_grid = [rf_param_grid]

## Best Model

In [6]:
# Get the best model from grid search
best_models_gs = []
for (model_name, model), hp in zip(model_list, model_params_grid):
    grid = GridSearchCV(model, hp, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)
    print(f"Model: {model_name}")
    print(f"Best params: {grid.best_params_}, Best score: {grid.best_score_}")
    best_models_gs.append((model_name, grid.best_estimator_, grid.best_score_, grid.best_params_))

hof_model_gs = max(best_models_gs, key=lambda item:item[2])
print(f"HoF Model: {hof_model_gs}")

Model: rf
Best params: {'regressor__bootstrap': True, 'regressor__max_depth': 10, 'regressor__max_features': 'sqrt', 'regressor__n_estimators': 200, 'regressor__random_state': 77}, Best score: -5.554124705695861e-05
HoF Model: ('rf', Pipeline(steps=[('transformer',
                 ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                                   transformers=[('date',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7f9770922cf0>),
                                                 ('high',
                                                  Pipelin

## Evaluation

In [7]:
# Best model prediction and scores
hof_gs = hof_model_gs[1]
y_pred_train = hof_gs.predict(X_train)
y_pred_test = hof_gs.predict(X_test)
print(f'Mean Squared Error on training set: {mean_squared_error(y_train, y_pred_train)}')
print(f'Mean Squared Error on test set: {mean_squared_error(y_test, y_pred_test)}')

Mean Squared Error on training set: 8.347484636243886e-06
Mean Squared Error on test set: 2.9353643991832407e-05


## Save the Model

In [8]:
# Re-fit the model on the entire dataset
y = doge_data['Close']
x = doge_data.drop('Close', axis = 1)
columns = x.columns

scaler = StandardScaler()
scaler = scaler.fit(x)
X = scaler.transform(x)
features = pd.DataFrame(X, columns = columns)

# Save the transformer and model
dump(transformer, '../models/doge_transformer.joblib')
dump(hof_gs, '../models/doge_model.joblib')

['../models/doge_model.joblib']

## Inference

In [9]:
# Load new data for prediction
high = 0.06
low = 0.04
open = 0.05
volume = 1000000
day = 1
month = 1
year = 2023
dayofweek = 0
feat_cols = features.columns

row = [high, low, open, volume, day, month, year, dayofweek]

# Load the transformer and model
transformer = load('../models/doge_transformer.joblib')
model = load('../models/doge_model.joblib')

# Check the feature columns
print(feat_cols)

# Transform the new data
df = pd.DataFrame([row], columns = feat_cols)
tr = transformer.fit(df)
X = tr.transform(df)
features = pd.DataFrame(X, columns = feat_cols)

# Making a prediction
try:
    prediction = model.predict(features)
    print(f'Predicted Close Price: {prediction[0]}')
except Exception as e:
    print(f'Error during prediction: {e}')

Index(['High', 'Low', 'Open', 'Volume', 'Day', 'Month', 'Year', 'DayOfWeek'], dtype='object')
Predicted Close Price: 0.0567910090986183
