In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [6]:
# Load the train and test datasets
train_df = pd.read_csv("/Users/mac/Downloads/directional-forecasting-in-cryptocurrencies/train.csv")
test_df = pd.read_csv("/Users/mac/Downloads/directional-forecasting-in-cryptocurrencies/test.csv")

# Display the first few rows of the datasets to ensure they are loaded correctly
print(train_df.head())
print(test_df.head())

    timestamp     open     high      low    close   volume  \
0  1525471260  0.90120  0.90130  0.90120  0.90130   134.98   
1  1525471320  0.90185  0.90195  0.90185  0.90195  1070.54   
2  1525471380  0.90140  0.90140  0.90139  0.90139  2293.06   
3  1525471440  0.90139  0.90140  0.90138  0.90139  6850.59   
4  1525471500  0.90139  0.90139  0.90130  0.90130   832.30   

   quote_asset_volume  number_of_trades  taker_buy_base_volume  \
0          121.646459               4.0                 125.08   
1          965.505313              12.0                 879.94   
2         2066.963991               5.0                   0.00   
3         6175.000909              19.0                1786.30   
4          750.222624               3.0                 784.82   

   taker_buy_quote_volume  target  
0              112.723589     1.0  
1              793.612703     0.0  
2                0.000000     0.0  
3             1610.149485     0.0  
4              707.428900     0.0  
   row_id   ti

In [8]:
# Create new features in train and test datasets
train_df['price_return'] = (train_df['close'] - train_df['open']) / train_df['open']
train_df['volatility'] = (train_df['high'] - train_df['low']) / train_df['low']
train_df['rolling_mean_3'] = train_df['close'].rolling(window=3).mean().fillna(0)
train_df['rolling_mean_5'] = train_df['close'].rolling(window=5).mean().fillna(0)
train_df['price_diff'] = train_df['close'] - train_df['open']
train_df['rolling_std_3'] = train_df['close'].rolling(window=3).std().fillna(0)
train_df['rolling_std_5'] = train_df['close'].rolling(window=5).std().fillna(0)
train_df['ema_3'] = train_df['close'].ewm(span=3, adjust=False).mean()
train_df['ema_5'] = train_df['close'].ewm(span=5, adjust=False).mean()

# Apply the same transformations to the test dataset
test_df['price_return'] = (test_df['close'] - test_df['open']) / test_df['open']
test_df['volatility'] = (test_df['high'] - test_df['low']) / test_df['low']
test_df['rolling_mean_3'] = test_df['close'].rolling(window=3).mean().fillna(0)
test_df['rolling_mean_5'] = test_df['close'].rolling(window=5).mean().fillna(0)
test_df['price_diff'] = test_df['close'] - test_df['open']
test_df['rolling_std_3'] = test_df['close'].rolling(window=3).std().fillna(0)
test_df['rolling_std_5'] = test_df['close'].rolling(window=5).std().fillna(0)
test_df['ema_3'] = test_df['close'].ewm(span=3, adjust=False).mean()
test_df['ema_5'] = test_df['close'].ewm(span=5, adjust=False).mean()

# Display the first few rows of the train dataset to check the new features
train_df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,target,price_return,volatility,rolling_mean_3,rolling_mean_5,price_diff,rolling_std_3,rolling_std_5,ema_3,ema_5
0,1525471260,0.9012,0.9013,0.9012,0.9013,134.98,121.646459,4.0,125.08,112.723589,1.0,0.000111,0.000111,0.0,0.0,0.0001,0.0,0.0,0.9013,0.9013
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,0.0,0.000111,0.000111,0.0,0.0,0.0001,0.0,0.0,0.901625,0.901517
2,1525471380,0.9014,0.9014,0.90139,0.90139,2293.06,2066.963991,5.0,0.0,0.0,0.0,-1.1e-05,1.1e-05,0.901547,0.0,-1e-05,0.000352,0.0,0.901508,0.901474
3,1525471440,0.90139,0.9014,0.90138,0.90139,6850.59,6175.000909,19.0,1786.3,1610.149485,0.0,0.0,2.2e-05,0.901577,0.0,0.0,0.000323,0.0,0.901449,0.901446
4,1525471500,0.90139,0.90139,0.9013,0.9013,832.3,750.222624,3.0,784.82,707.4289,0.0,-0.0001,0.0001,0.90136,0.901466,-9e-05,5.2e-05,0.000274,0.901374,0.901398


In [10]:
# Select the top features for modeling
top_features = ['close', 'rolling_mean_3', 'rolling_std_3', 'volatility', 'price_diff']

# Split the training data into features (X) and target (y)
X_train = train_df[top_features]
y_train = train_df['target']  # The target column contains the labels (1 for up, 0 for not up)

# Split the data into training and validation sets (80% train, 20% validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Display the shape of the training and validation sets
print(X_train_split.shape, X_val_split.shape)

(1697950, 5) (424488, 5)


In [12]:
# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=20, class_weight='balanced', random_state=42)

# Train the model
rf_model.fit(X_train_split, y_train_split)

In [14]:
# Make predictions on the validation set
y_val_pred = rf_model.predict(X_val_split)

# Calculate the Macro-Averaged F1 Score
macro_f1 = f1_score(y_val_split, y_val_pred, average='macro')
print(f"Macro-Averaged F1 Score on Validation Set: {macro_f1}")


Macro-Averaged F1 Score on Validation Set: 0.5264201871824679


In [16]:
# Make predictions on the test dataset
X_test = test_df[top_features]
test_predictions = rf_model.predict(X_test)

# Create a submission dataframe
submission_df = pd.DataFrame({
    'row_id': test_df.index,
    'target': test_predictions
})

# Save the submission to a CSV file
submission_df.to_csv("crypto_price_direction_submission.csv", index=False)
print("Submission file created: crypto_price_direction_submission.csv")


Submission file created: crypto_price_direction_submission.csv


In [18]:
train_df['rolling_mean_open_3'] = train_df['open'].rolling(window=3).mean().fillna(0)
train_df['rolling_mean_high_3'] = train_df['high'].rolling(window=3).mean().fillna(0)
train_df['rolling_mean_low_3'] = train_df['low'].rolling(window=3).mean().fillna(0)

In [20]:
train_df['momentum_3'] = train_df['close'] - train_df['close'].shift(3).fillna(0)

In [None]:
from sklearn.model_selection import GridSearchCV

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='f1_macro')

# Fit the model with Grid Search
grid_search.fit(X_train_split, y_train_split)

# Find the best model and print the parameters
best_rf_model = grid_search.best_estimator_
print("Best Hyperparameters:", grid_search.best_params_)