In [1]:
# Import libraries
import pandas as pd
from ta import momentum
from ta.trend import MACD, ADXIndicator, CCIIndicator
from ta.volatility import BollingerBands
from ta.volatility import AverageTrueRange

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

import json

In [34]:
# Read the training and testing sets from the CSV files in the working directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('training_set.csv', 'testing_set.csv')
)

In [35]:
# Data preprocessing

# Discard every training row that contains a missing value.
# The test set is left as-is (the author noted it has no missing values).
# Author's note: this removes 198 rows (5.64%).
train_df = train_df.dropna()

# Drop the index and timestamp columns, which are not used as features
train_df = train_df.drop(columns=['train_idx', 'Time'])
test_df = test_df.drop(columns=['test_idx', 'Time'])

# Remove outliers
# Exploratory analysis with apex showed that Open/High/Low/Close values usually
# lie between 1 and 2. Rows with any of those prices outside the open interval
# (1, 2) are removed, since they are probably measurement errors.
ohlc_columns = ['Open', 'High', 'Low', 'Close']
within_range = train_df[ohlc_columns].gt(1) & train_df[ohlc_columns].lt(2)
train_df = train_df[within_range.all(axis=1)]

# Author's note: 65 rows removed here. Together with the previous 198, 263 rows
# have been removed (7.5% of the original dataset).



In [36]:
# Feature engineering (add new features)


def add_technical_indicators(df):
    """Return a copy of ``df`` extended with technical-indicator columns.

    The same set of features is needed for the training and the testing
    frames, so the computation lives in one place to keep both in sync.
    Working on a copy also avoids assigning new columns onto a filtered
    slice of another DataFrame (which can raise SettingWithCopyWarning).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Open', 'High', 'Low' and 'Close'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these columns appended, in this order:
        RSI, MACD, MACD_signal, Open_10day_ma, ADX, CCI, BB_upper, BB_lower,
        ATR, ATR_10day_ma, RSI_10day_ma, ROC_5day, ROC_10day, ROC_10day_ma,
        MACD_10day_ma, CCI_10day_ma, ADX_10day_ma, AO.
        Rolling/indicator warm-up rows may contain NaN.
    """
    out = df.copy()
    high, low, close = out['High'], out['Low'], out['Close']

    # Relative Strength Index (library default window)
    out['RSI'] = momentum.RSIIndicator(close).rsi()

    # MACD line and its signal line (library default windows)
    macd = MACD(close)
    out['MACD'] = macd.macd()
    out['MACD_signal'] = macd.macd_signal()

    # 10-period moving average of the Open price
    out['Open_10day_ma'] = out['Open'].rolling(window=10).mean()

    # Average Directional Index (ADX)
    out['ADX'] = ADXIndicator(high, low, close).adx()

    # Commodity Channel Index (CCI)
    out['CCI'] = CCIIndicator(high, low, close).cci()

    # Bollinger Bands: 20-period window, 2 standard deviations
    bb = BollingerBands(close, window=20, window_dev=2)
    out['BB_upper'] = bb.bollinger_hband()
    out['BB_lower'] = bb.bollinger_lband()

    # Average True Range (ATR), 14-period window
    atr = AverageTrueRange(high=high, low=low, close=close, window=14)
    out['ATR'] = atr.average_true_range()

    # 10-period moving averages of ATR and RSI
    out['ATR_10day_ma'] = out['ATR'].rolling(window=10).mean()
    out['RSI_10day_ma'] = out['RSI'].rolling(window=10).mean()

    # Price Rate of Change (ROC) over 5 and 10 periods
    out['ROC_5day'] = momentum.roc(close, window=5)
    out['ROC_10day'] = momentum.roc(close, window=10)

    # 10-period moving averages of ROC, MACD, CCI and ADX
    out['ROC_10day_ma'] = out['ROC_10day'].rolling(window=10).mean()
    out['MACD_10day_ma'] = out['MACD'].rolling(window=10).mean()
    out['CCI_10day_ma'] = out['CCI'].rolling(window=10).mean()
    out['ADX_10day_ma'] = out['ADX'].rolling(window=10).mean()

    # Awesome Oscillator (5 / 34 period windows)
    ao = momentum.AwesomeOscillatorIndicator(high, low, window1=5, window2=34)
    out['AO'] = ao.awesome_oscillator()

    return out


# Apply exactly the same feature set to both frames
train_df = add_technical_indicators(train_df)
test_df = add_technical_indicators(test_df)

train_df

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)


Unnamed: 0,Open,High,Low,Close,Volume,label,RSI,MACD,MACD_signal,Open_10day_ma,ADX,CCI,BB_upper,BB_lower,ATR,ATR_10day_ma,RSI_10day_ma
0,1.31258,1.31844,1.31086,1.31648,1807377.0,0,,,,,0.000000,,,,0.000000,,
1,1.31667,1.31813,1.31154,1.31396,1995920.0,1,,,,,0.000000,,,,0.000000,,
2,1.31396,1.31583,1.30864,1.31118,1859100.0,1,,,,,0.000000,,,,0.000000,,
3,1.31236,1.31415,1.31068,1.31175,192381.0,1,,,,,0.000000,,,,0.000000,,
5,1.31811,1.32186,1.31547,1.31927,2003765.0,1,,,,,0.000000,,,,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3504,1.18053,1.18221,1.17498,1.17689,220609.0,1,26.321066,-0.011345,-0.010300,1.188540,48.127098,-123.552963,1.222887,1.171151,0.007568,0.007360,28.727000
3505,1.17684,1.17756,1.17441,1.17474,18855.0,0,25.261274,-0.011657,-0.010572,1.186527,49.467339,-132.156197,1.219323,1.169895,0.007252,0.007390,28.643269
3506,1.17473,1.17973,1.17165,1.17913,227861.0,0,31.340247,-0.011418,-0.010741,1.184639,50.850432,-113.648305,1.215982,1.169426,0.007311,0.007405,29.553814
3507,1.17914,1.18296,1.17567,1.17824,255729.0,0,30.793416,-0.011172,-0.010827,1.184011,51.305369,-89.649698,1.213615,1.168541,0.007310,0.007418,30.431084


In [37]:
# Replace every remaining NaN (e.g. indicator warm-up rows) with 0 in both frames
train_df, test_df = train_df.fillna(0), test_df.fillna(0)

# Min-max scale Volume: the scaler is fitted on the training data only and the
# same fitted scaler is then applied to the test data.
scaler = MinMaxScaler()
scaler.fit(train_df[['Volume']])
train_df['Volume'] = scaler.transform(train_df[['Volume']])
test_df['Volume'] = scaler.transform(test_df[['Volume']])

In [None]:
# Display the test DataFrame for visual inspection
test_df

In [38]:
# Hold out 20% of train_df as a validation set; the remaining 80% is used for
# training. The seed is fixed so the split is reproducible.
# NOTE(review): train_test_split shuffles rows by default. If the rows are
# time-ordered (rolling-window features suggest so), neighbouring rows end up
# on both sides of the split - consider a chronological split. TODO confirm.
train, validation = train_test_split(train_df, test_size=0.2, random_state=42)

# Separate the target column ('label') from the feature columns
train_features, train_labels = train.drop(columns=['label']), train['label']
validation_features, validation_labels = (
    validation.drop(columns=['label']),
    validation['label'],
)

In [39]:
# Random Forest classifier: 300 trees, fixed seed for reproducibility
rf_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
)

# Fit the model on the training split
rf_model.fit(X=train_features, y=train_labels)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomized hyperparameter search for the Random Forest *classifier*.
# The target ('label') is a class label, so a classifier is tuned here; tuning a
# regressor would optimise R^2 (its default score) rather than a classification
# metric, and the reported score would not be a mean squared error.

# Parameter distributions to sample from (randint's upper bound is exclusive)
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(5, 20),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}

# Base estimator; seeded so repeated runs build the same forests
rf = RandomForestClassifier(random_state=42)

# 5 sampled candidates, each scored by 5-fold cross-validated accuracy.
# random_state makes the sampled candidates reproducible.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    cv=5,
    n_iter=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(train_features, train_labels)

# best_score_ is the mean cross-validated score (accuracy here) of the best candidate
print('Best hyperparameters:', random_search.best_params_)
print('Best cross-validated accuracy:', random_search.best_score_)

In [None]:
# Logistic regression classifier with a fixed seed
# NOTE(review): apart from Volume the features are not scaled; the solver may
# need more iterations or standardised inputs to converge - TODO confirm.
lr_model = LogisticRegression(
    random_state=42,
)

# Fit the model on the training split
lr_model.fit(X=train_features, y=train_labels)

In [None]:
# Decision tree classifier (default hyperparameters, fixed seed), fitted on the
# training split
dt_model = DecisionTreeClassifier(
    random_state=42,
)
dt_model.fit(X=train_features, y=train_labels)

In [None]:
# Support vector classifier (default hyperparameters, fixed seed), fitted on the
# training split
# NOTE(review): apart from Volume the features are not scaled, which usually
# matters for SVMs - TODO confirm whether scaling is intended.
svm_model = SVC(
    random_state=42,
)
svm_model.fit(X=train_features, y=train_labels)

In [None]:
# Neural Network
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras import regularizers

# Feed-forward binary classifier: three ReLU hidden layers (128 -> 64 -> 32),
# each followed by 20% dropout, and a single sigmoid output unit whose kernel
# carries an L2 penalty of 0.01.
model = Sequential([
    Dense(128, activation='relu', input_shape=(train_features.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 200 epochs in batches of 32, evaluating on the validation split
# after every epoch
model.fit(
    train_features,
    train_labels,
    epochs=200,
    batch_size=32,
    validation_data=(validation_features, validation_labels),
)

In [41]:
# Predict on the held-out validation split with the Random Forest model
test_predictions = rf_model.predict(validation_features)

# Score the predictions against the true validation labels
accuracy, recall, f1 = (
    metric(validation_labels, test_predictions)
    for metric in (accuracy_score, recall_score, f1_score)
)

# Report the metrics (note: the 'Precisión' line prints the accuracy score)
print('Precisión: ', accuracy)
print('Recuperación: ', recall)
print('Puntuación F1: ', f1)

Precisión:  0.6892307692307692
Recuperación:  0.6956521739130435
Puntuación F1:  0.6892307692307692


In [42]:
# Predict labels for the test set with the Random Forest model
test_set_predictions = rf_model.predict(test_df)

# Map each row position (as a string key) to its predicted label as a plain int,
# so the values are JSON-serialisable
predictions_dict = {
    str(position): int(prediction)
    for position, prediction in enumerate(test_set_predictions)
}

# Write the mapping to predictions.json
with open('predictions.json', 'w') as f:
    json.dump(predictions_dict, f)
