In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the training data
train_data = pd.read_csv("train.csv")

# Data preprocessing
# Assuming that your dataset has columns like 'Date', 'Open', 'Close', 'Volume', and 'Strategy'
# You may need to perform more advanced data preprocessing based on the dataset characteristics.

# Feature engineering
# In this example, we'll add lag features for 'Open' prices:

# 1. Lag features for 'Open' prices
train_data['Date'] = pd.to_datetime(train_data['Date'])
train_data.set_index('Date', inplace=True)
train_data['Volume_SMA'] = train_data['Volume'].rolling(window=10).mean()
train_data['Volume_Rate_Of_Change'] = train_data['Volume'].pct_change(1)
train_data['SMA_Open'] = train_data['Open'].rolling(window=10).mean()
train_data['EMA_Open'] = train_data['Open'].ewm(span=10, adjust=False).mean()

X = train_data[['Open', 'Volume', 'Volume_SMA', 'Volume_Rate_Of_Change', 'SMA_Open', 'EMA_Open']]
y_close = train_data['Close']

# Impute missing values
imputer = SimpleImputer(strategy='mean')  # You can choose a different strategy based on your data
X = imputer.fit_transform(X)

# Split the data into training and validation sets
X_train, X_valid, y_close_train, y_close_valid = train_test_split(
    X, y_close, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)

# Train a Random Forest Regressor to predict 'Close' prices
close_model = LinearRegression()
close_model.fit(X_train, y_close_train)

# Make predictions on the validation set
close_preds = close_model.predict(X_valid)

# Evaluate the model
mae = mean_absolute_error(y_close_valid, close_preds)

print(f"Mean Absolute Error for Close Price Prediction: {mae}")

# Load the test data
test_data = pd.read_csv("test.csv")

# Feature engineering for the test data (similar to the training data)
test_data['Date'] = pd.to_datetime(test_data['Date'], format='%Y-%m-%d')  # Specify the correct date format
test_data.set_index('Date', inplace=True)
test_data['Volume_SMA'] = test_data['Volume'].rolling(window=10).mean()
test_data['Volume_Rate_Of_Change'] = test_data['Volume'].pct_change(1)
test_data['SMA_Open'] = test_data['Open'].rolling(window=10).mean()
test_data['EMA_Open'] = test_data['Open'].ewm(span=10, adjust=False).mean()

# Select the same features as used in the training data
X_test = test_data[['Open', 'Volume', 'Volume_SMA', 'Volume_Rate_Of_Change', 'SMA_Open', 'EMA_Open']]

# Impute missing values in the test data
X_test = imputer.transform(X_test)

# Normalize test data
X_test = scaler.transform(X_test)

# Make predictions on the test data
test_close_preds = close_model.predict(X_test)

# Add the predictions to the test data
test_data['Close'] = test_close_preds
columns_to_drop = ['Volume_SMA','Volume_Rate_Of_Change','SMA_Open','EMA_Open']
test_data = test_data.drop(columns=columns_to_drop)
test_data.to_csv('test.csv',index=False)
smape = np.mean(np.abs(close_preds - y_close_valid) / (np.abs(close_preds) + np.abs(y_close_valid)))

print(smape)

Mean Absolute Error for Close Price Prediction: 2.6489800195181905
0.018137997925656204


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("train.csv")  # Replace with your dataset file path

# Encode the 'Strategy' column into numerical labels
le = LabelEncoder()
data['Strategy'] = le.fit_transform(data['Strategy'])

# Feature Engineering
# Create lag features for both 'Open' and 'Close' prices
n_lags = 10
for lag in range(1, n_lags + 1):
    data[f'Open_Lag_{lag}'] = data['Open'].shift(lag)
    data[f'Close_Lag_{lag}'] = data['Close'].shift(lag)

# Additional Features (Add more features here)
# Example: Calculate 20-day Simple Moving Average (SMA) of 'Close' prices
n_sma = 20
data['SMA_Close'] = data['Close'].rolling(window=n_sma).mean()

# Select the features for training
# You can include more features in this list
features = ['Open'] + ['Close'] + [f'Open_Lag_{lag}' for lag in range(1, n_lags + 1)] + \
           [f'Close_Lag_{lag}' for lag in range(1, n_lags + 1)] + ['SMA_Close']

X = data[features]
y = data['Strategy']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create individual XGBoost classifiers with different hyperparameters
model1 = xgb.XGBClassifier(verbosity=0, n_estimators=100, max_depth=4, learning_rate=0.1)
model2 = xgb.XGBClassifier(verbosity=0, n_estimators=100, max_depth=5, learning_rate=0.05)
model3 = xgb.XGBClassifier(verbosity=0, n_estimators=100, max_depth=6, learning_rate=0.02)

# Create a VotingClassifier ensemble of the individual XGBoost models
ensemble_model = VotingClassifier(estimators=[('model1', model1), ('model2', model2), ('model3', model3)], voting='soft')

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Make predictions
y_pred = ensemble_model.predict(X_test)

# Decode the numerical labels back to the original 'Strategy' labels
y_pred = le.inverse_transform(y_pred)
y_test = le.inverse_transform(y_test)

# Evaluate the ensemble model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


In [18]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load the test dataset
test_data = pd.read_csv("test.csv")  # Replace with your test dataset file path

# Feature Engineering for the Test Dataset
n_lags = 10
for lag in range(1, n_lags + 1):
    test_data[f'Open_Lag_{lag}'] = test_data['Open'].shift(lag)
    test_data[f'Close_Lag_{lag}'] = test_data['Close'].shift(lag)

# Additional Features (Add more features here)
n_sma = 20
test_data['SMA_Close'] = test_data['Close'].rolling(window=n_sma).mean()

# Select the features for testing
features = ['Open'] + ['Close'] + [f'Open_Lag_{lag}' for lag in range(1, n_lags + 1)] + \
           [f'Close_Lag_{lag}' for lag in range(1, n_lags + 1)] + ['SMA_Close']

X_test = test_data[features]

# Make predictions using the ensemble model
y_pred = ensemble_model.predict(X_test)

# Decode the numerical labels back to the original 'Strategy' labels
y_pred = le.inverse_transform(y_pred)

# Add the predicted 'Strategy' back to the test dataset
test_data['Strategy'] = y_pred
columns_to_drop = ['Open','Volume','Close_Lag_1','Open_Lag_1','Close_Lag_2','Open_Lag_2','Close_Lag_3','Open_Lag_3','Close_Lag_4','Open_Lag_4','Close_Lag_4','Open_Lag_5','Close_Lag_5','Open_Lag_6','Close_Lag_6','Open_Lag_7','Close_Lag_7','Open_Lag_8','Close_Lag_8','Open_Lag_9','Close_Lag_9','Open_Lag_10','Close_Lag_10','SMA_Close']
# Save the test dataset with the predicted 'Strategy' to a new CSV file
test_data = test_data.drop(columns=columns_to_drop)
test_data.to_csv('submission.csv',index=False)