In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Other packages

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import KFold

# Loading datasets and processing the data

In [4]:
train=pd.read_csv('full_train.csv')
sentiment=pd.read_csv('concatenated_sentiment_ff.csv')

In [5]:
train.head()

Unnamed: 0,Date,CL=F,GC=F,HG=F,JPY=X,SI=F,^DJI,^GSPC,^IXIC,^RUT,^TNX,^TYX,^VIX,target
0,2000-08-30,33.400002,273.899994,0.885,106.610001,4.93,11103.009766,1502.589966,4103.810059,532.330017,5.8,5.736,17.690001,1
1,2000-08-31,33.099998,278.299988,0.885,106.669998,5.003,11215.099609,1517.680054,4206.350098,537.890015,5.729,5.674,16.84,1
2,2000-09-01,33.380001,277.0,0.889,105.949997,5.004,11238.780273,1520.77002,4234.330078,541.909973,5.675,5.655,17.530001,0
3,2000-09-05,33.799999,275.799988,0.906,105.860001,4.998,11260.610352,1507.079956,4143.180176,539.02002,5.683,5.663,19.82,0
4,2000-09-06,34.950001,274.200012,0.9015,105.93,4.983,11310.639648,1492.25,4013.340088,536.320007,5.712,5.7,20.790001,1


In [6]:
sentiment.head()

Unnamed: 0,Date,Title,Sentiment
0,2010-01-04,Global Stocks and Commodities Rally on First T...,positive
1,2010-01-04,Dollar Slumps Amid Worldwide Manufacturing Imp...,negative
2,2010-01-04,Oil Prices Surge Above $81 a Barrel Due to U.S...,negative
3,2010-01-04,"S&P 500 Sees 1.6 Percent Increase, Hits 15-Mon...",positive
4,2010-01-04,"Argentina's Merval Index Reaches Record High, ...",positive


- We need to create a numerical signal from the Sentiment column (ideally between -1 and 1) to give a general sentiment about the day.  
- We have the generic, widely used signal in literature ie S=(number of positive-number of negative)/total number.  
- We tried experimenting with a weighted sum as well ie S=(lambda x number of positive-(1-lambda) x number of negative)/(lambda x number of positive+(1-lambda)number of negative).  
Here lambda>0.5 would mean that the agents have an optimistic point of view/risk-taking approach (negative news wouldn't have as much effect as positive news), whereas lambda<0.5 would mean that the agents have a pessimistic point of view/conservative approach.

In [7]:
def signal_func(L): #signal used in literature
    S=0
    for ch in L:
        if ch=='positive':
            S+=1
        if ch=='negative':
            S-=1
    return S/len(L)
def weighted_sum(L,lamda=0.5): #weighted sum
    S_p=S_n=0
    for ch in L:
        if ch=='positive':
            S_p+=lamda
        if ch=='negative':
            S_n+=1-lamda
    if S_p==S_n==0:
        return 0
    return (S_p-S_n)/len(L)

First, we tried the non-weighted sum.

In [8]:
#grouping by date and applying the signal_func to the results of sentiment analysis for each day
sentiment_result = sentiment.groupby('Date')['Sentiment'].apply(signal_func).reset_index()

Now we can merge all the necessary data in a big dataset

In [9]:
#changing the Date column to datetime to ensure correct merge
sentiment_result['Date']=pd.to_datetime(sentiment_result['Date'].values)
train['Date']=pd.to_datetime(train['Date'].values)
#inner join on Date
train=pd.merge(sentiment_result, train, on='Date', how='inner')

In [11]:
train

Unnamed: 0,Date,Sentiment,CL=F,GC=F,HG=F,JPY=X,SI=F,^DJI,^GSPC,^IXIC,^RUT,^TNX,^TYX,^VIX,target
0,2000-09-11,-1.000000,35.099998,273.100006,0.9130,106.089996,4.885000,11195.490234,1489.260010,3896.350098,533.619995,5.758,5.721,18.400000,0
1,2000-09-29,-1.000000,30.860001,273.600006,0.9155,108.129997,4.899000,10650.919922,1436.510010,3672.820068,521.369995,5.778,5.869,20.570000,0
2,2000-10-03,0.000000,32.070000,271.600006,0.8985,108.889999,4.913000,10719.740234,1426.459961,3455.830078,504.670013,5.867,5.945,21.850000,1
3,2000-10-12,-1.000000,36.099998,276.399994,0.8795,107.750000,4.923000,10034.580078,1329.780029,3074.679932,462.970001,5.715,5.809,30.510000,1
4,2000-10-17,-1.000000,32.950001,271.100006,0.8805,107.910004,4.843000,10089.709961,1349.969971,3213.959961,470.880005,5.665,5.761,27.840000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
852,2011-07-29,-0.812500,95.699997,1628.300049,4.4740,77.730003,40.091999,12143.240234,1292.280029,2756.379883,797.030029,2.805,4.132,25.250000,0
853,2011-08-01,-0.500000,94.889999,1619.000000,4.4040,77.212997,39.298000,12132.490234,1286.939941,2744.610107,792.849976,2.740,4.071,23.660000,0
854,2011-08-02,-0.785714,93.790001,1641.900024,4.3890,77.459999,40.081001,11866.620117,1254.050049,2669.239990,767.010010,2.624,3.919,24.790001,1
855,2011-08-03,-0.310345,91.930000,1663.400024,4.3200,77.334999,41.747002,11896.440430,1260.339966,2693.070068,772.780029,2.599,3.873,23.379999,0


Feature selection

We experimented with keeping and eliminating features manually and we found that removing 'SI=F' yielded the best results for us.

In [57]:
L=list(train.columns)
L.remove('Date')
L.remove('target')
L.remove('SI=F') 
X=train[L]
Y=train['target']

Splitting training data into training set and validation set.

In [58]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

We tried using a scikit learn function for feature selection as well but it didn't yield better results.

In [None]:
#selector = SelectKBest(score_func=chi2, k=10)
#X_train_selected = selector.fit_transform(X_train, Y_train)
#X_test_selected = selector.transform(X_test)

# Trying classical machine learning classification algorithms: Decision tree, random forest, xgboost, logistic regression

Throughout this process we'll finetune the hyperparameters with the optuna library

## Decision tree

In [59]:
#checking the performance without specifying the parameters
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
score=f1_score(Y_test, Y_pred)
print('f1 score=', score)

f1 score= 0.5989847715736041


In [None]:
#selecting the best hyperparameters
def objective_dec_tree(trial):
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Define the hyperparameter search space
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    splitter = trial.suggest_categorical('splitter', ['best', 'random'])
    max_depth = trial.suggest_int('max_depth', 1, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    # Initialize and train the model
    model = DecisionTreeClassifier(
        criterion=criterion,
        splitter=splitter,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, Y_train)
    
    # Make predictions and evaluate the model
    preds = model.predict(X_valid)
    score=f1_score(Y_valid, preds)
    return score

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective_dec_tree, n_trials=100)

# Get the best parameters and best score
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2024-06-06 21:44:41,245] A new study created in memory with name: no-name-0bd7e8e3-34ce-43bb-b89b-39e47ae71ad7
[I 2024-06-06 21:44:41,261] Trial 0 finished with value: 0.6153846153846154 and parameters: {'criterion': 'gini', 'splitter': 'random', 'max_depth': 1, 'min_samples_split': 15, 'min_samples_leaf': 12}. Best is trial 0 with value: 0.6153846153846154.
[I 2024-06-06 21:44:41,262] Trial 1 finished with value: 0.6311475409836066 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 1, 'min_samples_split': 14, 'min_samples_leaf': 6}. Best is trial 1 with value: 0.6311475409836066.
[I 2024-06-06 21:44:41,284] Trial 2 finished with value: 0.5139664804469274 and parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 4, 'min_samples_split': 16, 'min_samples_leaf': 20}. Best is trial 1 with value: 0.6311475409836066.
[I 2024-06-06 21:44:41,292] Trial 3 finished with value: 0.6209677419354839 and parameters: {'criterion': 'gini', 'splitter': 'random', 'm

Best parameters: {'criterion': 'entropy', 'splitter': 'best', 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 20}
Best score: 0.6386554621848739


## Random Forest

In [None]:
model = RandomForestClassifier()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
score=f1_score(Y_test, Y_pred)
print('f1 score=', score)

f1 score= 0.59375


In [None]:
def objective_random_forest(trial):
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
    # Define the hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 10, 200)
    max_depth = trial.suggest_int('max_depth', 1, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])
    
    # Initialize and train the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=42
    )
    model.fit(X_train, Y_train)
    
    # Make predictions and evaluate the model
    preds = model.predict(X_valid)
    score=f1_score(Y_valid, preds)
    
    return score

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective_random_forest, n_trials=100)

# Get the best parameters and best score
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2024-06-06 21:49:55,693] A new study created in memory with name: no-name-88610a31-42ec-4908-b75c-7375e7a9d88c
[I 2024-06-06 21:49:55,953] Trial 0 finished with value: 0.5866666666666667 and parameters: {'n_estimators': 190, 'max_depth': 3, 'min_samples_split': 7, 'min_samples_leaf': 2, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.5866666666666667.
[I 2024-06-06 21:49:56,235] Trial 1 finished with value: 0.5157894736842106 and parameters: {'n_estimators': 90, 'max_depth': 6, 'min_samples_split': 10, 'min_samples_leaf': 14, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.5866666666666667.
[I 2024-06-06 21:49:56,348] Trial 2 finished with value: 0.6337448559670782 and parameters: {'n_estimators': 91, 'max_depth': 1, 'min_samples_split': 19, 'min_samples_leaf': 18, 'max_features': 'log2', 'bootstrap': True}. Best is trial 2 with value: 0.6337448559670782.
[I 2024-06-06 21:49:56,496] Trial 3 finished with value: 0.6311475409836066 and

Best parameters: {'n_estimators': 90, 'max_depth': 1, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': True}
Best score: 0.6363636363636364


## Logistic regression

For the logistic regression, we experimented with the LASSO and RIDGE penalizations and selected the best performing one and the best hyperparameter with optuna.

In [None]:
model_regress = LogisticRegression()
model_regress.fit(X_train, Y_train)
Y_pred = model_regress.predict(X_test)
score=f1_score(Y_test, np.round(Y_pred))
print('f1 score=', score)

f1 score= 0.6521739130434783


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
def objective_regress(trial):
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Define the hyperparameter search space
    C = trial.suggest_loguniform('C', 1e-4, 1e4)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = 'liblinear' if penalty == 'l1' else 'lbfgs'
    
    # Initialize and train the model
    model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=10000)
    model.fit(X_train, Y_train)
    
    # Make predictions and evaluate the model
    preds = model.predict(X_valid)
    score=f1_score(Y_valid, preds)
    return score

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective_regress, n_trials=100)

# Get the best parameters and best score
print("Best parameters (Optuna):", study.best_params)
print("Best score (Optuna):", study.best_value)

# Evaluate on the test set
best_optuna_model = LogisticRegression(**study.best_params, solver='liblinear' if study.best_params['penalty'] == 'l1' else 'lbfgs', max_iter=10000)
best_optuna_model.fit(X_train, Y_train)
Y_pred = best_optuna_model.predict(X_test)
print("Test Accuracy (Optuna):", f1_score(Y_test, Y_pred))

[I 2024-06-06 22:06:14,560] A new study created in memory with name: no-name-569e6cee-2e0e-448a-9075-78111892f554
  C = trial.suggest_loguniform('C', 1e-4, 1e4)
[I 2024-06-06 22:06:14,932] Trial 0 finished with value: 0.5964912280701754 and parameters: {'C': 0.0006157450990732934, 'penalty': 'l2'}. Best is trial 0 with value: 0.5964912280701754.
  C = trial.suggest_loguniform('C', 1e-4, 1e4)
[I 2024-06-06 22:06:15,178] Trial 1 finished with value: 0.56 and parameters: {'C': 678.3814316528747, 'penalty': 'l1'}. Best is trial 0 with value: 0.5964912280701754.
  C = trial.suggest_loguniform('C', 1e-4, 1e4)
[I 2024-06-06 22:06:15,320] Trial 2 finished with value: 0.56 and parameters: {'C': 2524.980029072977, 'penalty': 'l1'}. Best is trial 0 with value: 0.5964912280701754.
  C = trial.suggest_loguniform('C', 1e-4, 1e4)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preproc

Best parameters (Optuna): {'C': 0.00010030785637449065, 'penalty': 'l1'}
Best score (Optuna): 0.6349206349206349
Test Accuracy (Optuna): 0.706766917293233


## XGBoost

In [None]:
model_XGB = XGBClassifier()
model_XGB.fit(X_train, Y_train)
Y_pred = model_XGB.predict(X_test)
score=f1_score(Y_test, Y_pred)
print('f1 score=', score)

f1 score= 0.5698924731182796


In [None]:
def objective_XGB(trial):
    X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.2, random_state=42)
    
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'subsample': trial.suggest_float('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 1.0)
    }
    
    model = XGBClassifier(**param)
    model.fit(X_train, Y_train)
    preds = model.predict(X_valid)
    score=f1_score(Y_valid, preds)
    
    return score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_XGB, n_trials=100)

# Get the best parameters and best score
print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2024-06-07 10:23:03,486] A new study created in memory with name: no-name-8776770c-135e-4637-9ee2-28ba4eea75df
[I 2024-06-07 10:23:03,701] Trial 0 finished with value: 0.5402298850574713 and parameters: {'n_estimators': 196, 'learning_rate': 0.14647663230183336, 'max_depth': 5, 'subsample': 0.8727307269755952, 'colsample_bytree': 0.9545138707706515}. Best is trial 0 with value: 0.5402298850574713.
[I 2024-06-07 10:23:03,854] Trial 1 finished with value: 0.5333333333333333 and parameters: {'n_estimators': 179, 'learning_rate': 0.05695711565935688, 'max_depth': 5, 'subsample': 0.9198395029258059, 'colsample_bytree': 0.8988396238789153}. Best is trial 0 with value: 0.5402298850574713.
[I 2024-06-07 10:23:04,071] Trial 2 finished with value: 0.5573770491803278 and parameters: {'n_estimators': 194, 'learning_rate': 0.0263060024898018, 'max_depth': 6, 'subsample': 0.854227279798709, 'colsample_bytree': 0.8820982811707584}. Best is trial 2 with value: 0.5573770491803278.
[I 2024-06-07 10:2

Best parameters: {'n_estimators': 127, 'learning_rate': 0.06820467796385368, 'max_depth': 6, 'subsample': 0.9897260768924012, 'colsample_bytree': 0.9893899543604808}
Best score: 0.5901639344262295


# Cross-validation

In [24]:
kf = KFold(n_splits=5) #k-fold cross-validation with k=5
Models={'Decision tree':DecisionTreeClassifier(**{'criterion': 'entropy', 'splitter': 'best', 'max_depth': 2, 'min_samples_split': 5, 'min_samples_leaf': 20}),
        'Random Forest':RandomForestClassifier(**{'n_estimators': 90, 'max_depth': 1, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_features': None, 'bootstrap': True}),
        'Logistic regression':LogisticRegression(**{'C': 0.00010030785637449065, 'penalty': 'l1'},max_iter=10000,solver = 'liblinear'),
        'XGBoost': XGBClassifier(**{'n_estimators': 127, 'learning_rate': 0.06820467796385368, 'max_depth': 6, 'subsample': 0.9897260768924012, 'colsample_bytree': 0.9893899543604808})}

In [25]:
scores={k: [] for k in Models}
for k in scores:
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]
        model=Models[k]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores[k].append(f1_score(y_test, y_pred))
scores={k: np.mean(v) for k,v in scores.items()}

In [26]:
scores

{'Decision tree': 0.4970327424713947,
 'Random Forest': 0.6945006422648758,
 'Logistic regression': 0.6941348204723491,
 'XGBoost': 0.4965655487985585}

After cross-validation, we conclude that the best models are Random Forest and Logistic regression

In [45]:
#fitting the two selected models on the whole training set
model_regress=Models['Logistic regression']
model_random_forest=Models['Random Forest']
model_regress.fit(X,Y)
model_random_forest.fit(X,Y)

# Preparing the test set

In [37]:
test=pd.read_csv('test.csv')

In [38]:
#Handling missing values
test['Date']=pd.to_datetime(test['Date'].values)
test.set_index('Date',inplace=True)
test.interpolate(method='time',inplace=True)
test.reset_index()

In [41]:
test=pd.merge(sentiment_result, test, on='Date', how='inner')

In [42]:
test

Unnamed: 0,Date,Sentiment,CL=F,EURUSD=X,GC=F,HG=F,JPY=X,SI=F,XWD.TO,^DJI,^GSPC,^IXIC,^RUT,^TNX,^TYX,^VIX
0,2011-08-08,-0.673077,81.309998,1.429409,1710.199951,3.957500,78.253998,39.374001,20.521042,10809.849609,1119.459961,2357.689941,650.960022,2.33900,3.66300,48.000000
1,2011-08-09,0.413793,79.300003,1.417073,1740.000000,3.967000,77.584999,37.876999,21.162325,11239.769531,1172.530029,2482.520020,696.159973,2.18200,3.57300,35.060001
2,2011-08-10,-0.407407,82.889999,1.438311,1781.300049,3.885500,77.050003,39.325001,20.591183,10719.940430,1120.760010,2381.050049,660.210022,2.13700,3.53800,42.990002
3,2011-08-11,-0.347826,85.720001,1.412469,1748.800049,4.004500,76.759003,38.655998,21.412827,11143.309570,1172.640015,2492.679932,695.890015,2.33600,3.78700,39.000000
4,2011-08-12,0.068966,85.379997,1.422637,1740.199951,4.009500,76.860001,39.101002,21.653307,11269.019531,1178.810059,2507.979980,697.500000,2.23700,3.70300,36.360001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,2011-12-26,0.800000,100.924997,1.304495,1596.824951,3.418875,78.080002,28.784250,22.174349,12292.012207,1265.405029,2623.559937,750.477493,2.01425,3.04525,21.615000
101,2011-12-27,-0.533333,101.339996,1.305057,1594.199951,3.404000,78.029999,28.697001,22.174349,12291.349609,1265.430054,2625.199951,751.309998,2.00900,3.03900,21.910000
102,2011-12-28,-0.862069,99.360001,1.307207,1562.900024,3.360000,77.870003,27.191999,22.174349,12151.410156,1249.640015,2589.979980,735.210022,1.90800,2.89700,23.520000
103,2011-12-29,0.296296,99.650002,1.292725,1539.900024,3.365500,77.945999,27.274000,22.134270,12287.040039,1263.020020,2613.739990,744.979980,1.89700,2.90700,22.650000


In [49]:
X = test[L]
Y_regress = model_regress.predict(X)
Y_random_forest=model_random_forest.predict(X)

Creating file for logistic regression

In [51]:
test['target']=Y_regress

In [52]:
test['id']=test.index.values

In [53]:
test[['id','target']].to_csv('submission_regression.csv',index=False)

and for random forest

In [55]:
test['target']=Y_random_forest
test[['id','target']].to_csv('submission_random_forest.csv',index=False)