KEEP IN MIND:
- before submission, train on the entire train and val sets
- how to deal with the test set?
- clean up the code and everything else before submitting
- make sure to zip properly for submission
- are all important columns included?

In [247]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from scipy.stats import spearmanr

In [248]:
# functions used


# extends data with simple features:
# > convert strand from + - to 1 0
# > gene length
# > transcription site length
# > ratio of transcription site length to gene length
def extend_df(df, cols_to_keep):
    """Return the selected feature columns plus four simple derived features.

    The derived columns are appended after ``cols_to_keep`` in this order:

    - ``strand_binary``: ``strand`` mapped ``'+'`` -> 1 and ``'-'`` -> 0
      (any other value becomes NaN).
    - ``gene_length``: ``gene_end - gene_start``.
    - ``trans_site_len``: ``TSS_end - TSS_start``.
    - ``trans_gene_ratio``: ``trans_site_len / gene_length``; NaN where
      ``gene_length`` is zero.

    The input frame is left untouched, so the calling cell can be re-run on
    the same raw frame without accumulating columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``strand``, ``gene_start``, ``gene_end``, ``TSS_start``,
        ``TSS_end`` and every column named in ``cols_to_keep``.
    cols_to_keep : sequence of str
        Existing columns of ``df`` to carry over into the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``cols_to_keep`` followed by the derived columns.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # gene length
    gene_length = df['gene_end'] - df['gene_start']

    # transcription site length
    trans_site_len = df['TSS_end'] - df['TSS_start']

    # ratio of transcription site length to gene length.
    # A zero-length gene would produce +/-inf here; infinities are rejected by
    # the StandardScaler applied downstream, so mark them as missing instead.
    trans_gene_ratio = (trans_site_len / gene_length).replace(
        [np.inf, -np.inf], np.nan
    )

    # assign() builds a new frame instead of writing into the caller's df
    extended = df.assign(
        strand_binary=df['strand'].map({'+': 1, '-': 0}),
        gene_length=gene_length,
        trans_site_len=trans_site_len,
        trans_gene_ratio=trans_gene_ratio,
    )

    new_cols = ['strand_binary', 'gene_length', 'trans_site_len', 'trans_gene_ratio']

    # list(...) lets callers pass a tuple or pandas Index as well as a list
    return extended[list(cols_to_keep) + new_cols]

In [249]:
## Load the already feature-engineered data

# Feature / target table locations, one pair per data set and split
X_1_train_path, y_1_train_path = 'data/X1-train/features.tsv', 'data/X1-train/y.tsv'
X_1_val_path, y_1_val_path = 'data/X1-val/features.tsv', 'data/X1-val/y.tsv'
X_2_train_path, y_2_train_path = 'data/X2-train/features.tsv', 'data/X2-train/y.tsv'
X_2_val_path, y_2_val_path = 'data/X2-val/features.tsv', 'data/X2-val/y.tsv'


def _read_tsv(path):
    """Read one tab-separated table into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Read every table (features first, then targets, for each split)
X_1_train, y_1_train = _read_tsv(X_1_train_path), _read_tsv(y_1_train_path)
X_1_val, y_1_val = _read_tsv(X_1_val_path), _read_tsv(y_1_val_path)
X_2_train, y_2_train = _read_tsv(X_2_train_path), _read_tsv(y_2_train_path)
X_2_val, y_2_val = _read_tsv(X_2_val_path), _read_tsv(y_2_val_path)

In [250]:
# Further preprocessing: select features, standardize, then stack both data sets

# Pre-computed peak features that are passed on to the model
cols_to_keep = [
    'DNase_num_peaks', 'DNase_avg_peaks',
    'H3K4me1_num_peaks', 'H3K4me1_avg_peaks',
    'H3K4me3_num_peaks', 'H3K4me3_avg_peaks',
    'H3K27ac_num_peaks', 'H3K27ac_avg_peaks',
]

# Alternative feature set (currently disabled):
# cols_to_keep = ['dnase_val', 'dnase_dist', 'H3K4me3_signal', 'H3K4me3_distance', 'H3K4me3_num_peaks',
#                 'H3K4me3_avg_peaks', 'H3K27ac_signal', 'H3K27ac_distance', 'H3K27ac_num_peaks',
#                 'H3K27ac_avg_peaks']

# Append the basic strand / length features to every split and drop all other columns
X_1_train, X_1_val, X_2_train, X_2_val = [
    extend_df(split, cols_to_keep)
    for split in (X_1_train, X_1_val, X_2_train, X_2_val)
]

# Standardize: each data set gets its own scaler, fitted on its training split
# only and then reused for the matching validation split.
scaler_1 = StandardScaler()
X_1_train = scaler_1.fit_transform(X_1_train)
X_1_val = scaler_1.transform(X_1_val)

scaler_2 = StandardScaler()
X_2_train = scaler_2.fit_transform(X_2_train)
X_2_val = scaler_2.transform(X_2_val)

# Stack data set 1 on top of data set 2, for features and targets alike
X_train = np.vstack((X_1_train, X_2_train))
X_val = np.vstack((X_1_val, X_2_val))

y_train = pd.concat((y_1_train, y_2_train), ignore_index=True)
y_val = pd.concat((y_1_val, y_2_val), ignore_index=True)


In [251]:
# Wrap the stacked arrays in LightGBM datasets; the validation set is built
# with the training set as its reference.
train_data = lgb.Dataset(data=X_train, label=y_train)
val_data = lgb.Dataset(data=X_val, label=y_val, reference=train_data)

# LightGBM parameters: gradient-boosted trees trained with the Huber objective
params = {
    'boosting_type': 'gbdt',
    'objective': 'huber',
    'alpha': 0.9,
    'learning_rate': 0.06,
    'num_leaves': 40,
    'n_estimators': 100,
}

# Fit the booster, tracking both the training and the validation set
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
)

# Score the validation features
y_pred_val = model.predict(X_val)

# Rank correlation between the true and predicted validation targets
spearman_corr, _ = spearmanr(y_val, y_pred_val)
print(f'Spearman correlation on validation set: {spearman_corr}')



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000311 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1352
[LightGBM] [Info] Number of data points in the train set: 28620, number of used features: 11
[LightGBM] [Info] Start training from score 50.915895
Spearman correlation on validation set: 0.7671909991090514


In [252]:
# train final model with all data (train and valid)

In [253]:
# TODO: add zipping of the submission at the end, as they suggested