NOTE: Feature engineering is performed by functions in gen.py (see scripts). To create the feature-engineered datasets used in this notebook, run bash.sh.

In [1]:
import os

import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

In [2]:
# functions used


# extend_df adds a few simple features to the data:
# > strand converted from '+' / '-' to 1 / 0
# > gene length
# > transcription site length
# > ratio of transcription site length to gene length
def extend_df(df, cols_to_keep):
    """Return selected columns of ``df`` plus four simple derived features.

    The input frame is left untouched; the derived columns are computed on a
    new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``strand``, ``gene_start``, ``gene_end``,
        ``TSS_start`` and ``TSS_end`` as well as every name in
        ``cols_to_keep``.
    cols_to_keep : sequence of str
        Existing columns to carry over into the result.

    Returns
    -------
    pandas.DataFrame
        ``cols_to_keep`` followed by:

        * ``strand_binary`` - 1 for '+', 0 for '-', NaN for any other value
        * ``gene_length`` - ``gene_end - gene_start``
        * ``trans_site_len`` - ``TSS_end - TSS_start``
        * ``trans_gene_ratio`` - ``trans_site_len / gene_length``, NaN where
          the gene length is zero
    """
    gene_length = df['gene_end'] - df['gene_start']
    trans_site_len = df['TSS_end'] - df['TSS_start']

    # Dividing by a zero gene length would yield +/-inf, which StandardScaler
    # refuses to fit on. Masking the zeros turns the ratio into NaN (missing).
    nonzero_gene_length = gene_length.where(gene_length != 0)

    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    extended = df.assign(
        strand_binary=df['strand'].map({'+': 1, '-': 0}),
        gene_length=gene_length,
        trans_site_len=trans_site_len,
        trans_gene_ratio=trans_site_len / nonzero_gene_length,
    )

    new_cols = ['strand_binary', 'gene_length', 'trans_site_len', 'trans_gene_ratio']

    # list() lets callers pass any sequence (tuple, Index, ...) of column names.
    return extended[list(cols_to_keep) + new_cols]

In [3]:
## Load the already feature-engineered data


def _read_tsv(path):
    """Read a tab-separated file into a DataFrame."""
    return pd.read_csv(path, sep='\t')


# Paths: one (features, target) pair per dataset split
X_1_train_path, y_1_train_path = 'data/X1-train/features.tsv', 'data/X1-train/y.tsv'
X_1_val_path, y_1_val_path = 'data/X1-val/features.tsv', 'data/X1-val/y.tsv'
X_2_train_path, y_2_train_path = 'data/X2-train/features.tsv', 'data/X2-train/y.tsv'
X_2_val_path, y_2_val_path = 'data/X2-val/features.tsv', 'data/X2-val/y.tsv'

# Load data (features first, then the matching targets)
X_1_train, y_1_train = _read_tsv(X_1_train_path), _read_tsv(y_1_train_path)
X_1_val, y_1_val = _read_tsv(X_1_val_path), _read_tsv(y_1_val_path)
X_2_train, y_2_train = _read_tsv(X_2_train_path), _read_tsv(y_2_train_path)
X_2_val, y_2_val = _read_tsv(X_2_val_path), _read_tsv(y_2_val_path)

In [4]:
# Further preprocessing: feature selection, per-dataset standardization, stacking


# Peak-based features carried over from the feature-engineered files
cols_to_keep = [
    'DNase_num_peaks', 'DNase_avg_peaks',
    'H3K4me1_num_peaks', 'H3K4me1_avg_peaks',
    'H3K4me3_num_peaks', 'H3K4me3_avg_peaks',
    'H3K27ac_num_peaks', 'H3K27ac_avg_peaks',
]

# Alternative feature set (currently disabled):
# cols_to_keep = ['dnase_val', 'dnase_dist', 'H3K4me3_signal', 'H3K4me3_distance', 'H3K4me3_num_peaks',
#                 'H3K4me3_avg_peaks', 'H3K27ac_signal', 'H3K27ac_distance', 'H3K27ac_num_peaks',
#                 'H3K27ac_avg_peaks']


# Add the basic transcription site features to every split.
# NOTE: the X_* names are rebound here (and again to arrays below), so this
# cell must be run exactly once after the loading cell.
X_1_train, X_1_val, X_2_train, X_2_val = (
    extend_df(split, cols_to_keep)
    for split in (X_1_train, X_1_val, X_2_train, X_2_val)
)


# Standardize: each dataset gets its own scaler, fitted on its training split
# and then applied unchanged to the matching validation split.
scaler_1 = StandardScaler()
X_1_train = scaler_1.fit_transform(X_1_train)
X_1_val = scaler_1.transform(X_1_val)

scaler_2 = StandardScaler()
X_2_train = scaler_2.fit_transform(X_2_train)
X_2_val = scaler_2.transform(X_2_val)


# Stack both datasets row-wise into one training and one validation matrix
X_train = np.vstack((X_1_train, X_2_train))
X_val = np.vstack((X_1_val, X_2_val))

# Concatenate the targets in the same order as the feature matrices
y_train = pd.concat([y_1_train, y_2_train], ignore_index=True)
y_val = pd.concat([y_1_val, y_2_val], ignore_index=True)


In [5]:
# Wrap the stacked arrays as LightGBM datasets; the validation set references
# the training set.
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, reference=train_data, label=y_val)

# LightGBM hyperparameters (gradient-boosted trees with Huber loss)
params = dict(
    boosting_type='gbdt',
    objective='huber',
    alpha=0.9,
    learning_rate=0.5,
    num_leaves=50,
    n_estimators=100,
)

# Fit on the training split, tracking both splits as evaluation sets
model = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'val'],
)

# Score the validation split
y_pred_val = model.predict(X_val)

# Rank correlation between true and predicted validation targets
spearman_corr, _ = spearmanr(y_val, y_pred_val)
print(f'Spearman correlation on validation set: {spearman_corr}')



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000695 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1588
[LightGBM] [Info] Number of data points in the train set: 28620, number of used features: 12
[LightGBM] [Info] Start training from score 50.915895
Spearman correlation on validation set: 0.7871156138306246


In [6]:
# --- Final model for the real test-data prediction ---
# Train + validation rows are merged so the final model sees all labelled data.
X_final_train = np.vstack((X_train, X_val))
y_final_train = pd.concat([y_train, y_val], ignore_index=True)

# LightGBM dataset over the merged rows
final_train_data = lgb.Dataset(X_final_train, label=y_final_train)

# Same hyperparameters as the validated model; only the training data changes
final_model = lgb.train(
    params=params,
    train_set=final_train_data,
    valid_sets=[final_train_data],
    valid_names=['train'],
)

print("Final model trained on the combined dataset for deployment.")



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1588
[LightGBM] [Info] Number of data points in the train set: 32568, number of used features: 12
[LightGBM] [Info] Start training from score 52.361132
Final model trained on the combined dataset for deployment.


In [9]:
# Original test info: provides the gene order expected in the submission
# (i.e. the order before feature engineering)
original_test = pd.read_csv('data/CAGE-train/X3_test_info.tsv', sep='\t')
gene_names_orig_order = original_test['gene_name']


# Feature-engineered test data (rows are in a different order)
X_test_path = 'data/X3-test/features.tsv'
X_test = pd.read_csv(X_test_path, sep='\t')

# Remember the gene names in the feature-engineered row order before the
# name column is dropped by the feature selection below
gene_names_test_feat_eng = X_test['gene_name']

# Add the basic transcription site features and keep the model's columns
X_test = extend_df(X_test, cols_to_keep)

# Standardize the test features with a scaler fitted on the test set itself
scaler_test = StandardScaler()
X_test = scaler_test.fit_transform(X_test)

In [10]:
# FINAL GENE EXPRESSION PREDICTION FOR TEST DATA
# Uses the model trained on the combined train + validation data; the rows of
# `pred` follow the order of the feature-engineered test file.
pred = final_model.predict(X_test)
# Bare expression so the notebook displays the prediction array
pred

array([ 8.65539254, 12.89496183,  8.65539254, ..., 41.27389379,
        8.91241666, 29.23367441])

In [13]:
# Pair every prediction with its gene name. Both inputs are in the row order of
# the feature-engineered test file, i.e. not yet in the submission order.
final_pred_unsorted = pd.concat(
    [gene_names_test_feat_eng, pd.Series(pred)],
    axis=1,
    ignore_index=True,
)

# Give the two positional columns their final names
final_pred_unsorted.columns = ['gene_name', 'gex_predicted']

# Look the predictions up by gene name in the original test-file order
_pred_by_gene = final_pred_unsorted.set_index('gene_name')
final_pred_sorted = _pred_by_gene.loc[gene_names_orig_order].reset_index()

test_genes = final_pred_sorted

In [16]:
# Store predictions in a ZIP.
# Upload this zip on the project website under "Your submission".
# Zip this notebook along with the conda environment (and README, optional) and upload this under "Your code".

save_dir = './submission_res'  # TODO
file_name = 'gex_predicted.csv'         # PLEASE DO NOT CHANGE THIS
zip_name = "Makonnen_Mikael_Project1.zip" # TODO
save_path = f'{save_dir}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (no-op when it is already there).
os.makedirs(save_dir, exist_ok=True)

# `test_genes` already holds the predictions in the original gene order.
test_genes[['gene_name', 'gex_predicted']].to_csv(save_path, compression=compression_options)