In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import math
import sklearn.metrics

In [2]:
# Load the training and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Extract IDs for submission
submission_ids = test_data.iloc[:, 1]

In [3]:
train_data.head()

Unnamed: 0,id,id_seqpos,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,id_159f0ff46,id_159f0ff46_44,A,.,H,1.7786,0.1195,0.7346,0.079,0.368,...,(,),S,S,C,A,.,),I,S
1,id_d73546f4e,id_d73546f4e_38,G,.,I,0.4397,0.1033,0.163,0.0799,0.0878,...,(,.,S,H,C,C,.,.,B,H
2,id_f3037fedf,id_f3037fedf_34,G,(,S,0.112,0.1375,0.0,0.0744,0.1835,...,(,(,S,S,A,G,(,(,S,S
3,id_468300749,id_468300749_12,G,(,S,0.0622,0.0471,0.1179,0.1075,0.0657,...,.,(,E,S,A,U,.,(,E,S
4,id_69e3878a2,id_69e3878a2_28,U,(,S,0.1378,0.1352,0.4069,0.2168,0.3375,...,(,(,S,S,G,G,(,.,S,H


In [4]:
# Mapping for encoding categorical features
sequence_enc_map = {'A': 0, 'G': 1, 'C': 3, 'U': 2}
structure_enc_map = {'.': 0, '(': 1, ')': 1}
looptype_enc_map = {'S': 6, 'E': 2, 'H': 0, 'I': 5, 'X': 4, 'M': 3, 'B': 1}

enc_targets = ['sequence', 'a1_sequence', 'a2_sequence', 'a3_sequence',
               'b1_sequence', 'b2_sequence', 'b3_sequence', 'b4_sequence',
               'a4_sequence', 'b5_sequence', 'a5_sequence', 'structure', 'a1_structure',
               'b1_structure', 'b2_structure', 'a2_structure', 'b3_structure', 'a3_structure',
               'b4_structure', 'a4_structure', 'a5_structure', 'b5_structure', 'predicted_loop_type',
               'b1_predicted_loop_type', 'a1_predicted_loop_type', 'b2_predicted_loop_type', 'a2_predicted_loop_type',
               'b3_predicted_loop_type', 'a3_predicted_loop_type', 'b4_predicted_loop_type', 'a4_predicted_loop_type',
               'b5_predicted_loop_type', 'a5_predicted_loop_type', 'predicted_loop_type'
               ]
enc_maps = [sequence_enc_map, sequence_enc_map, sequence_enc_map, sequence_enc_map,
            sequence_enc_map, sequence_enc_map, sequence_enc_map,
            sequence_enc_map, sequence_enc_map, sequence_enc_map, sequence_enc_map,
            structure_enc_map, structure_enc_map, structure_enc_map, structure_enc_map,
            structure_enc_map, structure_enc_map, structure_enc_map, structure_enc_map,
            structure_enc_map, structure_enc_map, structure_enc_map, structure_enc_map,
            looptype_enc_map, looptype_enc_map, looptype_enc_map,
            looptype_enc_map, looptype_enc_map, looptype_enc_map,
            looptype_enc_map, looptype_enc_map, looptype_enc_map, looptype_enc_map, looptype_enc_map
            ]


In [5]:
train_data

Unnamed: 0,id,id_seqpos,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,id_159f0ff46,id_159f0ff46_44,A,.,H,1.7786,0.1195,0.7346,0.0790,0.3680,...,(,),S,S,C,A,.,),I,S
1,id_d73546f4e,id_d73546f4e_38,G,.,I,0.4397,0.1033,0.1630,0.0799,0.0878,...,(,.,S,H,C,C,.,.,B,H
2,id_f3037fedf,id_f3037fedf_34,G,(,S,0.1120,0.1375,0.0000,0.0744,0.1835,...,(,(,S,S,A,G,(,(,S,S
3,id_468300749,id_468300749_12,G,(,S,0.0622,0.0471,0.1179,0.1075,0.0657,...,.,(,E,S,A,U,.,(,E,S
4,id_69e3878a2,id_69e3878a2_28,U,(,S,0.1378,0.1352,0.4069,0.2168,0.3375,...,(,(,S,S,G,G,(,.,S,H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,id_796128149,id_796128149_27,U,),S,0.1244,0.0393,0.2672,0.0728,0.1479,...,),),S,S,A,A,.,.,H,M
130556,id_8a466e770,id_8a466e770_39,A,.,X,0.0201,0.0351,0.1271,0.0583,0.0580,...,.,.,H,X,A,U,.,.,H,X
130557,id_998669e76,id_998669e76_52,A,.,H,0.4597,0.0601,0.2014,0.0515,0.2838,...,.,.,H,I,U,U,.,),H,S
130558,id_6fe4a1abc,id_6fe4a1abc_45,G,.,I,0.3634,0.1063,0.1928,0.0738,0.1885,...,),.,S,I,G,G,),),S,S


In [6]:
# Apply encoding to categorical features
for target_col, enc_map in zip(enc_targets, enc_maps):
    train_data[target_col] = train_data[target_col].apply(lambda x: enc_map.get(x, -1))
    test_data[target_col] = test_data[target_col].apply(lambda x: enc_map.get(x, -1))

In [19]:
test_data

Unnamed: 0,id,id_seqpos,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,id_001f94081,id_001f94081_0,1,0,-1,0.1359,0.2613,2.3375,0.2631,0.1501,...,-1,0,-1,2,-1,0,-1,1,-1,6
1,id_001f94081,id_001f94081_4,0,0,-1,0.1314,0.1798,0.2635,0.1000,0.1369,...,0,1,2,6,-1,3,-1,1,-1,6
2,id_001f94081,id_001f94081_8,2,1,-1,0.0756,0.1056,0.0530,0.0517,0.0705,...,0,0,2,0,0,2,0,0,2,0
3,id_001f94081,id_001f94081_9,3,1,-1,0.1087,0.1896,0.5348,0.1474,0.1588,...,1,0,6,0,0,0,0,0,2,0
4,id_001f94081,id_001f94081_13,2,0,-1,0.1388,0.1810,0.7699,0.1417,0.1382,...,1,0,6,0,2,1,1,1,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32635,id_fff546103,id_fff546103_39,0,1,-1,0.0091,0.0344,0.0140,0.0303,0.0284,...,1,1,6,6,3,0,1,0,6,0
32636,id_fff546103,id_fff546103_52,0,1,-1,0.0237,0.0390,0.0806,0.0529,0.0361,...,1,1,6,6,1,1,1,1,6,6
32637,id_fff546103,id_fff546103_62,0,0,-1,0.0655,0.0672,0.5538,0.1022,0.0694,...,0,1,3,6,1,3,1,1,6,6
32638,id_fff546103,id_fff546103_65,0,1,-1,0.0218,0.0391,0.1642,0.0611,0.0520,...,1,1,6,6,2,3,1,1,6,6


In [7]:
train_data

Unnamed: 0,id,id_seqpos,sequence,structure,predicted_loop_type,reactivity,reactivity_error,deg_Mg_pH10,deg_error_Mg_pH10,deg_pH10,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,id_159f0ff46,id_159f0ff46_44,0,0,-1,1.7786,0.1195,0.7346,0.0790,0.3680,...,1,1,6,6,3,0,0,1,5,6
1,id_d73546f4e,id_d73546f4e_38,1,0,-1,0.4397,0.1033,0.1630,0.0799,0.0878,...,1,0,6,0,3,3,0,0,1,0
2,id_f3037fedf,id_f3037fedf_34,1,1,-1,0.1120,0.1375,0.0000,0.0744,0.1835,...,1,1,6,6,0,1,1,1,6,6
3,id_468300749,id_468300749_12,1,1,-1,0.0622,0.0471,0.1179,0.1075,0.0657,...,0,1,2,6,0,2,0,1,2,6
4,id_69e3878a2,id_69e3878a2_28,2,1,-1,0.1378,0.1352,0.4069,0.2168,0.3375,...,1,1,6,6,1,1,1,0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130555,id_796128149,id_796128149_27,2,1,-1,0.1244,0.0393,0.2672,0.0728,0.1479,...,1,1,6,6,0,0,0,0,0,3
130556,id_8a466e770,id_8a466e770_39,0,0,-1,0.0201,0.0351,0.1271,0.0583,0.0580,...,0,0,0,4,0,2,0,0,0,4
130557,id_998669e76,id_998669e76_52,0,0,-1,0.4597,0.0601,0.2014,0.0515,0.2838,...,0,0,0,5,2,2,0,1,0,6
130558,id_6fe4a1abc,id_6fe4a1abc_45,1,0,-1,0.3634,0.1063,0.1928,0.0738,0.1885,...,1,0,6,5,1,1,1,1,6,6


In [23]:
test_data.drop(['id', 'id_seqpos'], axis=1).head(1).to_csv("Before_Encoding.csv", index=False)

In [8]:

# Prepare features and target variables
target_columns = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
features = train_data.drop(target_columns, axis=1)
features = features.drop(['id', 'id_seqpos'], axis=1)
targets = train_data[target_columns]

In [9]:


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, targets, test_size=0.2, random_state=79)

In [11]:
# Initialize and train different regressor models
def train_regressor(model, X_train, y_train, X_val):

    """
    Trains a regressor model and returns the predictions on the validation set

    Parameters
    ----------
    model : Regressor model
        Regressor model to be trained
    X_train : pandas dataframe
        Training features
    y_train : pandas dataframe
        Training targets
    X_val : pandas dataframe
        Validation features

    Returns
    -------
    y_pred : numpy array
        Predictions on the validation set

    """

    regressor = MultiOutputRegressor(model)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_val)
    return y_pred

models = [
    XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4, subsample=0.8),
    GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4, subsample=0.8),
    CatBoostRegressor(n_estimators=1000, learning_rate=0.05, max_depth=4)
]
model_names = ['XGBoost', 'Gradient Boosting', 'CatBoost']

for model, model_name in zip(models, model_names):
    print(f"Training {model_name} model...")
    y_pred = train_regressor(model, X_train, y_train, X_val)
    model_mcrmse = mean_squared_error(y_val, y_pred)
    print(f"{model_name} MCRMSE:", model_mcrmse)

Training XGBoost model...
XGBoost MCRMSE: 0.11053216469615257
Training Gradient Boosting model...
Gradient Boosting MCRMSE: 0.11404849963169801
Training CatBoost model...
0:	learn: 0.7427093	total: 167ms	remaining: 2m 46s
1:	learn: 0.7331762	total: 177ms	remaining: 1m 28s
2:	learn: 0.7242923	total: 186ms	remaining: 1m 1s
3:	learn: 0.7163590	total: 193ms	remaining: 48.1s
4:	learn: 0.7087718	total: 200ms	remaining: 39.8s
5:	learn: 0.7016632	total: 205ms	remaining: 34s
6:	learn: 0.6949824	total: 209ms	remaining: 29.7s
7:	learn: 0.6888490	total: 213ms	remaining: 26.5s
8:	learn: 0.6833194	total: 218ms	remaining: 24s
9:	learn: 0.6780041	total: 223ms	remaining: 22.1s
10:	learn: 0.6729519	total: 227ms	remaining: 20.4s
11:	learn: 0.6683190	total: 232ms	remaining: 19.1s
12:	learn: 0.6637141	total: 236ms	remaining: 17.9s
13:	learn: 0.6595685	total: 242ms	remaining: 17s
14:	learn: 0.6557113	total: 246ms	remaining: 16.1s
15:	learn: 0.6524444	total: 251ms	remaining: 15.4s
16:	learn: 0.6489820	total:

In [12]:
# Use the best model (based on the lowest MCRMSE) for prediction on the test data
best_model_idx = np.argmin([mean_squared_error(y_val, train_regressor(model, X_train, y_train, X_val)) for model in models])
best_model = models[best_model_idx]
best_model.fit(features, targets)
test_features = test_data.drop(['id', 'id_seqpos'], axis=1)
final_y_pred = best_model.predict(test_features)

KeyboardInterrupt: 

In [16]:
test_data.drop(['id', 'id_seqpos'], axis=1).head(1)

Unnamed: 0,sequence,structure,predicted_loop_type,reactivity_error,deg_error_Mg_pH10,deg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_50C,deg_error_50C,...,b4_structure,a4_structure,b4_predicted_loop_type,a4_predicted_loop_type,b5_sequence,a5_sequence,b5_structure,a5_structure,b5_predicted_loop_type,a5_predicted_loop_type
0,1,0,-1,0.1359,0.2613,2.3375,0.2631,0.1501,0.6382,0.2167,...,-1,0,-1,2,-1,0,-1,1,-1,6


In [18]:
test_data.drop(['id', 'id_seqpos'], axis=1).head(1).to_csv('web_app_test.csv', index=False)

In [None]:
test_features

In [None]:
final_y_pred

In [None]:
# Prepare submission DataFrame
submission_df = pd.DataFrame({
    'id_seqpos': submission_ids,
    'reactivity': final_y_pred[:, 0],
    'deg_Mg_pH10': final_y_pred[:, 1],
    'deg_Mg_50C': final_y_pred[:, 2]
})

In [None]:
# Save submission to a CSV file
submission_df.to_csv("submission04.csv", index=False)