In [106]:
from scipy.stats import spearmanr
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [107]:
def _read_headerless(path):
    """Read a CSV with header=None: the first line is data and columns get integer labels."""
    return pd.read_csv(path, header=None)


# Tables for the "multiple" and "single" datasets, train and test splits.
train_mult = _read_headerless('train_dataset_multiple.csv')
train_sing = _read_headerless('single_train.csv')
test_sing = _read_headerless('single_test.csv')
test_mult = _read_headerless('./multiple_test.csv')

# Embedding features for the training rows, one CSV row per sample.
train_embedings = _read_headerless('embedings_tensors_train.txt')

In [103]:
# Count the distinct values in every embedding column: iterating over the
# transposed frame yields one (label, column-values) pair per original column.
lens = [
    len(set(column_values))
    for _, column_values in train_embedings.transpose().iterrows()
]

In [105]:
# Distinct-value counts of the columns that are not constant (count other than 1).
[n_distinct for n_distinct in lens if n_distinct != 1]

[5, 2, 6, 3, 6, 2, 6, 5]

In [108]:
# Feature matrix: the embedding table converted to a NumPy array.
X = train_embedings.to_numpy()
# Target vector: the column at position 2 of the single-sequence training table.
y = train_sing.iloc[:, 2].to_numpy()

In [109]:
# Indices of the feature columns whose entries are zero in every row.
# np.argwhere returns them as a (k, 1) array of column positions.
idx = np.argwhere((X == 0).all(axis=0))


In [110]:
# Copy of the feature matrix with the all-zero columns (listed in idx) removed.
X_del = np.delete(arr=X, obj=idx, axis=1)

In [120]:
# Dimensions of the unfiltered feature matrix X (rows, columns); X_del is the column-filtered copy.
X.shape

(8539, 2560)

In [45]:
# For every row, count the positions at which the two strings in columns 0 and 1
# hold the same character.
# NOTE(review): the scan starts at index 1, so position 0 is never compared —
# confirm this is intentional and not an off-by-one.
# NOTE(review): indexing assumes the second string is at least as long as the first.
mask_list = []
for _, pair in train_mult.iloc[:, 0:2].iterrows():
    first, second = pair[0], pair[1]
    n_equal = 0
    for pos in range(1, len(first)):
        if first[pos] == second[pos]:
            n_equal += 1
    mask_list.append(n_equal)

In [119]:
# Boolean selector: True for rows whose match count in mask_list is exactly 1.
mask = [n_equal == 1 for n_equal in mask_list]
# Display the selected rows with all of their columns.
train_mult.iloc[mask, :]

Unnamed: 0,0,1,2,3
204,GSVETRSNGQTYTFRSPEEAREWAEKYGVQTWRTENGRVESRP,GSSWYNTGNPFAEEKEEVARSRTSYVPTTREETVRQRQWRGEG,0.14,2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_...
212,GSPELHVGNYTVTGNDEEAKKKAKKTFGPRRYTEDGETIQFQP,GSGRENPFEAVKQFKPQALYYEGETIKTVTKKETTPGDNGDRH,0.11,2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_...
281,GSVKLEENGQTYTFRTTEEAQRWAKKNGARELKSENGRVESRF,GSGSTKFETGWNKNSRVFTREAENREGQQLAETKAERERVKYL,0.24,2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_...
288,GSPELNVRGKTYDAGSEENAERQARKEGAQRITSDGNQVTVQV,GSVPQTAEQGVGENVDQSEYAERTSNLRVRATGNRIGEQADKK,0.28,2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_...
318,GSPEITSNNRTHTTDNPDKWAKEEAKKKGKTVKNENGEVKERG,GSIWEENTTDGTPKKESVEPNAKTKAHEEGVGRKKKNNRDNTK,0.35,2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_19_...
...,...,...,...,...
14208,SDESEKLSELARRLGLDEDQARTAKKLFSENPEKAKRYIKKAK,LAKLRLGKDKENFPSRAELKEAQSISKTKREKDAYAESDLEKR,0.54,0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_18_19...
14214,NEDEKKAKELEKRANGDTEKAKRLARKLGNERVERLIEKRQRK,NRRALREQREKAKEILKKRELRLEAEDTEDKGRKKVNNGKKAE,0.05,1_2_3_4_5_6_7_8_9_10_11_12_13_14_15_16_17_18_1...
14296,TTKDELRKVLKKLYGVSSEIAERIIRKYNSEQAEKDKRQLEKG,TKTVKEKKGQNLRSLGKDYVKRDRQKEEKEIAEIYLRLSIAES,0.55,1_2_3_4_5_6_8_9_10_11_12_13_14_15_16_17_18_19_...
14355,SDEAKKRAEELRKRGYSEDQIEKWARDQNNEDAREYLERQRKN,WAYQKRYRRNDEDEEEDLRAEQENKNKEAIGEQKLDRASRKRS,-0.08,0_1_2_3_5_6_7_8_9_10_11_12_13_14_15_16_17_18_1...


In [113]:
# Hold out 10% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_del,
    y,
    random_state=42,
    test_size=0.1,
)

In [114]:
# Random forest baseline: 1000 trees with a fixed seed. fit() returns the
# estimator itself, so construction and training are chained.
rf_model = RandomForestRegressor(random_state=42, n_estimators=1000).fit(X_train, y_train)

# Predictions for the held-out split.
y_rf_pred = rf_model.predict(X_test)

# Spearman rank correlation between true and predicted targets.
spearm_rf = spearmanr(y_test, y_rf_pred)
print(f"Spearman: {spearm_rf}")

Spearman: SignificanceResult(statistic=0.04337538918361097, pvalue=0.205401946001942)


In [115]:
# GradientBoostingRegressor
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

y_gb_pred = gb_model.predict(X_test)

# Calculate Spearman
spearm_gb = spearmanr(y_test, y_gb_pred)
print(f"Spearman: {spearm_gb}")

Spearman: SignificanceResult(statistic=0.042883123701136845, pvalue=0.2105968791572273)


In [116]:
# SVR (Support Vector Regressor)
svr_model = SVR(kernel='linear')  # You can experiment with different kernels
svr_model.fit(X_train, y_train)

y_svr_pred = svr_model.predict(X_test)
# Calculate Spearman
spearm_svr = spearmanr(y_test, y_svr_pred)
print(f"Spearman: {spearm_svr}")

Spearman: SignificanceResult(statistic=0.04257535046711635, pvalue=0.21389268040730808)


In [121]:
# CatBoostRegressor
catboost_model = CatBoostRegressor(iterations=10000, random_state=42, verbose=0)
catboost_model.fit(X_train, y_train)

y_catboost_pred = catboost_model.predict(X_test)
# Calculate Spearman
spearm_cb = spearmanr(y_test, y_catboost_pred)
print(f"Spearman: {spearm_cb}")

Spearman: SignificanceResult(statistic=0.04186155120538809, pvalue=0.22167900857239006)
