In [52]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from utils.file_utils import open_json

# 2. Load the training table and the feature-group definitions
dataset = pd.read_csv("./dataset/main_dataset.csv")
features = open_json("./dataset/features.json")

# Remove the columns listed under the "identification" feature group.
dataset = dataset.drop(columns=features["identification"])

# Switchboard: which feature groups (keys of features.json) feed the model.
use_features = {
    "identification": False,
    "basics": True,
    "blosum": True,
    "demask": True,
    "protein_analysis": True,
    "deltas_protein_analysis": True,
    "3D_structure_analysis": True,
    "dssp_3D_analysis": True,
    "target": False,
}

# Flat list of the column names of every enabled group, in features.json order.
feature_columns = [
    column
    for group, columns in features.items()
    if use_features[group]
    for column in columns
]

X = dataset[feature_columns]
# Keep the target as a 2-D (n_samples, 1) float array.
y = dataset[["ddG"]].to_numpy().astype(float)

# Sanity check: report how many missing values remain in features and target.
print(f"there are {X.isna().sum().sum()} na occurences in the X dataset")
print(f"there are {np.isnan(y).sum()} na occurences in the y dataset")

# Peek at the first three features of the second row.
X.iloc[1, :3]

there are 0 na occurences in the X dataset
there are 0 na occurences in the y dataset


wild_aa_int          11.0
mutated_aa_int        3.0
mutated_chain_int    65.0
Name: 1, dtype: float64

In [53]:
# 3. Feature scaling
# Standardise features and target with two separate scalers. Both fitted
# scalers stay in scope: sc_X is reused later to transform the test features.
#
# The inputs are taken straight from `dataset` rather than from the current
# X / y so that this cell is idempotent. Fitting on X / y themselves would, on
# a second run, re-fit sc_X on already-scaled data and fail on the flattened
# 1-D y. Keep the column selection in sync with the X / y definition in the
# loading cell.
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(dataset[feature_columns])
# StandardScaler needs a 2-D input; flatten afterwards so the regressor is
# fitted on a 1-D target.
y = sc_y.fit_transform(dataset[["ddG"]].values.astype(float)).ravel()

# Show the scaled second row together with the number of features.
X[1, :], len(X[1, :])

(array([ 1.41057529e-01, -7.98828451e-01, -1.27915516e-01, -6.61592897e-01,
         2.47340964e-01,  4.93923757e-01,  2.11656958e-01, -1.13802824e-01,
        -1.43432906e+00, -1.57020011e+00, -1.36189185e+00, -1.18023032e+00,
        -4.90464015e-01, -1.90570785e+00, -1.78869241e+00, -6.77788766e-01,
        -7.97958178e-01,  9.75948187e-01, -9.27786446e-02, -1.06171239e+00,
        -9.59537924e-01, -8.25306160e-01,  2.05608271e-01, -4.92466385e-02,
        -5.27967313e-02,  1.10115584e+00, -9.00290337e-01, -8.03548436e-02,
         1.35614183e+00, -1.22683087e-01,  6.54984678e-01, -2.26302966e-01,
         1.54858135e-01, -4.59664345e-02,  1.19775867e-01, -4.10125788e-02,
        -4.10334914e-02,  7.16791023e-01,  5.17175492e-02, -3.67340763e+00,
         3.68480985e+00, -6.09968198e-01, -5.75994351e-01,  1.53593190e+00,
         2.22672074e+00,  3.91719772e-01,  8.14749532e+00,  1.45391740e+00,
         5.84356896e-02,  1.21892679e+00,  9.32061370e-03,  1.02430721e+00,
        -3.8

In [54]:

# 4. Fit the Support Vector Regression model on the scaled training data.
# The kernel type is the key SVR parameter: it can be linear, polynomial or
# Gaussian. The problem is treated as non-linear here, which leaves polynomial
# or Gaussian; the RBF (Gaussian-type) kernel is the one selected.
regressor = SVR(kernel="rbf")
regressor.fit(X=X, y=y)

# Predicting on submission

In [55]:
# Load processed_test, i.e. test.csv with added information, and remove the
# columns listed under the "identification" feature group.
df = pd.read_csv("./dataset/processed_test.csv").drop(
    columns=features["identification"]
)

# Rebuild the flat column list with the same use_features switches as in training.
feature_columns = [
    column
    for group, columns in features.items()
    if use_features[group]
    for column in columns
]

X_test = df[feature_columns]

# Sanity check: count missing values, then preview the first rows and columns.
print(f"there are {X_test.isna().sum().sum()} na occurences in the X df")

print(X_test.iloc[:5, :3])

there are 0 na occurences in the X df
   wild_aa_int  mutated_aa_int  mutated_chain_int
0           10               4                 65
1           10               9                 65
2           10               0                 65
3            9               2                 65
4            9               5                 65


In [56]:
# Scale the test features with the scaler that was fitted on the training
# features. The input is read from df[feature_columns] instead of the current
# X_test so that re-running this cell never scales already-scaled data.
X_test = sc_X.transform(df[feature_columns])

# Peek at the first three scaled features of the second test row.
X_test[1, :3]

array([-0.02665269,  0.16235828, -0.12791552])

In [57]:
# Predict on the scaled test features with the fitted SVR. The regressor was
# fitted on the target after scaling with sc_y, so these predictions are in
# that scaled space rather than in the raw ddG units.
results = regressor.predict(X_test)
results[:10]  # preview the first ten predictions

array([ 0.14079074, -0.36843532, -0.17187205, -0.00180886, -0.19009549,
       -0.04443055, -0.14264955, -0.16276204, -0.45465314, -0.43073576])

In [62]:
# Build the submission table: one row per test sequence id, with the
# sign-flipped prediction stored in the "tm" column.
# NOTE(review): the sign flip presumably turns a predicted ddG into a score
# that increases with stability -- confirm the intended convention.
submission = pd.DataFrame({"seq_id": df["seq_id"], "tm": -results})
submission.head()

Unnamed: 0,seq_id,tm
0,31390,-0.140791
1,31391,0.368435
2,31392,0.171872
3,31393,0.001809
4,31394,0.190095


In [63]:
# Write the submission to a timestamped CSV so earlier submissions are never
# overwritten.
# datetime and Path are imported in the notebook's import cell.
submission_dir = Path("./submission")
# to_csv does not create missing directories: make sure the target exists.
submission_dir.mkdir(parents=True, exist_ok=True)

date_time = datetime.now()
timestamp = date_time.strftime("%Y-%m-%d_%H-%M-%S")
submission.to_csv(submission_dir / f"submission_{timestamp}.csv", index=False)
