In [60]:
import pandas as pd
from scipy import stats as st
from sklearn.svm import LinearSVR

SEED = 0

## Read in Data

In [64]:
df = pd.read_csv("../../src/data/source_data/Series3_6.15.17_padel.csv").set_index("Name")
test = pd.isna(df["IC50"])
train = ~test

## Remove Features with 0 Variance

In [65]:
before = len(df.columns) - 1 
print("Removing Features with no Variance...")
df = df.loc[:, df.std() > 0]
print("\t  Features Removed: {}".format(before - len(df.columns)))
print("\tFeatures Remaining: {}".format(len(df.columns)))

Removing Features with no Variance...
	  Features Removed: 350
	Features Remaining: 1094


## Remove Features with Less than 3 Unique Values

In [67]:
before = len(df.columns) - 1
print("Removing Features with Less than 3 unique values...")
df = df.loc[:, df.nunique() > 3]
print("\t  Features Removed: {}".format(before - len(filtered_features)))
print("\tFeatures Remaining: {}".format(len(filtered_features)))

Removing Features with Less than 3 unique values...
	  Features Removed: 674
	Features Remaining: 339


## Remove Features Based on Kolmogorov-Smirnov Test Comparing Test and Train Features

In [68]:
filtered_features = list()
a = 0.10
before = len(df.columns)

for f in df.dropna(axis=1).columns:
    ks = st.ks_2samp(df.loc[test, f], df.loc[train, f])
    # if p-value > a, add to list
    if ks[1] > a:
        filtered_features.append(f)
print("Remove Features where the Test and Train sets are different...")
print("\t  Features Removed: {}".format(before - len(filtered_features)))
print("\tFeatures Remaining: {}".format(len(filtered_features)))

Remove Features where the Test and Train sets are different...
	  Features Removed: 675
	Features Remaining: 339


In [69]:
x_train = df.loc[train, filtered_features]
y_train = df.loc[train, "IC50"]
x_test  = df.loc[test, filtered_features]

## Run SVR on all Selected Features

In [71]:
svr = LinearSVR(random_state=SEED)
svr.fit(x_train, y_train)
svr.score(x_train,y_train)

0.38667542271162014