# A Model to Predict Drug Solubility 

In [30]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [3]:
#import data
training_csv_path = 'C:/Users/joe/Downloads/training_full_set.csv'
test_csv_path = 'C:/Users/joe/Downloads/test_full_set.csv'
training_df = pd.read_csv(training_csv_path)
test_df = pd.read_csv(test_csv_path)

In [4]:
#Adding solubility column to test data to make sure its not lost during processing
training_df = training_df.rename(columns ={'S0 (mM)':'Solubility'})
test_df['Log_S'] = 12345

In [5]:
training_df['Log_S'] = np.log(training_df.Solubility)

In [6]:
#default axis=0, join='inner'
df = pd.concat([training_df,test_df], join='inner')
df = df.set_index('NAME')

In [7]:
df = df._get_numeric_data()
df = df.dropna(axis=1, how='any')

In [8]:
# seperate out the data frame into training and test again. currently have n number of records with log s =111
# need to seperate them into training and test again to generate model 
test_df = df[df.Log_S == 12345]
train_df = df[df.Log_S != 12345]

In [9]:
#new dataframe made up of everything but solubility
#and an array for input into feature reduction algorithm 
X_train_df = train_df.drop('Log_S', axis=1)
Y_train_df = train_df.loc[:,'Log_S']

In [10]:
X_train = X_train_df.values
y_train = Y_train_df.values

In [11]:
X_test_df = test_df.drop('Log_S', axis=1)
X_test = X_test_df.values

In [36]:
X_train_scaled = preprocessing.scale(X_train)
X_test_scaled = preprocessing.scale(X_test)

In [37]:
#using two different feature reduction methods 
lasso = Lasso()
elastic = ElasticNet()

lasso_selector = RFE(lasso, 14)
elastic_selector = RFE(elastic, 14)

lasso_model = lasso_selector.fit(X_train_scaled, y_train)
elastic_model = elastic_selector.fit(X_train_scaled, y_train)

In [15]:
X_lasso_columns = X_train_df.columns[lasso_model.get_support()]
X_elastic_columns = X_train_df.columns[elastic_model.get_support()]

In [16]:
y_lasso_predict = lasso_model.predict(X_test_scaled)

In [17]:
y_elastic_predict = elastic_model.predict(X_test_scaled)

In [23]:
lasso_score = lasso_model.score(X_train_scaled, y_train)
elastic_score = elastic_model.score(X_train_scaled, y_train)

In [25]:
print(lasso_score)
print(elastic_score)

-4.504596298553204
-5.0180741483997044


In [22]:
test_df['predicted_lasso'] = y_lasso_predict
test_df['predicted_elastic'] = y_elastic_predict

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [None]:
df_lasso_out = pd.merge(df,y_lasso[['precited_lasso']],how = 'left',left_index = True, right_index = True)
df_elastic_out = pd.merge(df,y_elastic[['precited_elastic']],how = 'left',left_index = True, right_index = True)

In [None]:
df_lasso_out.to_csv('C:/Users/joe/Documents/Chemistry/Data Driven Science/lasso predict.csv')
df_elastic_out.to_csv('C:/Users/joe/Documents/Chemistry/Data Driven Science/elastic predict.csv')

In [20]:
# repeating without using scaled data to compare results


In [19]:
raw_lasso_model = lasso_selector.fit(X_train, y_train)
raw_elastic_model = elastic_selector.fit(X_train, y_train)

































































































































































































































































































































































































































































































In [23]:
raw_y_lasso_predict = raw_lasso_model.predict(X_test)
raw_y_elastic_predict = raw_elastic_model.predict(X_test)

In [24]:
df_raw_y_lasso_predict = pd.DataFrame(data=raw_y_lasso_predict)
df_raw_y_elastic_predict = pd.DataFrame(data=raw_y_elastic_predict)
df_raw_y_lasso_predict.to_csv('C:/Users/joe/Documents/Chemistry/Data Driven Science/raw lasso predict.csv')
df_raw_y_elastic_predict.to_csv('C:/Users/joe/Documents/Chemistry/Data Driven Science/raw elastic predict.csv')