## Extracting data

In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
import numpy as np
import csv

def _descriptor_matrix(smiles_column):
    """Turn a column of SMILES strings into molecules and a descriptor matrix.

    Each SMILES string is parsed with ``Chem.MolFromSmiles`` and every entry of
    ``Descriptors._descList`` is evaluated on the resulting molecule (the
    callable is taken from index 1 of each entry).

    Returns a ``(molecules, features)`` pair where ``features`` is a NumPy
    array with one row per molecule and one column per descriptor.
    """
    # NOTE(review): presumably MolFromSmiles can return None for an
    # unparsable SMILES string, which would break the descriptor calls --
    # confirm the input files contain only valid SMILES.
    molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_column]
    feature_rows = [
        [descriptor[1](molecule) for descriptor in Descriptors._descList]
        for molecule in molecules
    ]
    return molecules, np.array(feature_rows)


# Training data: descriptor matrix X and label vector Y
df = pd.read_csv('train.csv')
mols, X = _descriptor_matrix(df['SMILES_canonical'])
Y = np.array(df['target_feature'])

# Test data (no labels): descriptor matrix used for the final predictions
df_test = pd.read_csv('test.csv')
mols_test, X_test = _descriptor_matrix(df_test['SMILES_canonical'])


## Data splitting, scaling, and PCA

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Split the labelled data into a training subset (80%) and a validation subset (20%).
# A fixed random_state keeps the split -- and every metric computed from it --
# reproducible across kernel restarts (3 matches the seed used by the models below).
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=3)

# Scaling and PCA are fitted on the training subset only, after splitting,
# to prevent data leakage into the validation and test sets.

# Scale data: fit on the training subset, then apply the same transform elsewhere
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Perform PCA and transform data; a float n_components asks PCA to keep enough
# components to explain that fraction (95%) of the training variance.
pca = PCA(0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)


print(X_train_pca.shape, X_val_pca.shape, X_test_pca.shape)

(4987, 80) (1247, 80) (6233, 80)


## Export to CSV


In [9]:
def export_scv(filename, prediction_list):
    """Write predictions to a two-column CSV file.

    The file gets a header row ``Unique_ID,target_feature`` followed by one row
    per prediction. ``Unique_ID`` is a running counter starting at 1, in the
    order the predictions are supplied.

    Parameters
    ----------
    filename : path of the CSV file to create (an existing file is overwritten).
    prediction_list : iterable of predicted values; any iterable works,
        including generators, and an empty one produces a header-only file.
    """
    column_names = ['Unique_ID', 'target_feature']

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=column_names)
        writer.writeheader()
        # enumerate(start=1) supplies the 1-based ID without a manual counter
        # and without requiring len() or indexing on the input.
        writer.writerows(
            {'Unique_ID': unique_id, 'target_feature': prediction}
            for unique_id, prediction in enumerate(prediction_list, start=1)
        )

## Random forest model

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

# Random forest classifier, trained on the scaled (non-PCA) descriptors
forest = RandomForestClassifier(
    max_depth=15,
    max_leaf_nodes=100,
    class_weight='balanced',
    bootstrap=True,
    random_state=3,
)
forest.fit(X_train_scaled, y_train)

# Predict the validation labels from X_val_scaled and report
# balanced accuracy, precision and recall
y_pred_forest = forest.predict(X_val_scaled)
forest_balanced_accuracy = balanced_accuracy_score(y_val, y_pred_forest)
forest_precision = precision_score(y_val, y_pred_forest)
forest_recall = recall_score(y_val, y_pred_forest)
print('balanced accuracy:', forest_balanced_accuracy, '\n', 'precision:', forest_precision, '\n', 'recall:', forest_recall)

# Prediction on the unlabelled test data
y_pred_forest_test = forest.predict(X_test_scaled)

# Exporting predictions to a CSV file (disabled; uncomment to write the file)
#export_scv('forest_predictions.csv', y_pred_forest_test)



balanced accuracy: 0.9316009194057975 
 precision: 0.9045643153526971 
 recall: 0.8861788617886179


## Logistic regression model

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score

# Logistic regression classifier, trained on the PCA-reduced descriptors
reg = LogisticRegression(max_iter=200, class_weight='balanced')
reg.fit(X_train_pca, y_train)

# Predict the validation labels from X_val_pca and report
# balanced accuracy, precision and recall
y_pred_reg = reg.predict(X_val_pca)
reg_balanced_accuracy = balanced_accuracy_score(y_val, y_pred_reg)
reg_precision = precision_score(y_val, y_pred_reg)
reg_recall = recall_score(y_val, y_pred_reg)
print('balanced accuracy:', reg_balanced_accuracy, '\n', 'precision:', reg_precision, '\n', 'recall:', reg_recall)

# Prediction on the unlabelled test data
y_pred_logistic_test = reg.predict(X_test_pca)

# Exporting predictions to a CSV file (disabled; uncomment to write the file)
#export_scv('logistic_predictions.csv', y_pred_logistic_test)


balanced accuracy: 0.9487748024333391 
 precision: 0.7727272727272727 
 recall: 0.967479674796748


## Multi-layer Perceptron Classifier

In [10]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron classifier: one hidden layer of 200 logistic units,
# trained with the adam solver on the scaled (non-PCA) descriptors
mlp = MLPClassifier(solver = 'adam', alpha = 0.00001, hidden_layer_sizes = (200,), activation = 'logistic', random_state = 3)
mlp.fit(X_train_scaled, y_train)

# Predict the validation labels from X_val_scaled and report
# balanced accuracy, precision and recall
y_pred_mlp = mlp.predict(X_val_scaled)
print('balanced accuracy:',balanced_accuracy_score(y_val, y_pred_mlp),'\n', 'precision:',precision_score(y_val, y_pred_mlp), '\n', 'recall:', recall_score(y_val, y_pred_mlp))

# Prediction on the unlabelled test data.
# Stored under its own name so it no longer overwrites the logistic-regression
# predictions (y_pred_logistic_test) and matches the export call below.
y_pred_nnc_test = mlp.predict(X_test_scaled)

# Exporting predictions to a CSV file (disabled; uncomment to write the file)
#export_scv('nnc_predictions.csv', y_pred_nnc_test)


balanced accuracy: 0.9696157501035549 
 precision: 0.9512195121951219 
 recall: 0.9512195121951219
