# 9 Month Modelling

## EnviroTox Database

### Set up

In [31]:
import pandas as pd
from sklearn import svm
import numpy as np
import itertools
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler


In [32]:
# Model families to train; the training cell looks for the keys 'RF' and 'SVM'.
model_types = ['RF', 'SVM']

In [33]:
# Train/test CSV inputs and the text log that later cells append results to.
file_train, file_test = 'envtox_single_train.csv', 'envtox_single_test.csv'  # if connected to drive
text_file = 'log.txt'

In [34]:
# Column to predict. Options: 'classification_binary', 'classification_ternary',
# 'Classification', 'Effect_value_(mgL-1)' (2, 3, 5, reg)
target = 'classification_binary'
# Feature groups used as model input: Pub, 2d, 3d
selection = ['2d']
# False trains with default hyperparameters; any other value takes the grid-search path.
grid = False

In [35]:
# Start a fresh log (mode "w" truncates an existing file) and record the run configuration.
header_parts = ["Log file for model performances: \n", "Input type: "]
header_parts.extend(item+" " for item in selection)
header_parts.extend(["\nTarget: ", target])
with open(text_file, "w") as file:
    file.writelines(header_parts)

### Read data

In [36]:
# Load both splits; the first CSV column becomes the row index.
df_train, df_test = (pd.read_csv(path, index_col=0) for path in (file_train, file_test))

### Prepare

In [37]:
# Widths of the positional 2d and 3d descriptor blocks.
len_2d = 1444
len_3d = 431
# Column position at which the 2d block starts; it depends on the dataset variant
# named in the training file. NOTE: len_meta stays undefined when the file name
# contains neither 'single' nor 'combined'.
if 'single' in file_train:
  len_meta = 7
elif 'combined' in file_train:
  len_meta = 8


In [38]:
def extract_relevant_desc_fp(df, selection, target):
  """Select the id column, the target column and the requested feature columns.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that is sliced by column position.
  selection : iterable of str
      '2d' and '3d' pick fixed positional blocks derived from the module-level
      len_meta, len_2d and len_3d. Any other entry picks every column whose
      name contains that entry as a substring.
  target : str
      Name of the target column.

  Returns
  -------
  pandas.DataFrame
      Columns in this order: the column at position 1 (cid), the target,
      the selected feature blocks in the order given and, when the
      module-level file_train contains 'combined', the last column of df.
  """
  index_list=[]
  # meta data
  index_list.append([1]) # cid
  #target data
  index_list.append([df.columns.get_loc(target)])
  # input data
  for selection_item in selection:
    if selection_item=='2d':
      index_list.append(range(len_meta, len_meta+len_2d))
    # elif (not a second if): otherwise '2d' would also reach the name-based
    # branch below and re-append every column whose name contains "2d".
    elif selection_item=='3d':
      index_list.append(range(len_meta+len_2d,len_meta+len_2d+len_3d))
    else:
      index_list.append([df.columns.get_loc(x) for x in df.columns if selection_item in x])
  #target_species
  if 'combined' in file_train:
    index_list.append([-1])
  return df.iloc[:,list(itertools.chain.from_iterable(index_list))]

In [39]:
# Reduce both splits to the id, target and selected feature columns.
df_train, df_test = (
  extract_relevant_desc_fp(df_train, selection, target),
  extract_relevant_desc_fp(df_test, selection, target),
)

In [40]:
# Discard every row that contains at least one missing value, in both splits.
df_train = df_train.dropna(axis='index')
df_test = df_test.dropna(axis='index')

In [41]:
def labelencoder(train, test):
    """Replace the toxicity labels in the "Classification" column with integer codes.

    Both frames are modified in place and the mapping used is appended to the
    log file named by the module-level text_file.

    Returns the same two frames as a (train, test) tuple.
    """
    class_codes = {'Highly toxic': 3, 'Moderately toxic': 2, 'Nontoxic': 0, 'Slightly toxic': 1, 'Very highly toxic': 4}
    for frame in (train, test):
        frame.replace({"Classification": class_codes}, inplace=True)
    with open(text_file, "a") as file:
        file.write("\n"+str(class_codes))
    return train, test

In [42]:
# Integer-encode the labels only when the 'Classification' column is the target.
if target == 'Classification':
  df_train, df_test = labelencoder(df_train, df_test)

In [43]:
# Regression target: model log10 of the effect value rather than the raw value.
if target == 'Effect_value_(mgL-1)':
  for frame in (df_train, df_test):
    frame[target] = np.log10(frame[target])

In [None]:
# Standardise the columns at positions 2 .. len_2d+1, fitting the scaler on the
# training split only and applying the same transform to the test split.
if '2d' in selection:
  scaler = StandardScaler()
  train_2d_cols = df_train.columns[2:len_2d+2]
  test_2d_cols = df_test.columns[2:len_2d+2]
  scaler.fit(df_train[train_2d_cols])
  df_train[train_2d_cols] = scaler.transform(df_train[train_2d_cols])
  df_test[test_2d_cols] = scaler.transform(df_test[test_2d_cols])

In [45]:
# remove low/zero variance features
def remove_low_variance(input_data, variance_threshold=0.0):
  """Return input_data restricted to the columns kept by VarianceThreshold.

  The selector is fitted on every column of input_data with the given
  variance_threshold; the surviving columns are then selected by name.
  """
  selector = VarianceThreshold(variance_threshold)
  selector.fit(input_data)
  kept_columns = input_data.columns[selector.get_support(indices=True)]
  return input_data[kept_columns]

In [46]:
# Apply the variance filter to the training split (default threshold 0.0).
df_train = remove_low_variance(df_train)

In [47]:
# Keep in the test split exactly the columns that survived in the training split.
kept_columns = df_train.columns
df_test = df_test[kept_columns]

In [48]:
def extract_correlated_features(df, threshold):
    """Return the set of column names that correlate strongly with an earlier column.

    A column is included when the absolute value of its correlation (as
    computed by DataFrame.corr) with at least one column positioned before
    it exceeds `threshold`.
    """
    corr_abs = df.corr().abs()
    flagged = set()
    for position, name in enumerate(corr_abs.columns):
        # Only the part of the row left of the diagonal: pairs with earlier columns.
        if (corr_abs.iloc[position, :position] > threshold).any():
            flagged.add(name)
    return flagged

In [49]:
# Drop features whose absolute correlation with an earlier feature exceeds 0.9.
if '2d' in selection:
  # Columns 0 and 1 are skipped; for files that are not 'single' the last column is skipped as well.
  feature_block = df_train.iloc[:,2:] if 'single' in file_train else df_train.iloc[:,2:-1]
  corr_col = extract_correlated_features(feature_block, 0.9)
  print(len(set(corr_col)))
  df_train.drop(corr_col,axis=1, inplace=True)
  df_test.drop(corr_col,axis=1, inplace=True)

606


In [50]:
# log file: record the final shapes of both splits
shape_line = "\nData shape; Train: " + str(df_train.shape) + " Test: " + str(df_test.shape) + "\n"
with open(text_file, "a") as file:
    file.write(shape_line)

### Report

In [51]:
def report_scores_classification(model, df, name_string):
    """Append a classification report for `model` on `df` to the log file.

    Column 1 of `df` supplies the true labels and columns 2 onward the model
    inputs. `name_string` is written directly in front of the report heading.
    Nothing is returned.
    """
    features = df[df.columns[2:]]
    truth = df[df.columns[1]]
    report_text = str(classification_report(truth, model.predict(features)))

    with open(text_file, "a") as file:
      file.write(name_string + "classification Report:  \n" + report_text + "\n")

In [52]:
def report_scores_reg(model, df, name_string):
    """Append R2 and RMSE of `model` on `df` to the log file.

    Column 1 of `df` supplies the true values and columns 2 onward the model
    inputs. `name_string` is written directly in front of the heading.
    Nothing is returned.
    """
    X= df[df.columns[2:]]
    y= df[df.columns[1]]
    y_pred=model.predict(X)
    r2=r2_score(y, y_pred)
    # Take the square root of the MSE instead of passing squared=False: that
    # keyword was deprecated in scikit-learn 1.4 and later removed.
    rms = np.sqrt(mean_squared_error(y, y_pred))

    with open(text_file, "a") as file:
      file.write(name_string + "Regression:  \nR2: " + str(r2) +" RMSE: "+str(rms) + "\n")

### Models and Grid Search

In [53]:
def rf_model(X,y):
    """Fit a RandomForestClassifier (random_state=0, otherwise defaults) on X, y and return it."""
    forest = RandomForestClassifier(random_state=0)
    return forest.fit(X, y)

In [54]:
def rf_model_reg(X,y):
    """Fit a RandomForestRegressor (random_state=0, otherwise defaults) on X, y and return it."""
    forest = RandomForestRegressor(random_state=0)
    return forest.fit(X, y)

In [55]:
def rf_model_grid_search(df):
    """Grid-search a RandomForestClassifier with 5-fold CV and return the best model.

    Column 1 of `df` is used as the target and columns 2 onward as the inputs.
    The best parameter set and its cross-validation score are appended to the
    log file named by the module-level text_file.

    Returns the best estimator, refitted on all of X, y by GridSearchCV.
    """
    X= df[df.columns[2:]]
    y= df[df.columns[1]]

    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start=10, stop=2000, num=3)]
    # Number of features to consider at every split
    max_features = ['log2', 'sqrt']
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(1, 110, num=2)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    param_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}


    rfc = RandomForestClassifier(random_state=0)
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
    CV_rfc.fit(X, y)

    with open(text_file, "a") as file:
      file.write("Best parameters:" + str(CV_rfc.best_params_) + "\n")
      file.write("Score:" + str(CV_rfc.best_score_) + "\n")

    # use best parameter setting
    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full X, y. The previous call
    # rf_model(CV_rfc.best_params_, X, y) passed three arguments to a
    # two-parameter function and raised TypeError.
    return CV_rfc.best_estimator_

In [56]:
def svm_model(X,y):
    """Fit an SVC (random_state=0, otherwise defaults) on X, y and return it."""
    classifier = svm.SVC(random_state=0)
    return classifier.fit(X, y)

In [57]:
def svm_model_reg(X,y):
    """Seed NumPy's global random state with 42, fit a default SVR on X, y and return it."""
    np.random.seed(42)
    regressor = svm.SVR()
    return regressor.fit(X, y)

In [58]:
def svm_model_grid_search(df):
    """Grid-search an SVC with 5-fold CV and return the best model.

    Column 1 of `df` is used as the target and columns 2 onward as the inputs.
    The best parameter set and its cross-validation score are appended to the
    log file named by the module-level text_file.

    Returns the best estimator, refitted on all of X, y by GridSearchCV.
    """
    X= df[df.columns[2:]]
    y= df[df.columns[1]]
    param_grid = [{"kernel": ["rbf"], "gamma": [1e-3, 1e-4, 'auto', 'scale'], "C": [0.5, 1, 25, 250]}, #separate as gamma only relevant for rbf kernel
                  {"kernel": ["linear"], "C": [0.5, 1, 25, 250]}]


    svc = svm.SVC(random_state=0)
    CV_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5)
    CV_svc.fit(X, y)

    with open(text_file, "a") as file:
      file.write("Best parameters:" + str(CV_svc.best_params_) + "\n")
      file.write("Score:" + str(CV_svc.best_score_) + "\n")

    # use best parameter setting
    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full X, y. The previous call
    # svm_model(CV_svc.best_params_, X, y) passed three arguments to a
    # two-parameter function and raised TypeError.
    return CV_svc.best_estimator_

### Training

In [59]:
# One entry per supported model family:
# (key looked up in model_types, console label, log line,
#  default classifier trainer, default regressor trainer, grid-search trainer)
model_specs = [
  ('RF', 'Random forest', "Random forest model\n", rf_model, rf_model_reg, rf_model_grid_search),
  ('SVM', 'Support vector machine', "Support vector machine\n", svm_model, svm_model_reg, svm_model_grid_search),
]
use_defaults = grid==False
if use_defaults:
  # Column 1 is the target, columns 2 onward are the model inputs.
  X= df_train[df_train.columns[2:]]
  y= df_train[df_train.columns[1]]
for key, label, log_line, fit_clf, fit_reg, fit_grid in model_specs:
  if key not in model_types:
    continue
  print(label)
  with open(text_file, "a") as file:
    file.write(log_line)
  if not use_defaults:
    # NOTE: the grid-search path always fits and reports a classifier, whatever the target is.
    model = fit_grid(df_train)
    report = report_scores_classification
  elif target=='Effect_value_(mgL-1)':
    model = fit_reg(X,y)
    report = report_scores_reg
  else:
    model = fit_clf(X,y)
    report = report_scores_classification
  # Scores on both splits are appended to the log file.
  report(model, df_train, 'Train')
  report(model, df_test, 'Test')

Random forest
Support vector machine


In [60]:
# Colab-only helper: offer the log file for download in the browser.
from google.colab import files
# Use the configured log path rather than a second hard-coded 'log.txt', so the
# download follows text_file if that setting is changed.
files.download(text_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>