In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import graphviz
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import GridSearchCV
from sklearn import tree
import os
import numpy as np
import pandas as pd
import graphviz
from IPython.display import Image
from difflib import SequenceMatcher
pd.options.mode.chained_assignment = None

In [None]:
# Locate the input file(s) in the current directory from their names.
# A file whose name contains 'Anionic-Cationic' is used for both ion types;
# otherwise one file containing 'Anionic' and one containing 'Cationic' are
# expected. The 'Anionic-Cationic' test must come first because such a name
# also contains 'Anionic'.
# NOTE(review): if several files match 'Anionic' (or 'Cationic'), the last one
# listed by os.walk silently wins; the confirmation prompts below are the only
# safeguard.


def _confirmed(prompt):
    """Ask ``prompt`` and return True only when the answer is the integer 1.

    A non-numeric or empty answer counts as a refusal instead of crashing
    with a ValueError.
    """
    try:
        return int(input(prompt)) == 1
    except ValueError:
        return False


# Third element of the first os.walk() entry = plain files of "./";
# the default tuple covers the case where the walk yields nothing.
filenames = next(os.walk("./"), (None, None, []))[2]
anion_file_name = None
cation_file_name = None
if not filenames:
    raise FileNotFoundError("No files were imported.")

for f in filenames:
    if "Anionic-Cationic" in f:
        anion_file_name = f
        cation_file_name = f
        break
    elif "Anionic" in f:
        anion_file_name = f
    elif "Cationic" in f:
        cation_file_name = f

if anion_file_name is None or cation_file_name is None:
    raise FileNotFoundError("Do the name of the files follow the following convention, such that it should contain 'Anionic-Cationic', 'Anionic' or 'Cationic'")

if anion_file_name == cation_file_name:
    # One combined file: ask before running on both ion types at once.
    if not _confirmed("Do you wish to run the experiments on both the cations and anions at the same time ? (Enter 1 to continue) "):
        anion_file_name = None
        cation_file_name = None
        raise RuntimeError("Automatically getting file names did not work.\nThere appears to be a file that contains 'Anionic-Cationic' in its name.")
else:
    # Two separate files: both prompts are always shown, then both answers are checked.
    anion_ok = _confirmed("Do you wish to use the following file with anions ? "+ anion_file_name+ " (Enter 1 to continue) ")
    cation_ok = _confirmed("Do you wish to use the following file with cations ? "+ cation_file_name+ " (Enter 1 to continue) ")
    if not (anion_ok and cation_ok):
        raise RuntimeError("Automatically getting file names did not work.\n There seems to be multiple filenames that contain 'Anionic' or 'Cationic'")

In [None]:
# Interactive configuration for the rest of the notebook.
#   DEPTH         -- max_depth given to the decision trees.
#   nbr_membranes -- number of membranes actually used (appears in output file names).
#   DECALAGE      -- True when leading rows of the input must be skipped, i.e. the
#                    file is not a 14-membrane file but only 14 membranes are wanted.
DEPTH = int(input("What tree depth would you like ?" ))
if DEPTH < 1:
    raise ValueError("The depth entered is invalid. Please choose a positive number.")

nbr_me = int(input("Does your file contain 14 or 16 membranes ?"))
if nbr_me == 14:
    # Fix: nbr_membranes was left undefined on this path, which made the later
    # cells that build the tree file name fail with a NameError.
    nbr_membranes = 14
    DECALAGE = False
else:
    # NOTE(review): any answer other than 14 is treated as a 16-membrane file.
    nbr_membranes = int(input("How many membranes do you wish to use ? (14 or 16)"))
    if nbr_membranes == 16:
        DECALAGE = False
    elif nbr_membranes == 14:
        DECALAGE = True
    else:
        raise ValueError("The number of membranes entered is invalid. Please choose 14 or 16 membranes. ")

In [None]:
### DATALOADER

# True when a single file provides both the anion and the cation data.
# Evaluated on the bare names, before the "./" prefix is added below.
is_equal = (cation_file_name == anion_file_name)

# Number of leading rows skipped when DECALAGE is enabled:
#   decalage[0] -> anion file, decalage[1] -> cation file (indexed by int(boolean)),
#   decalage_full -> used instead when both ion types share one file.
decalage_full = 56
decalage = np.array([34, 22])

# Turn the bare file names into paths relative to the working directory.
cation_file_name = "./" + cation_file_name
anion_file_name = "./" + anion_file_name

def load_data(boolean):
    """
    Read the input file and split it into features and response columns.

    The file is read as a ';'-separated CSV. When the module-level DECALAGE
    flag is set, the leading rows are skipped: ``decalage_full`` rows if the
    anion and cation data share one file (``is_equal``), otherwise
    ``decalage[int(boolean)]`` rows. The original row labels are kept (the
    index is not reset); downstream cells index by those labels.

    :param boolean: True to read the cation file, False to read the anion file.
    :return: tuple ``(df, y)`` where ``y`` holds the six response columns
        renamed to R1_A, R2_A, R3_A, R1_C, R2_C, R3_C and ``df`` is the
        remaining table without the response columns and without
        'membrane', 'Peptides', 'bend_percent' and 'turn_percent'.
    """
    path = cation_file_name if boolean else anion_file_name
    df = pd.read_csv(path, sep=";")

    if DECALAGE:
        same_file = int(is_equal)
        # Exactly one of the two terms is non-zero, depending on same_file.
        first_row = decalage[int(boolean)] * (1 - same_file) + decalage_full * same_file
        df = df.iloc[first_row:, :]

    response_columns = ['R1 A-', 'R2 A-', 'R3 A-', 'R1 C+', 'R2 C+', 'R3 C+']
    # rename() returns a new frame, so no in-place change is made on a selection of df.
    y = df[response_columns].rename(columns={'R1 A-': 'R1_A', 'R2 A-': 'R2_A',
                                             'R3 A-': 'R3_A', 'R1 C+': 'R1_C',
                                             'R2 C+': 'R2_C', 'R3 C+': 'R3_C'})
    non_feature_columns = response_columns + ['membrane', 'Peptides', 'bend_percent', 'turn_percent']
    df = df.drop(columns=non_feature_columns)
    return df, y

def load_data_mean(boolean):
    """
    Load the data and reduce the three response replicates to their row-wise mean.

    :param boolean: True for the cation (positive) data, False for the anion data.
    :return: tuple ``(df, y_mean)`` where ``df`` is the feature table returned by
        ``load_data`` and ``y_mean`` is the per-row mean of R1_C, R2_C, R3_C
        (cations) or R1_A, R2_A, R3_A (anions).
    """
    df, y = load_data(boolean)
    if boolean:
        # Fix: the third replicate was listed as 'R2_C' twice, so R3_C was
        # ignored and R2_C counted double in the cation mean.
        y_pos = y[['R1_C', 'R2_C', 'R3_C']].mean(axis=1)
        return df, y_pos
    else:
        y_neg = y[['R1_A', 'R2_A', 'R3_A']].mean(axis=1)
        return df, y_neg

def get_peptides_names(boolean):
    """
    Build one "<membrane> + <peptide>" label per retained row of the input file.

    The file is re-read from disk (cation file when ``boolean`` is truthy,
    anion file otherwise) and, when DECALAGE is set, the same leading rows are
    skipped as in the data loader, so the labels keep the original row index.

    :param boolean: True for the cation file, False for the anion file.
    :return: Series of labels built from the 'membrane' and 'Peptides' columns.
    """
    source_path = cation_file_name if boolean else anion_file_name
    table = pd.read_csv(source_path, sep=";")

    if DECALAGE:
        shared = int(is_equal)
        first_row = DECALAGE * (decalage[int(boolean)] * (1 - shared) + decalage_full * shared)
        table = table.iloc[first_row:, :]

    labels = table['membrane'] + ' + ' + table['Peptides']
    return labels

def get_features(X, y, threshold):
    """
    Rank the columns of ``X`` by random-forest importance and keep the strongest.

    A RandomForestRegressor (1000 trees, random_state=1) is fitted on
    ``(X, y)``; its ``feature_importances_`` are labelled with ``X.columns``.

    :param X: feature table (DataFrame); its columns name the features.
    :param y: target values aligned with the rows of ``X``.
    :param threshold: a feature is kept only if its importance is strictly
        greater than this value.
    :return: tuple ``(selected, feat_importance)`` where ``selected`` is the
        Index of kept feature names and ``feat_importance`` is a one-column
        DataFrame (column 0) of all importances sorted in descending order.
    """
    regressor = RandomForestRegressor(random_state=1, n_estimators=1000)
    regressor.fit(X, y)
    # Fix: label the importances with the columns of the X that was actually
    # fitted, not with the notebook-level global X_selected.
    feat_importance = pd.DataFrame(regressor.feature_importances_,
                                   index=X.columns).sort_values(by=0, ascending=False)
    new_indexs_pos = feat_importance[feat_importance[0] > threshold].index
    return new_indexs_pos, feat_importance

def get_decisions(X, y):
    """
    Fit a depth-limited regression tree on ``(X, y)`` after a small grid search.

    ``min_samples_split`` and ``min_samples_leaf`` are each searched over
    2..6 with GridSearchCV (default cross-validation and scoring), with
    ``max_depth`` fixed to the module-level DEPTH. The returned tree is the
    search's ``best_estimator_``, i.e. the best configuration refitted on all
    of ``(X, y)`` (GridSearchCV's default ``refit=True``).

    Prints the R^2 of the tree on the same data it was fitted on (a training
    score, not a held-out one).

    :param X: feature table.
    :param y: target values.
    :return: tuple ``(leaves, prediction, tree)``: the leaf index of every row
        of ``X``, the prediction for every row of ``X``, and the fitted tree.
    """
    regressor = DecisionTreeRegressor(random_state=1, max_depth=DEPTH, criterion='squared_error')
    parameters = {'min_samples_split': [2, 3, 4, 5, 6],
                  'min_samples_leaf': [2, 3, 4, 5, 6]}
    clf = GridSearchCV(regressor, parameters)
    clf.fit(X, y)

    # Same estimator settings + best parameters, already refitted on the full
    # data by the search, so no second manual fit is needed.
    best_tree = clf.best_estimator_
    leaves = best_tree.apply(X)
    prediction = best_tree.predict(X)

    train_r2 = best_tree.score(X, y)
    print("The R^2 of the tree is : ", train_r2)
    return leaves, prediction, best_tree

def print_tree(clf, feature_name, file_name):
    """
    Export a fitted tree to Graphviz and render it as a JPEG image.

    :param clf: fitted tree passed to ``tree.export_graphviz``.
    :param feature_name: feature names shown in the tree nodes.
    :param file_name: output path without extension; callers then load
        ``<file_name>.jpg``. The file located at ``file_name`` itself is
        deleted after rendering.
    """
    export_options = dict(
        out_file=None,  # return the DOT text instead of writing it to a file
        feature_names=feature_name,
        filled=True,
        rounded=True,
        special_characters=True,
        rotate=True,
        leaves_parallel=True,
    )
    dot_source = tree.export_graphviz(clf, **export_options)

    rendered = graphviz.Source(dot_source)
    # NOTE(review): these are plain attribute assignments on the Source object;
    # confirm that graphviz actually takes them into account when rendering.
    rendered.dpi = 500
    rendered.size = "30,30!"
    rendered.render(file_name, format='jpg')
    # Presumably the intermediate DOT source written by render(); only the image is kept.
    os.remove(file_name)

# cation 0.02


In [None]:
# Experiment set-up: cations (anion_cation = True), importance threshold 0.02.
anion_cation = True
threshold = 0.02

# Fix: the message was a plain string with an unbalanced quote, so the
# placeholders were printed literally instead of being substituted.
print(f"Results for the experiments with {'cations' if anion_cation else 'anions'} and a threshold of {threshold}")

# Candidate membrane descriptors.
# NOTE(review): ' Macropores FL' starts with a space; presumably it matches the
# column header of the input file exactly -- confirm before normalising it.
membrane = ['Contact angle', 'hydrophilic pores',
        'Volumetric porosity', 'Zeta-potential', 'Rz', ' Macropores FL']

# Candidate peptide descriptors (the single letters are column names of the input file).
peptide = ['mol_weight', 'isoelectric_point',
        'GRAVY', 'm/z_at_pH7.0',
        'Hall Kier Alpha', 'Polar R', 'A', 'D', 'F', 'H',
        'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

features = membrane + peptide

# Mean response as target, then keep the features whose random-forest
# importance is above the threshold.
X, y = load_data_mean(anion_cation)
X_selected = X[features]
selected_features, feat_importance = get_features(X_selected, y, threshold)
print("Selected features : " , list(selected_features))
feat_importance

In [None]:
# Fit the decision tree on the selected features only.
X_new = X_selected[selected_features]
leaves, prediction, clf = get_decisions(X_new, y)

# Render the tree to "<tree_file>.jpg" and display the image as the cell output.
tree_file = f"tree_{anion_cation}_{threshold}_{DEPTH}_{nbr_membranes}"
print_tree(clf, selected_features, tree_file)
Image(tree_file + ".jpg")

In [None]:
# Positional order that groups the samples by the leaf they fall into.
order = np.argsort(leaves)

# y and the peptide labels keep the row labels of the input file, which start
# at the number of skipped rows when DECALAGE is set (0 otherwise); shift the
# positions accordingly to select by label.
row_offset = DECALAGE*(decalage[int(anion_cation)]*(1-int(is_equal))+ decalage_full*int(is_equal))
row_labels = row_offset + order

peptide_labels = get_peptides_names(anion_cation)
df_prediction = pd.DataFrame(prediction[order], index=peptide_labels[row_labels], columns=["Prédiction"])

df_x = pd.DataFrame(X_new.iloc[order, :])
df_x.index = df_prediction.index

df_y = pd.DataFrame(y[row_labels], columns=['target'])
df_y.index = df_prediction.index

# Features, prediction and target side by side, one row per "membrane + peptide".
pd.concat([df_x, df_prediction, df_y], axis=1)

In [None]:
# Save the combined table (features, prediction, target) of the current run.
combined = pd.concat([df_x, df_prediction, df_y], axis=1)
combined.to_csv("cation_0.02.csv")

# cation 0.05

In [None]:
# Experiment set-up: cations (anion_cation = True), importance threshold 0.05.
anion_cation = True
threshold = 0.05

# Fix: the message was a plain string with an unbalanced quote, so the
# placeholders were printed literally instead of being substituted.
print(f"Results for the experiments with {'cations' if anion_cation else 'anions'} and a threshold of {threshold}")

# Candidate membrane descriptors.
# NOTE(review): ' Macropores FL' starts with a space; presumably it matches the
# column header of the input file exactly -- confirm before normalising it.
membrane = ['Contact angle', 'hydrophilic pores',
        'Volumetric porosity', 'Zeta-potential', 'Rz', ' Macropores FL']

# Candidate peptide descriptors (the single letters are column names of the input file).
peptide = ['mol_weight', 'isoelectric_point',
        'GRAVY', 'm/z_at_pH7.0',
        'Hall Kier Alpha', 'Polar R', 'A', 'D', 'F', 'H',
        'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

features = membrane + peptide

# Mean response as target, then keep the features whose random-forest
# importance is above the threshold.
X, y = load_data_mean(anion_cation)
X_selected = X[features]
selected_features, feat_importance = get_features(X_selected, y, threshold)
# Printed as a plain list, like the first experiment cell, instead of the Index repr.
print("Selected features : ", list(selected_features))
feat_importance

In [None]:
# Fit the decision tree on the selected features only.
X_new = X_selected[selected_features]
leaves, prediction, clf = get_decisions(X_new, y)

# Render the tree to "<tree_file>.jpg" and display the image as the cell output.
tree_file = f"tree_{anion_cation}_{threshold}_{DEPTH}_{nbr_membranes}"
print_tree(clf, selected_features, tree_file)
Image(tree_file + ".jpg")

In [None]:
# Positional order that groups the samples by the leaf they fall into.
order = np.argsort(leaves)

# y and the peptide labels keep the row labels of the input file, which start
# at the number of skipped rows when DECALAGE is set (0 otherwise); shift the
# positions accordingly to select by label.
row_offset = DECALAGE*(decalage[int(anion_cation)]*(1-int(is_equal))+ decalage_full*int(is_equal))
row_labels = row_offset + order

peptide_labels = get_peptides_names(anion_cation)
df_prediction = pd.DataFrame(prediction[order], index=peptide_labels[row_labels], columns=["Prédiction"])

df_x = pd.DataFrame(X_new.iloc[order, :])
df_x.index = df_prediction.index

df_y = pd.DataFrame(y[row_labels], columns=['target'])
df_y.index = df_prediction.index

# Features, prediction and target side by side, one row per "membrane + peptide".
pd.concat([df_x, df_prediction, df_y], axis=1)

# anion 0.02

In [None]:
# Experiment set-up: anions (anion_cation = False), importance threshold 0.02.
anion_cation = False
threshold = 0.02

# Fix: the message was a plain string with an unbalanced quote, so the
# placeholders were printed literally instead of being substituted.
print(f"Results for the experiments with {'cations' if anion_cation else 'anions'} and a threshold of {threshold}")

# Candidate membrane descriptors.
# NOTE(review): ' Macropores FL' starts with a space; presumably it matches the
# column header of the input file exactly -- confirm before normalising it.
membrane = ['Contact angle', 'hydrophilic pores',
        'Volumetric porosity', 'Zeta-potential', 'Rz', ' Macropores FL']

# Candidate peptide descriptors (the single letters are column names of the input file).
peptide = ['mol_weight', 'isoelectric_point',
        'GRAVY', 'm/z_at_pH7.0',
        'Hall Kier Alpha', 'Polar R', 'A', 'D', 'F', 'H',
        'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

features = membrane + peptide

# Mean response as target, then keep the features whose random-forest
# importance is above the threshold.
X, y = load_data_mean(anion_cation)
X_selected = X[features]
selected_features, feat_importance = get_features(X_selected, y, threshold)
# Printed as a plain list, like the first experiment cell, instead of the Index repr.
print("Selected features : ", list(selected_features))
feat_importance

In [None]:
# Fit the decision tree on the selected features only.
X_new = X_selected[selected_features]
leaves, prediction, clf = get_decisions(X_new, y)

# Render the tree to "<tree_file>.jpg" and display the image as the cell output.
tree_file = f"tree_{anion_cation}_{threshold}_{DEPTH}_{nbr_membranes}"
print_tree(clf, selected_features, tree_file)
Image(tree_file + ".jpg")

In [None]:
# Positional order that groups the samples by the leaf they fall into.
order = np.argsort(leaves)

# y and the peptide labels keep the row labels of the input file, which start
# at the number of skipped rows when DECALAGE is set (0 otherwise); shift the
# positions accordingly to select by label.
row_offset = DECALAGE*(decalage[int(anion_cation)]*(1-int(is_equal))+ decalage_full*int(is_equal))
row_labels = row_offset + order

peptide_labels = get_peptides_names(anion_cation)
df_prediction = pd.DataFrame(prediction[order], index=peptide_labels[row_labels], columns=["Prédiction"])

df_x = pd.DataFrame(X_new.iloc[order, :])
df_x.index = df_prediction.index

df_y = pd.DataFrame(y[row_labels], columns=['target'])
df_y.index = df_prediction.index

# Features, prediction and target side by side, one row per "membrane + peptide".
pd.concat([df_x, df_prediction, df_y], axis=1)

In [None]:
# Save the combined table (features, prediction, target) of the current run.
combined = pd.concat([df_x, df_prediction, df_y], axis=1)
combined.to_csv("anion_0.02.csv")
# To reload it later: pd.read_csv("./anion_0.02.csv", index_col=[0] )

# anion 0.05

In [None]:
# Experiment set-up: anions (anion_cation = False), importance threshold 0.05.
anion_cation = False
threshold = 0.05

# Fix: the message was a plain string with an unbalanced quote, so the
# placeholders were printed literally instead of being substituted.
print(f"Results for the experiments with {'cations' if anion_cation else 'anions'} and a threshold of {threshold}")

# Candidate membrane descriptors.
# NOTE(review): ' Macropores FL' starts with a space; presumably it matches the
# column header of the input file exactly -- confirm before normalising it.
membrane = ['Contact angle', 'hydrophilic pores',
        'Volumetric porosity', 'Zeta-potential', 'Rz', ' Macropores FL']

# Candidate peptide descriptors (the single letters are column names of the input file).
peptide = ['mol_weight', 'isoelectric_point',
        'GRAVY', 'm/z_at_pH7.0',
        'Hall Kier Alpha', 'Polar R', 'A', 'D', 'F', 'H',
        'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

features = membrane + peptide

# Mean response as target, then keep the features whose random-forest
# importance is above the threshold.
X, y = load_data_mean(anion_cation)
X_selected = X[features]
selected_features, feat_importance = get_features(X_selected, y, threshold)
# Printed as a plain list, like the first experiment cell, instead of the Index repr.
print("Selected features : ", list(selected_features))
feat_importance

In [None]:
# Fit the decision tree on the selected features only.
X_new = X_selected[selected_features]
leaves, prediction, clf = get_decisions(X_new, y)

# Render the tree to "<tree_file>.jpg" and display the image as the cell output.
tree_file = f"tree_{anion_cation}_{threshold}_{DEPTH}_{nbr_membranes}"
print_tree(clf, selected_features, tree_file)
Image(tree_file + ".jpg")

In [None]:
# Positional order that groups the samples by the leaf they fall into.
order = np.argsort(leaves)

# y and the peptide labels keep the row labels of the input file, which start
# at the number of skipped rows when DECALAGE is set (0 otherwise); shift the
# positions accordingly to select by label.
row_offset = DECALAGE*(decalage[int(anion_cation)]*(1-int(is_equal))+ decalage_full*int(is_equal))
row_labels = row_offset + order

peptide_labels = get_peptides_names(anion_cation)
df_prediction = pd.DataFrame(prediction[order], index=peptide_labels[row_labels], columns=["Prédiction"])

df_x = pd.DataFrame(X_new.iloc[order, :])
df_x.index = df_prediction.index

df_y = pd.DataFrame(y[row_labels], columns=['target'])
df_y.index = df_prediction.index

# Features, prediction and target side by side, one row per "membrane + peptide".
pd.concat([df_x, df_prediction, df_y], axis=1)