In [None]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import tree
import graphviz
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

# Avec les deux premières membranes

In [None]:
def load_data():
    """Load the peptide/membrane CSV and split it into features and responses.

    Reads ``./Donnees_peptides_membranes.csv`` (semicolon-separated).

    Returns
    -------
    df : pandas.DataFrame
        Every column of the file except the six response columns and the
        ``membrane``, ``Peptides``, ``bend_percent`` and ``turn_percent``
        columns.
    y : pandas.DataFrame
        The six response columns, renamed from ``'R<k> A-'`` / ``'R<k> C+'``
        to ``'R<k>_A'`` / ``'R<k>_C'`` (k = 1, 2, 3).
    """
    raw = pd.read_csv("./Donnees_peptides_membranes.csv", sep=";")

    # Original response column name -> identifier-friendly name.
    response_names = {'R1 A-': 'R1_A', 'R2 A-': 'R2_A', 'R3 A-': 'R3_A',
                      'R1 C+': 'R1_C', 'R2 C+': 'R2_C', 'R3 C+': 'R3_C'}
    response_columns = list(response_names)

    # Non-inplace rename: an in-place rename on a column selection of `raw`
    # raises SettingWithCopyWarning on pandas versions without copy-on-write.
    y = raw[response_columns].rename(columns=response_names)

    not_features = response_columns + ['membrane', 'Peptides',
                                       'bend_percent', 'turn_percent']
    df = raw.drop(columns=not_features)
    return df, y

def load_data_mean(boolean):
    """Return the features together with the row-wise mean of one response group.

    :param boolean: truthy to average the ``R1_C``, ``R2_C``, ``R3_C`` columns
        (the "positive" data), falsy to average ``R1_A``, ``R2_A``, ``R3_A``
        (the "negative" data).
    :return: ``(df, y_mean)`` where ``df`` is the feature frame returned by
        ``load_data()`` and ``y_mean`` is a Series holding, for each row, the
        mean of the three selected response columns.
    """
    df, y = load_data()
    # The original code averaged R1_C, R2_C and R2_C again, so R2_C was
    # counted twice and R3_C was ignored; all three columns are used now.
    if boolean:
        columns = ['R1_C', 'R2_C', 'R3_C']
    else:
        columns = ['R1_A', 'R2_A', 'R3_A']
    return df, y[columns].mean(axis=1)

## Corrélation Membrane-Membrane

In [None]:
# Correlation matrix between the membrane descriptors, drawn as an annotated
# heatmap and saved to PDF.
X, y = load_data_mean(True)
membrane = [
    'Thickness', 'Conductivity', 'Contact angle', 'hydrophilic pores',
    'Volumetric porosity', 'Zeta-potential', 'Ra', 'Rz', 'Porosity FL',
    ' Macropores FL',  # the leading space is part of the column name
]
membrane_X = X.loc[:, membrane]
df_membrane = membrane_X.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df_membrane, annot=True, ax=ax)
fig.savefig("./matrice_correlation_membrane.pdf")

In [None]:
# List every pair of membrane descriptors whose absolute correlation is at
# least 0.6 (upper triangle only, so each pair is reported once).
array_membrane = df_membrane.values
is_strong = np.abs(array_membrane) >= 0.6
n_rows, n_cols = array_membrane.shape
print("=" * 20, "Corrélation entre les caractéristiques de membranes", "=" * 20)
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        if not is_strong[i, j]:
            continue
        print(membrane[i], " + ", membrane[j],
              "avec une corrélation de ", array_membrane[i, j])

## Corrélation Peptide-Peptide

In [None]:
# Correlation matrix between the peptide descriptors, drawn as an annotated
# heatmap and saved to PDF. Uses the feature frame `X` already in scope.
peptide = [
    'length_peptide', 'mol_weight', 'isoelectric_point', 'GRAVY',
    'netCharge_at_pH7.0', 'm/z_at_pH7.0', 'avg_hydro', 'aromaticity',
    'instability_index', 'avg_num_hacceptors', 'avg_num_hdonors',
    'NumRotatableBonds', 'Hall Kier Alpha', 'Kappa 1', 'avg_NumRings',
    'Fraction CSP3', 'TPSA', 'CrippenClogP', 'Hydro R', 'Basic R',
    'Acidic R', 'Polar R',
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
peptide_X = X.loc[:, peptide]
df_peptide = peptide_X.corr()
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(df_peptide, annot=True, ax=ax)
fig.savefig("./matrice_correlation_peptide.pdf")

In [None]:
# List every pair of peptide descriptors whose absolute correlation is at
# least 0.6 (upper triangle only, so each pair is reported once).
print("\n", "=" * 20, "Corrélation entre les caractéristiques de peptides", "=" * 20)
array_peptide = df_peptide.values
is_strong = np.abs(array_peptide) >= 0.6
n_rows, n_cols = array_peptide.shape
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        if not is_strong[i, j]:
            continue
        print(peptide[i], " + ", peptide[j],
              "avec une corrélation de ", array_peptide[i, j])

## Corrélation données-étiquettes

In [None]:
# Pearson correlation of every feature column with the averaged "positive"
# and "negative" responses, shown as a two-column heatmap.
X, y_pos = load_data_mean(True)
_, y_neg = load_data_mean(False)
array = X.values
n_features = array.shape[1]
# pearsonr returns (statistic, p-value); only the statistic is kept.
pos_vals = [pearsonr(array[:, k], y_pos)[0] for k in range(n_features)]
neg_vals = [pearsonr(array[:, k], y_neg)[0] for k in range(n_features)]
df_y = pd.DataFrame({'Positif': pos_vals, 'Négatif': neg_vals}, index=X.columns)
fig, ax = plt.subplots(figsize=(15, 20))
sns.heatmap(df_y, annot=True, ax=ax)
fig.savefig("./correlation_y_values.pdf")

In [None]:
# Keep references to the current correlation tables under "_avec" names so
# they remain reachable after df_peptide / df_membrane / df_y are rebound.
df_peptide_avec, df_membrane_avec, df_y_avec = df_peptide, df_membrane, df_y

## Calcul de l'importance avec les deux premières membranes

In [None]:
# Random-forest feature importances for the averaged "positive" response,
# computed on a hand-picked subset of membrane and peptide descriptors.
X, y = load_data_mean(True)
membrane = [
    'Contact angle', 'hydrophilic pores', 'Volumetric porosity',
    'Zeta-potential', 'Rz', ' Macropores FL',
]
peptide = [
    'mol_weight', 'isoelectric_point', 'GRAVY', 'm/z_at_pH7.0',
    'Hall Kier Alpha', 'Polar R',
    'A', 'D', 'F', 'H', 'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
X_selected = X.loc[:, membrane + peptide]

regressor = RandomForestRegressor(n_estimators=1000, random_state=1)
regressor.fit(X_selected, y)

# One importance per selected column, printed from most to least important.
importances = pd.DataFrame(regressor.feature_importances_, index=X_selected.columns)
print("=" * 10, "Y positifs", "=" * 10)
print(importances.sort_values(by=0, ascending=False))

In [None]:
# Random-forest feature importances for the averaged "negative" response,
# computed on a hand-picked subset of membrane and peptide descriptors.
X, y = load_data_mean(False)
membrane = [
    'Contact angle', 'hydrophilic pores', 'Volumetric porosity',
    'Zeta-potential', 'Rz', ' Macropores FL',
]
peptide = [
    'mol_weight', 'isoelectric_point', 'GRAVY', 'm/z_at_pH7.0',
    'Hall Kier Alpha', 'Polar R',
    'A', 'D', 'F', 'H', 'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
X_selected = X.loc[:, membrane + peptide]

regressor = RandomForestRegressor(n_estimators=1000, random_state=1)
regressor.fit(X_selected, y)

# One importance per selected column, printed from most to least important.
importances = pd.DataFrame(regressor.feature_importances_, index=X_selected.columns)
print("=" * 10, "Y négatifs", "=" * 10)
print(importances.sort_values(by=0, ascending=False))

# Sans les deux premières membranes

In [None]:
decalage = np.array([34, 22])
# Number of leading rows skipped by load_data(). 56 == 34 + 22, i.e. the sum
# of `decalage` -- presumably the row counts of the two membranes being
# excluded; TODO confirm.
decalage_full = 56

def load_data():
    """Load the peptide/membrane CSV without its first ``decalage_full`` rows.

    Reads ``./Donnees_peptides_membranes (Anionic-Cationic) VF.csv``
    (semicolon-separated), discards the first ``decalage_full`` rows and
    splits the remainder into features and responses. The original row index
    is kept, so the first returned row is labelled ``decalage_full``.

    NOTE(review): the earlier ``load_data`` in this notebook reads
    ``./Donnees_peptides_membranes.csv``; confirm that using a different file
    here is intended.

    Returns
    -------
    df : pandas.DataFrame
        Every column of the file except the six response columns and the
        ``membrane``, ``Peptides``, ``bend_percent`` and ``turn_percent``
        columns.
    y : pandas.DataFrame
        The six response columns, renamed from ``'R<k> A-'`` / ``'R<k> C+'``
        to ``'R<k>_A'`` / ``'R<k>_C'`` (k = 1, 2, 3).
    """
    raw = pd.read_csv("./Donnees_peptides_membranes (Anionic-Cationic) VF.csv", sep=";")
    raw = raw.iloc[decalage_full:, :]

    # Original response column name -> identifier-friendly name.
    response_names = {'R1 A-': 'R1_A', 'R2 A-': 'R2_A', 'R3 A-': 'R3_A',
                      'R1 C+': 'R1_C', 'R2 C+': 'R2_C', 'R3 C+': 'R3_C'}
    response_columns = list(response_names)

    # Non-inplace rename: an in-place rename on a column selection of `raw`
    # raises SettingWithCopyWarning on pandas versions without copy-on-write.
    y = raw[response_columns].rename(columns=response_names)

    not_features = response_columns + ['membrane', 'Peptides',
                                       'bend_percent', 'turn_percent']
    df = raw.drop(columns=not_features)
    return df, y

def load_data_mean(boolean):
    """Return the features together with the row-wise mean of one response group.

    :param boolean: truthy to average the ``R1_C``, ``R2_C``, ``R3_C`` columns
        (the "positive" data), falsy to average ``R1_A``, ``R2_A``, ``R3_A``
        (the "negative" data).
    :return: ``(df, y_mean)`` where ``df`` is the feature frame returned by
        ``load_data()`` and ``y_mean`` is a Series holding, for each row, the
        mean of the three selected response columns.
    """
    df, y = load_data()
    # The original code averaged R1_C, R2_C and R2_C again, so R2_C was
    # counted twice and R3_C was ignored; all three columns are used now.
    if boolean:
        columns = ['R1_C', 'R2_C', 'R3_C']
    else:
        columns = ['R1_A', 'R2_A', 'R3_A']
    return df, y[columns].mean(axis=1)

## Corrélation membrane-membrane

In [None]:
# Correlation matrix between the membrane descriptors, drawn as an annotated
# heatmap; the output file name marks this as the run without PVDF800 / CF55.
X, y = load_data_mean(True)
membrane = [
    'Thickness', 'Conductivity', 'Contact angle', 'hydrophilic pores',
    'Volumetric porosity', 'Zeta-potential', 'Ra', 'Rz', 'Porosity FL',
    ' Macropores FL',  # the leading space is part of the column name
]
membrane_X = X.loc[:, membrane]
df_membrane = membrane_X.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df_membrane, annot=True, ax=ax)
fig.savefig("./matrice_correlation_membrane_sans_PVDF800_CF55.pdf")
# Raw correlation values as a NumPy array.
array_membrane = df_membrane.values

In [None]:
# List every pair of membrane descriptors whose absolute correlation is at
# least 0.6 (upper triangle only, so each pair is reported once).
array_membrane = df_membrane.values
is_strong = np.abs(array_membrane) >= 0.6
n_rows, n_cols = array_membrane.shape
print("=" * 20, "Corrélation entre les caractéristiques de membranes", "=" * 20)
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        if not is_strong[i, j]:
            continue
        print(membrane[i], " + ", membrane[j],
              "avec une corrélation de ", array_membrane[i, j])

## Corrélation peptide-peptide

In [None]:
# Correlation matrix between the peptide descriptors, drawn as an annotated
# heatmap; the output file name marks this as the run without PVDF800 / CF55.
# Uses the feature frame `X` already in scope.
peptide = [
    'length_peptide', 'mol_weight', 'isoelectric_point', 'GRAVY',
    'netCharge_at_pH7.0', 'm/z_at_pH7.0', 'avg_hydro', 'aromaticity',
    'instability_index', 'avg_num_hacceptors', 'avg_num_hdonors',
    'NumRotatableBonds', 'Hall Kier Alpha', 'Kappa 1', 'avg_NumRings',
    'Fraction CSP3', 'TPSA', 'CrippenClogP', 'Hydro R', 'Basic R',
    'Acidic R', 'Polar R',
    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
    'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
peptide_X = X.loc[:, peptide]
df_peptide = peptide_X.corr()
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(df_peptide, annot=True, ax=ax)
fig.savefig("./matrice_correlation_peptide_sans_PVDF800_CF55.pdf")

In [None]:
# List every pair of peptide descriptors whose absolute correlation is at
# least 0.6 (upper triangle only, so each pair is reported once).
print("\n", "=" * 20, "Corrélation entre les caractéristiques de peptides", "=" * 20)
array_peptide = df_peptide.values
is_strong = np.abs(array_peptide) >= 0.6
n_rows, n_cols = array_peptide.shape
for i in range(n_rows):
    for j in range(i + 1, n_cols):
        if not is_strong[i, j]:
            continue
        print(peptide[i], " + ", peptide[j],
              "avec une corrélation de ", array_peptide[i, j])

## Corrélation données-étiquettes

In [None]:
# Pearson correlation of every feature column with the averaged "positive"
# and "negative" responses, shown as a two-column heatmap; the output file
# name marks this as the run without PVDF800 / CF55.
X, y_pos = load_data_mean(True)
_, y_neg = load_data_mean(False)
array = X.values
n_features = array.shape[1]
# pearsonr returns (statistic, p-value); only the statistic is kept.
pos_vals = [pearsonr(array[:, k], y_pos)[0] for k in range(n_features)]
neg_vals = [pearsonr(array[:, k], y_neg)[0] for k in range(n_features)]
df_y = pd.DataFrame({'Positif': pos_vals, 'Négatif': neg_vals}, index=X.columns)
fig, ax = plt.subplots(figsize=(15, 20))
sns.heatmap(df_y, annot=True, ax=ax)
fig.savefig("./correlation_y_values_sans_PVDF800_CF55.pdf")

In [None]:
# Keep references to the current correlation tables under "_sans" names so
# they remain reachable after df_membrane / df_peptide / df_y are rebound.
df_membrane_sans, df_peptide_sans, df_y_sans = df_membrane, df_peptide, df_y

## Calcul de l'importance sans les deux premières membranes

In [None]:
# Random-forest feature importances for the averaged "positive" response,
# computed on a hand-picked subset of membrane and peptide descriptors.
X, y = load_data_mean(True)
membrane = [
    'Contact angle', 'hydrophilic pores', 'Volumetric porosity',
    'Zeta-potential', 'Rz', ' Macropores FL',
]
peptide = [
    'mol_weight', 'isoelectric_point', 'GRAVY', 'm/z_at_pH7.0',
    'Hall Kier Alpha', 'Polar R',
    'A', 'D', 'F', 'H', 'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
X_selected = X.loc[:, membrane + peptide]

regressor = RandomForestRegressor(n_estimators=1000, random_state=1)
regressor.fit(X_selected, y)

# One importance per selected column, printed from most to least important.
importances = pd.DataFrame(regressor.feature_importances_, index=X_selected.columns)
print("=" * 10, "Y positifs", "=" * 10)
print(importances.sort_values(by=0, ascending=False))

In [None]:
# Random-forest feature importances for the averaged "negative" response,
# computed on a hand-picked subset of membrane and peptide descriptors.
X, y = load_data_mean(False)
membrane = [
    'Contact angle', 'hydrophilic pores', 'Volumetric porosity',
    'Zeta-potential', 'Rz', ' Macropores FL',
]
peptide = [
    'mol_weight', 'isoelectric_point', 'GRAVY', 'm/z_at_pH7.0',
    'Hall Kier Alpha', 'Polar R',
    'A', 'D', 'F', 'H', 'K', 'L', 'N', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y',
]
X_selected = X.loc[:, membrane + peptide]

regressor = RandomForestRegressor(n_estimators=1000, random_state=1)
regressor.fit(X_selected, y)

# One importance per selected column, printed from most to least important.
importances = pd.DataFrame(regressor.feature_importances_, index=X_selected.columns)
print("=" * 10, "Y négatifs", "=" * 10)
print(importances.sort_values(by=0, ascending=False))