# Group Assignment 

Analysis of the known SMILES

In [4]:
# Import packages 
import pandas as pd
import seaborn as sns
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
#from rdkit.Chem import Draw
#from rdkit.Chem.Draw import IPythonConsole

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


## Preprocessing

In [2]:
# --- Training data ---------------------------------------------------------
# The file is read as a single column named 'SMILES,"ALDH1_inhibition"', so the
# SMILES string and the label are separated manually.
original_df = pd.read_csv('tested_molecules_1.csv')

train_combined_column = 'SMILES,"ALDH1_inhibition"'
original_df[['SMILES', 'ALDH1_inhibition']] = original_df[train_combined_column].str.split(',', expand=True)
original_df = original_df.drop(columns=train_combined_column)

# Labels arrive wrapped in double quotes; keep only the bare value
original_df['ALDH1_inhibition'] = original_df['ALDH1_inhibition'].str.strip('"')

# Names of every descriptor RDKit exposes (first element of each descList entry)
all_descriptors = [desc[0] for desc in Descriptors.descList]

# Compute every descriptor for every training molecule
descriptor_data = []
for row_index, smiles in original_df['SMILES'].items():
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None when the SMILES cannot be parsed; fail with a
    # message that names the offending row instead of a cryptic RDKit error.
    if mol is None:
        raise ValueError(f"Row {row_index}: could not parse SMILES {smiles!r}")
    descriptor_data.append([getattr(Descriptors, descriptor)(mol) for descriptor in all_descriptors])

# Reuse the molecule index so the column-wise concat aligns row by row
descriptor_df = pd.DataFrame(descriptor_data, columns=all_descriptors, index=original_df.index)
original_df = pd.concat([original_df, descriptor_df], axis=1)

df = original_df.copy()

# --- Test data -------------------------------------------------------------
# Same splitting step; this file uses ';' as the separator.
original_df_test = pd.read_csv('tested_molecules_2.csv')

test_combined_column = 'SMILES;ALDH1_inhibition'
original_df_test[['SMILES', 'ALDH1_inhibition']] = original_df_test[test_combined_column].str.split(';', expand=True)
original_df_test = original_df_test.drop(columns=test_combined_column)

# Normalise the labels the same way as the training labels so both sets use the
# same label strings (a no-op when the values are not quoted).
original_df_test['ALDH1_inhibition'] = original_df_test['ALDH1_inhibition'].str.strip('"')
X_test_data = original_df_test.copy()


### Adding descriptors

In [3]:
# Collect the name (first element) of every entry in RDKit's descriptor list
all_descriptors = [entry[0] for entry in Descriptors.descList]

# Show the descriptor names, one per line
for descriptor_name in all_descriptors:
    print(descriptor_name)

In [4]:
df.head()  # Why only 5 rows? head() shows the first 5 rows by default. Original note: it does pick up 209 "PCs", so this is presumably fine (TODO confirm what the 209 refers to).

In [5]:
# Feature table: everything except the raw SMILES string and the class label
df_variables = df.drop(["SMILES", "ALDH1_inhibition"], axis="columns")
df_variables

### Find correlation

`MaxAbsEStateIndex` is identical to `MaxEStateIndex`, so the duplicate is removed.


In [6]:
#correlation = df_variables.corr()                            # Calculate correlation

### Too many variables to turn this into a readable figure
#sns.set (rc = {'figure.figsize':(9, 8)})                     # Set size of figure
##sns.heatmap(correlation, cmap="Blues", annot=True)           # Making heatmap
#plt.title('Correlation analysis', fontsize=15);   

In [7]:
# Pairwise correlation between all descriptor columns
correlation = df_variables.corr()
f = plt.figure(figsize=(19, 15))
plt.matshow(correlation, fignum=f.number)

# Take the tick labels from the correlation matrix itself so that the labels
# always match the plotted rows/columns (previously they were read from `df`,
# a different frame than the one the matrix was computed from).
tick_positions = range(len(correlation.columns))
plt.xticks(tick_positions, correlation.columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, correlation.columns, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=20);

In [8]:
# Absolute pairwise correlation between all descriptor columns
corr_matrix = df_variables.corr().abs()

# True strictly above the diagonal, so every pair of columns is looked at once
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(mask)

# A column is dropped when its correlation with any earlier column exceeds 0.90
to_drop = upper.columns[(upper > 0.90).any(axis=0)].tolist()

# Build the reduced feature table without touching df_variables
new_df_variables = df_variables.drop(columns=to_drop)
new_df_variables



In [9]:
df_for_stv = new_df_variables.copy()

# Standard deviation of every remaining descriptor column
std_dev = df_for_stv.std()
threshold = 0.0

# Use `<=`: a standard deviation is never negative, so the previous strict
# `< 0.0` comparison could not match anything and constant columns survived.
columns_to_drop = std_dev[std_dev <= threshold].index

# Drop the constant (zero standard deviation) columns
df_for_stv = df_for_stv.drop(columns=columns_to_drop)
df_for_stv

In [10]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the k descriptors that score highest with f_classif against the label
k = 15
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(df_for_stv, df["ALDH1_inhibition"])

# Translate the selector's support indices back into column names
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = df_for_stv.columns[selected_feature_indices]
df_selected = df_for_stv[selected_feature_names]

# Rank every candidate feature by its score and show the k best
feature_scores = selector.scores_
feature_scores_df = pd.DataFrame(
    {'Feature': df_for_stv.columns, 'Score': feature_scores}
).sort_values('Score', ascending=False)
print(feature_scores_df.head(k))

In [11]:
# Pairwise correlation restricted to the selected descriptors
correlation = df_selected.corr()
f = plt.figure(figsize=(19, 15))
plt.matshow(correlation, fignum=f.number)

# One tick per numeric column, labelled with the column name
numeric_columns = df_selected.select_dtypes(['number']).columns
tick_positions = range(len(numeric_columns))
plt.xticks(tick_positions, numeric_columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, numeric_columns, fontsize=14)

cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=20);

### Check missing values 


In [12]:
# Total number of NaN cells over all selected descriptor columns
missing_values = df_selected.isna().sum().sum()
print('Remove missing values' if missing_values > 0 else 'No missing_values')

### Check duplicates

In [13]:
# Boolean Series marking rows that repeat an earlier row of df_selected
check_duplicate = df_selected.duplicated()
print("Duplicates exist in the DataFrame." if check_duplicate.any() else "All GOOD!")

## Principal Component Analysis (PCA)

In [14]:
# Min-max scale the selected descriptors; the fitted scaler is kept in `scaler`
df_copied = df_selected.copy()
scaler = MinMaxScaler().fit(df_copied)
df_scaled = scaler.transform(df_copied)

In [15]:
# Fit a PCA on the scaled descriptors and project the training data onto it
pca = PCA()
df_pca = pca.fit_transform(df_scaled)

# Wrap the scores in a DataFrame with columns PC1, PC2, ...
component_names = [f"PC{number}" for number in range(1, df_pca.shape[1] + 1)]
df_pca_converted = pd.DataFrame(df_pca, columns=component_names)

df_pca_converted.head()

In [16]:
# Share of the total variance captured by each component, printed in percent
evr = pca.explained_variance_ratio_
print(evr*100)

# Cumulative variance
# NOTE(review): an earlier note here said 46 PCs are needed for 90 % and 11 for
# 60 %; the PCA is now fit on the k = 15 selected features, so that note looks
# stale - re-read the numbers from the printed output.
cv = evr.cumsum()
print(cv)

In [17]:
# Two panels side by side: per-component variance (left), cumulative variance (right)
fig, axs = plt.subplots(1, 2)
ax_individual, ax_cumulative = axs
n = pca.n_components_
grid = np.arange(1, n + 1)

# Left panel: explained variance ratio per component
ax_individual.bar(grid, evr)
ax_individual.set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))

# Right panel: cumulative curve starting at the origin, with a horizontal line at 0.9
ax_cumulative.plot(np.insert(grid, 0, 0), np.insert(cv, 0, 0), "o-")
ax_cumulative.plot([0, n], [0.9, 0.9], color='k', linestyle='-', linewidth=2)
ax_cumulative.set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))

# Figure size and resolution
fig.set(figwidth=8, dpi=100);

In [18]:
# Summary figure: individual (bars) and cumulative (steps) explained variance
component_numbers = range(1, len(evr) + 1)   # cv has the same length as evr
plt.bar(component_numbers, evr, alpha=0.5, align='center', label='Individual explained variance ')
plt.step(component_numbers, cv, where='mid', label='Cumulative explained variance ')

plt.xlabel('Principal components (PCs)')
plt.ylabel('variance')
plt.title('Expained and cumulative variance for each principal component', fontsize=15)

plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [19]:
# Compute, for every test molecule, exactly the descriptors that were selected
# on the training data, in the same column order as df_selected (the fitted
# scaler and PCA expect the features in that order).
selected_descriptors = list(df_selected.columns)
descriptor_data = []
for row_index, smiles in X_test_data['SMILES'].items():
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles returns None for an unparsable SMILES; fail with a clear message
    if mol is None:
        raise ValueError(f"Row {row_index}: could not parse SMILES {smiles!r}")
    descriptor_data.append([getattr(Descriptors, descriptor)(mol) for descriptor in selected_descriptors])

# Reuse the test index so the column-wise concat aligns row by row
descriptor_df = pd.DataFrame(descriptor_data, columns=selected_descriptors, index=X_test_data.index)
X_test_all_columns = pd.concat([X_test_data, descriptor_df], axis=1)

# The tree is trained on min-max scaled, PCA-projected data (df_pca), so the test
# descriptors must go through the SAME fitted scaler and PCA before prediction.
# Previously the raw descriptor values were passed to the tree, i.e. it was
# evaluated in a different feature space than it was trained in.
X_test = pca.transform(scaler.transform(X_test_all_columns[selected_descriptors]))
# Strip optional quotes so the test labels use the same strings as the training labels
y_test = X_test_data['ALDH1_inhibition'].str.strip('"')
X_train = df_pca
y_train = original_df['ALDH1_inhibition']

# Fixed random_state so that re-running the notebook reproduces the same tree
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

### Loadings

In [20]:
# pca.components_ has one row per component and one column per feature;
# label it that way first, then transpose so that the rows are the original
# features and the columns are the principal components.
loadings = pd.DataFrame(
    pca.components_,
    index=component_names,
    columns=df_selected.columns,
).T
loadings

In [21]:
def plot_loadings(PC_1, PC_2):
    """Scatter the feature loadings of two principal components.

    PC_1 and PC_2 are column names of the global `loadings` frame (e.g. 'PC1');
    PC_1 is drawn on the x-axis and PC_2 on the y-axis. Every point is coloured
    by the feature name taken from the index of `loadings`.
    """
    sns.set_style('white')
    sns.scatterplot(data=loadings, x=PC_1, y=PC_2, hue=loadings.index, palette='Paired')

    # Dotted guides: both axes through the origin (gray) and a slope-1 line (red)
    plt.axhline(y=0, color='gray', linestyle='dotted')
    plt.axvline(x=0, color='gray', linestyle='dotted')
    plt.axline((-0.2, -0.2), slope=1, color='r', linestyle='dotted')

    # Legend is anchored outside the axes, to the right
    plt.legend(
        ncol=5,
        title='Variables',
        loc='center left',
        bbox_to_anchor=(1.0, 0.5),
    )

In [22]:
# Original note (translated): nothing is readable this way - too many variables are included.
plot_loadings(PC_1='PC1', PC_2='PC2')

# Read the explained variance from the fitted PCA instead of hard-coding it: the
# previously typed-in values (45.01 % / 21.57 %) disagreed with the values used
# on the score plots for the same components.
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Loadings on PC1 (EV = {ev_pct[0]:.2f} %)')
plt.ylabel(f'Loadings on PC2 (EV = {ev_pct[1]:.2f} %)')
plt.title('Loadings principal components 1 and 2', fontsize=15);

### Score plots 

In [23]:
def plot_scores(label,PC_1, PC_2):
    """Scatter the PCA scores of two components, coloured by a column of original_df.

    label is a column name of the global `original_df` used for the hue and as
    the legend title. PC_1 and PC_2 are zero-based column indices into the
    global score matrix `df_pca` (x-axis and y-axis respectively).
    """
    sns.set_style('white')

    x_scores = df_pca[:, PC_1]
    y_scores = df_pca[:, PC_2]
    sns.scatterplot(x=x_scores, y=y_scores, hue=original_df[label], palette='bright')

    # Dotted gray guides through the origin
    plt.axhline(y=0, color='gray', linestyle='dotted')
    plt.axvline(x=0, color='gray', linestyle='dotted')

    plt.legend(loc='best', ncol=2, title=label)

In [24]:
# Dump the raw PCA score matrix returned by pca.fit_transform (rows follow df_scaled, one column per component)
print(df_pca)

In [25]:
# Score plot of PC1 vs PC2 (plot_scores takes zero-based component indices)
pc_x, pc_y = 0, 1
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Take the explained variance from the fitted PCA so the labels cannot go stale
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);

In [26]:
# Score plot of PC3 vs PC4 (plot_scores takes zero-based component indices)
pc_x, pc_y = 2, 3
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Take the explained variance from the fitted PCA so the labels cannot go stale
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);

In [27]:
# Score plot of PC5 vs PC6 (plot_scores takes zero-based component indices)
pc_x, pc_y = 4, 5
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Take the explained variance from the fitted PCA so the labels cannot go stale
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);

In [28]:
# Score plot of PC7 vs PC8 (plot_scores takes zero-based component indices)
pc_x, pc_y = 6, 7
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Take the explained variance from the fitted PCA so the labels cannot go stale
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);

In [29]:
# Score plot of PC9 vs PC10 (plot_scores takes zero-based component indices)
pc_x, pc_y = 8, 9
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Take the explained variance from the fitted PCA so the labels cannot go stale
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);

In [30]:
# Score plot of PC1 vs PC11. plot_scores takes zero-based indices, so index 10
# is the 11th component; the y-axis was previously mislabelled as PC10.
pc_x, pc_y = 0, 10
plot_scores(label='ALDH1_inhibition', PC_1=pc_x, PC_2=pc_y)

# Derive both the component number and its explained variance from the indices
# and the fitted PCA so that label and data always agree.
ev_pct = pca.explained_variance_ratio_ * 100
plt.xlabel(f'Scores on PC{pc_x + 1} (EV = {ev_pct[pc_x]:.2f} %)')
plt.ylabel(f'Scores on PC{pc_y + 1} (EV = {ev_pct[pc_y]:.2f} %)')
plt.title('Scores separated by ALDH1 inhibition', fontsize=15);