In [None]:
from math import sqrt
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import r_regression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GroupKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler

# Dataset variant (which melting-point feature to keep)

In [None]:
# Suffix selecting which dataset variant this notebook builds. It decides which
# melting-point columns are dropped further down and is embedded in the names
# of the files written by this notebook.
#   '_PredMP' -> drops 'Drug_Collected_Melting_temp (K)' (predicted value stays)
#   '_ExpMP'  -> drops 'Drug_Predicted_Melting_temp (K)' (collected value stays)
#   '_NoMP'   -> drops both melting-point columns
#data_for = '_PredMP'

#data_for = '_ExpMP'

data_for = '_NoMP'

In [None]:
# Location of the raw dataset CSV, relative to the notebook's working directory.
file_path = "../Exp_1_Raw_Dataset/Raw_dataset_dataset_20240705.csv"

# Load the full table; later cells rely on its 'Type' column holding the
# labels 'Train', 'Test' and 'Lab'.
data = pd.read_csv(file_path)

In [None]:
# Order the rows by 'Type' in descending label order and renumber them 0..n-1.
# For the three labels used in this notebook that yields Train, then Test,
# then Lab, which is the same order in which the splits are concatenated later.
data = (
    data
    .sort_values('Type', ascending=False)
    .reset_index(drop=True)
)
data

In [None]:
# Split the table into its three subsets using the label stored in 'Type'.
train = data[data['Type'].eq('Train')]
test = data[data['Type'].eq('Test')]
lab = data[data['Type'].eq('Lab')]

In [None]:
# Columns that are removed from every split before the feature matrices are built.
cols_to_drop = ['Drug', 'Solvent_1', 'Solvent_2',
                'Drug-solvent system', 'Solubility (g/100g)', 'LogS', 'Class', 'Drug_SMILES', 'Type']

# Add the melting-point column(s) that the selected dataset variant must not use.
if data_for == '_ExpMP':
    cols_to_drop = cols_to_drop + ['Drug_Predicted_Melting_temp (K)']
elif data_for == '_PredMP':
    cols_to_drop = cols_to_drop + ['Drug_Collected_Melting_temp (K)']
elif data_for == '_NoMP':
    cols_to_drop = cols_to_drop + ['Drug_Predicted_Melting_temp (K)'] + ['Drug_Collected_Melting_temp (K)']
else:
    # Stop here instead of printing and continuing: with an unrecognised variant
    # neither melting-point column would be dropped, and both would silently
    # end up in the feature matrices and in the files written later.
    raise ValueError(
        f"Unknown data_for value {data_for!r}; expected '_ExpMP', '_PredMP' or '_NoMP'"
    )

cols_to_drop

In [None]:
# Feature matrices: each split with the non-feature columns removed.
X_train = train.drop(columns=cols_to_drop)
X_test = test.drop(columns=cols_to_drop)
X_lab = lab.drop(columns=cols_to_drop)

In [None]:
# (rows, columns) of the training feature matrix after dropping non-feature columns.
X_train.shape

In [None]:
# (rows, columns) of the lab feature matrix after dropping non-feature columns.
X_lab.shape

# Remove the constant (zero-variance) columns

In [None]:
# Per-column variance measured on the training split only.
variance = X_train.var()

# Columns whose training variance is exactly zero carry a single constant value.
columns_to_drop = variance.loc[variance.eq(0)].index

# Remove them from the training matrix and show the resulting (rows, columns).
X_train = X_train.drop(columns=columns_to_drop)
X_train.shape

# Remove the highly correlated columns

In [None]:
def reduce_columns_by_correlation(data, threshold):
    """Greedily drop columns that are highly correlated with an earlier kept column.

    Columns are visited from left to right. Each column that has not already
    been marked for removal is kept, and every other column whose absolute
    correlation with it (as computed by ``DataFrame.corr`` with its default
    settings) is strictly greater than ``threshold`` is marked for removal.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are screened.
    threshold : float
        Absolute-correlation cut-off; only values strictly above it count as
        "highly correlated".

    Returns
    -------
    pandas.DataFrame
        A new frame without the marked columns. The frame passed by the caller
        is not modified.
    """
    corr_matrix = data.corr().abs()
    columns_to_drop = set()

    # The last column is never used as a reference: every pairing that involves
    # it has already been examined from the other column's side.
    for col in data.columns[:-1]:
        if col in columns_to_drop:
            continue

        high_corr_columns = corr_matrix.loc[col, corr_matrix.loc[col, :] > threshold].index.tolist()

        # A column always correlates perfectly with itself; never drop the reference.
        if col in high_corr_columns:
            high_corr_columns.remove(col)

        columns_to_drop.update(high_corr_columns)

    return data.drop(columns=list(columns_to_drop))


In [None]:
# Prune training columns whose absolute pairwise correlation exceeds 0.8,
# then show the remaining (rows, columns).
X_train = reduce_columns_by_correlation(data=X_train, threshold=0.8)
X_train.shape

In [None]:
# Restrict the test and lab matrices to the columns that survived on the training split.
X_test = X_test.loc[:, X_train.columns]
X_lab = X_lab.loc[:, X_train.columns]

In [None]:
# Standardisation parameters are learned from the training split only and then
# applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled, X_lab_scaled = (
    scaler.transform(split) for split in (X_train, X_test, X_lab)
)

# DataFrame views of the scaled arrays that carry the feature names.
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)
X_lab_scaled_df = pd.DataFrame(X_lab_scaled, columns=X_lab.columns)

# PCA with default settings, likewise fitted on the scaled training split only
# and then used to project every split.
pca = PCA().fit(X_train_scaled)

X_train_pca, X_test_pca, X_lab_pca = (
    pca.transform(scaled) for scaled in (X_train_scaled, X_test_scaled, X_lab_scaled)
)

In [None]:
# Colour palette used by the figure below.
red = '#DD706E'
grey = '#515265'
yellow = '#FAAF3A'
blue = '#3A93C2'


# Fraction of cumulative explained variance that the retained components must reach.
threshold = 0.95

explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
# argmax on a boolean array gives the position of the first True, so this is the
# number of leading components needed to reach the threshold. If no entry
# reached it, argmax would return 0 and the result would be 1.
pc_number_threshold = np.argmax(cumulative_explained_variance >= threshold) + 1


# Per-component and cumulative explained variance (in percent) for the retained components.
pca_summary_threshold = pd.DataFrame({
    'PCA': range(1, pc_number_threshold + 1),
    'Variance': explained_variance_ratio[:pc_number_threshold] * 100,
    'Cumulative Variance': cumulative_explained_variance[:pc_number_threshold] * 100
})


fontsize = 16
fig, ax1 = plt.subplots(figsize=(12, 6))
# Left axis: bars with the variance explained by each individual component.
ax1.bar(pca_summary_threshold['PCA'], pca_summary_threshold['Variance'], alpha=1.0, label='Variance explained', color=blue)


# Right axis (shares the x axis with ax1): cumulative explained variance.
ax2 = ax1.twinx()

ax2.plot(pca_summary_threshold['PCA'], pca_summary_threshold['Cumulative Variance'], 
         marker='o', markersize=8, markeredgewidth=0.5, markeredgecolor='black', 
         linestyle='-', color=red, label='Cumulative variance explained')



# Shade the area under the cumulative curve.
ax2.fill_between(pca_summary_threshold['PCA'], 0.0, pca_summary_threshold['Cumulative Variance'], color=red, alpha=0.2)


# Highlight the last retained component, i.e. the point where the threshold is reached.
last_pca = pca_summary_threshold['PCA'].iloc[-1]
last_cumulative_variance = pca_summary_threshold['Cumulative Variance'].iloc[-1]
ax2.scatter(last_pca, last_cumulative_variance, color=grey, zorder=5, s = 70)  


# Label that point with "(component count, cumulative %)".
# NOTE(review): the text offset of 15 units on both axes is hard-coded, so its
# placement depends on how many components are retained - confirm for each variant.
ax2.annotate(f'({last_pca}, {last_cumulative_variance:.2f}%)', 
             xy=(last_pca, last_cumulative_variance), 
             xytext=(last_pca-15, last_cumulative_variance-15),
             fontsize=fontsize,
             arrowprops=dict(facecolor='black', arrowstyle='->'))


# The title is deliberately set to an empty string; only axis labels are shown.
ax1.set_title('', fontsize=fontsize)
ax1.set_xlabel('Number of PCA components', fontsize=fontsize)
ax1.set_ylabel('Variance explained (%)', fontsize=fontsize)
ax2.set_ylabel('Cumulative variance explained (%)', fontsize=fontsize)


# NOTE(review): the left axis is capped at 10 %, so any bar taller than that is
# cut off at the top of the plot - confirm no component exceeds 10 %.
ax1.set_ylim(0, 10)
ax2.set_ylim(0, 110)
ax2.set_xlim(-2, last_pca+2)


ax1.tick_params(axis='both', which='major', labelsize=fontsize)
ax2.tick_params(axis='y', which='major', labelsize=fontsize)


# Merge the legend entries of both axes into a single legend drawn on ax1.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
handles = handles1 + handles2
labels = labels1 + labels2
# `lgd` is not used again within this cell.
lgd = ax1.legend(handles, labels, loc='upper left', bbox_to_anchor=(0.1, 1), fontsize=fontsize-2, frameon=False, ncol=2)


# Transparent axes and figure backgrounds, no grid.
ax1.set_facecolor('none')
fig.patch.set_alpha(0.0)
ax1.grid(False)


# Write the figure as a 600-dpi PNG into the working directory; the file name
# carries the dataset-variant suffix.
fig.canvas.print_figure('Figure_SI' + data_for + '_PCA.png', dpi=600)

plt.show()

In [None]:
# Names of the leading components retained by the variance threshold: PC1 .. PCk.
feat_to_keep = [f"PC{k}" for k in range(1, pc_number_threshold + 1)]

# Names for every column of the PCA-transformed arrays.
pca_all_feat = [f"PC{k}" for k in range(1, X_train_pca.shape[1] + 1)]

# Columns copied back from the raw table next to the features.
col_to_add = [
    'Drug',
    'Solvent_1',
    'Solvent_2',
    'Drug-solvent system',
    'LogS',
    'Class',
    'Solubility (g/100g)',
]


In [None]:
def dataset_generator(X_train_array, X_test_array, X_lab_array, data, all_feat, feat_to_keep, col_to_add):
    """Stack the train/test/lab feature arrays into one labelled table.

    Each array is wrapped in a DataFrame with ``all_feat`` as column names,
    reduced to ``feat_to_keep`` and tagged with a 'Type' column ('Train',
    'Test' or 'Lab'). The three frames are concatenated in that order with a
    fresh 0..n-1 index, after which every column named in ``col_to_add`` is
    copied over from ``data``.

    The copy relies on index alignment, so ``data`` is expected to hold its
    rows in the same Train, Test, Lab order under a default 0..n-1 index.
    Three printed checks compare the 'Type', 'LogS' and 'Drug' columns of the
    result with those of ``data``, and the number of 'Drug-solvent system'
    values shared by the train and test rows is printed as well. Nothing is
    raised when a check fails.

    Parameters
    ----------
    X_train_array, X_test_array, X_lab_array : array-like
        Feature values of the three splits; each needs one column per entry
        of ``all_feat``.
    data : pandas.DataFrame
        Source of the extra columns; must contain 'Type', 'LogS', 'Drug' and
        everything listed in ``col_to_add``.
    all_feat : list-like of str
        Column names for the input arrays.
    feat_to_keep : list-like of str
        Subset of ``all_feat`` kept in the output.
    col_to_add : list-like of str
        Columns of ``data`` appended to the output; must include
        'Drug-solvent system', 'LogS' and 'Drug' for the checks to run.

    Returns
    -------
    pandas.DataFrame
        The combined table: kept features, 'Type', then the added columns.
    """
    labelled_splits = (
        ('Train', X_train_array),
        ('Test', X_test_array),
        ('Lab', X_lab_array),
    )

    frames = []
    for split_name, feature_array in labelled_splits:
        frame = pd.DataFrame(feature_array, columns=all_feat)[feat_to_keep]
        frame['Type'] = split_name
        frames.append(frame)

    combined_df = pd.concat(frames, ignore_index=True)

    # Assigning a Series aligns on the index, hence the ordering requirement on `data`.
    for extra_col in col_to_add:
        combined_df[extra_col] = data[extra_col]

    systems = combined_df['Drug-solvent system']
    train_systems = set(systems[combined_df['Type'] == 'Train'])
    test_systems = set(systems[combined_df['Type'] == 'Test'])
    overlapping_groups = train_systems & test_systems

    # Report whether the combined rows line up with the source table.
    checks = (
        ('First check:  ', 'Type'),
        ('Second check: ', 'LogS'),
        ('Third check:  ', 'Drug'),
    )
    for prefix, column in checks:
        outcome = 'passed' if combined_df[column].equals(data[column]) else 'failed'
        print(prefix + outcome)

    print("Number of overlapping train/test groups: ", len(overlapping_groups))

    return combined_df

In [None]:
# Assemble the PCA-based dataset (retained components plus the descriptive
# columns from the raw table) and show its (rows, columns).
dataset_PCA = dataset_generator(
    X_train_array=X_train_pca,
    X_test_array=X_test_pca,
    X_lab_array=X_lab_pca,
    data=data,
    all_feat=pca_all_feat,
    feat_to_keep=feat_to_keep,
    col_to_add=col_to_add,
)
dataset_PCA.shape


In [None]:
# Display the assembled PCA dataset for visual inspection.
dataset_PCA

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no effect when it is already there).
Path('Summary_and_dataset').mkdir(parents=True, exist_ok=True)

# Save the PCA dataset; the file name carries the dataset-variant suffix.
dataset_PCA.to_csv('Summary_and_dataset/PCA_dataset' + data_for + '.csv', index = False)

In [None]:
# (rows, columns) of the scaled training features.
X_train_scaled_df.shape

In [None]:
# Number of 'LogS' values in the training split, for comparison with the row count above.
train['LogS'].shape

In [None]:
# Assemble the dataset of scaled (non-PCA) features, keeping every column that
# survived the filtering steps, and show its (rows, columns).
refined_dataset = dataset_generator(
    X_train_array=X_train_scaled,
    X_test_array=X_test_scaled,
    X_lab_array=X_lab_scaled,
    data=data,
    all_feat=X_train.columns,
    feat_to_keep=X_train_scaled_df.columns,
    col_to_add=col_to_add,
)
refined_dataset.shape


In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before writing (no effect when it is already there).
Path('Summary_and_dataset').mkdir(parents=True, exist_ok=True)

# Save the scaled-feature dataset; the file name carries the dataset-variant suffix.
refined_dataset.to_csv('Summary_and_dataset/refined_dataset' + data_for + '.csv', index = False)
