# Сравнение различных методов сжатия

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import rdBase, DataStructs
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

DATA_PATH = 'project1/'
df = pd.read_csv(DATA_PATH+'delaney-processed.csv')
PandasTools.AddMoleculeColumnToFrame(df,'smiles','molec')

# Разделение датасета

In [4]:
def razdelit_dataset(dataframe):
    train_df = dataframe.sample(frac=.8, random_state=43)
    test_df = dataframe[~dataframe.index.isin(train_df.index)]
    return train_df, test_df

# Создаем Morgan Fingerprint(MF)

In [5]:
def m_f(dataframe,k,n):
    MF_list=[]
    for i in range(len(dataframe[['molec']])):
      mol = AllChem.GetMorganFingerprintAsBitVect((dataframe.loc[i,'molec']), k, nBits=n)
      MF_list.append(mol)
    MF_ar = np.asanyarray(MF_list)
    return pd.DataFrame(MF_ar)
k=2

df4096,df2048,df1024,df512 = m_f(df,k,4096),m_f(df,k,2048),m_f(df,k,1024),m_f(df,k,512)

# Используем RandomForest на MF

In [6]:
def ForestMF(df_MF,dataframe):
    depend = 'measured log solubility in mols per litre'
    df_MF[depend] = dataframe[depend]
    razdelit_dataset(df_MF)

    train_df_MF,test_df_MF = [],[]
    train_df_MF, test_df_MF = razdelit_dataset(df_MF)
    
    train_y_MF = np.asanyarray(train_df_MF[[depend]])
    test_y_MF = np.asanyarray(test_df_MF[[depend]])

    train_df_MF = train_df_MF.drop(columns=depend)
    test_df_MF = test_df_MF.drop(columns=depend)

    train_x_MF = np.asanyarray(train_df_MF)
    test_x_MF = np.asanyarray(test_df_MF)

    regr_MF = RandomForestRegressor(max_depth=9,random_state=0)
    regr_MF.fit(train_x_MF, train_y_MF.ravel())
    
    test_Y_MF = regr_MF.predict(test_x_MF)
    return r2_score(test_y_MF,test_Y_MF)


In [7]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.decomposition import NMF

# Используем сжатие PCA

In [8]:
def PCA_A(MF_ar,com,df):
    depend = 'measured log solubility in mols per litre'
    if depend in MF_ar:
         MF_ar = MF_ar.drop(columns=depend)
    MF_ar = np.asanyarray(MF_ar)
    train_df, test_df = razdelit_dataset(df)
    train_yMF,test_yMF = np.asanyarray(train_df[depend]), np.asanyarray(test_df[depend])
    
    pca = PCA(n_components=com)
    crds_MF_pca = pca.fit_transform(MF_ar)
    crds_dfMF_pca = pd.DataFrame(crds_MF_pca)
    
    train_crds_dfMF,test_crds_dfMF = razdelit_dataset(crds_dfMF_pca)

    train_X_pca_MF = np.asanyarray(train_crds_dfMF)
    test_X_pca_MF = np.asanyarray(test_crds_dfMF)
    
    regrMF_crds = RandomForestRegressor(max_depth=9)
    regrMF_crds.fit(train_X_pca_MF, train_yMF.ravel())
    
    test_Y_MF_crds = regrMF_crds.predict(test_X_pca_MF)
    return r2_score(test_yMF,test_Y_MF_crds)
print(PCA_A(df4096,100,df))

0.566823861864618


# Используем сжатие T-SNE

In [9]:
def T_SNE_A(MF_ar,com,df):
    depend = 'measured log solubility in mols per litre'
    if depend in MF_ar:
         MF_ar = MF_ar.drop(columns=depend)
    MF_ar = np.asanyarray(MF_ar)
    train_df, test_df = razdelit_dataset(df)
    train_yMF,test_yMF = np.asanyarray(train_df[depend]), np.asanyarray(test_df[depend])
    
    t_sne = TSNE(n_components=3)
    crds_MF_T_SNE = t_sne.fit_transform(MF_ar)
    crds_dfMF_T_SNE = pd.DataFrame(crds_MF_T_SNE)
    
    train_crds_dfMF,test_crds_dfMF = razdelit_dataset(crds_dfMF_T_SNE)

    train_X_T_SNE_MF = np.asanyarray(train_crds_dfMF)
    test_X_T_SNE_MF = np.asanyarray(test_crds_dfMF)
    
    regrMF_crds = RandomForestRegressor(max_depth=9)
    regrMF_crds.fit(train_X_T_SNE_MF, train_yMF.ravel())
    
    test_Y_MF_crds = regrMF_crds.predict(test_X_T_SNE_MF)
    return r2_score(test_yMF,test_Y_MF_crds)
print(T_SNE_A(df4096,3,df))

0.3338339739916132


# Используем сжатие NMF

In [10]:
def NMF_A(MF_ar,com,df):
    depend = 'measured log solubility in mols per litre'
    if depend in MF_ar:
         MF_ar = MF_ar.drop(columns=depend)
    MF_ar = np.asanyarray(MF_ar)
    train_df, test_df = razdelit_dataset(df)
    train_yMF,test_yMF = np.asanyarray(train_df[depend]), np.asanyarray(test_df[depend])
    
    nmf = NMF(n_components=com)
    crds_MF_NMF = nmf.fit_transform(MF_ar)
    crds_dfMF_NMF = pd.DataFrame(crds_MF_NMF)
    
    train_crds_dfMF,test_crds_dfMF = razdelit_dataset(crds_dfMF_NMF)

    train_X_NMF_MF = np.asanyarray(train_crds_dfMF)
    test_X_NMF_MF = np.asanyarray(test_crds_dfMF)
    
    regrMF_crds = RandomForestRegressor(max_depth=9)
    regrMF_crds.fit(train_X_NMF_MF, train_yMF.ravel())
    
    test_Y_MF_crds = regrMF_crds.predict(test_X_NMF_MF)
    return r2_score(test_yMF,test_Y_MF_crds)
print(NMF_A(df4096,100,df))

0.5092363155700702


# Сравнение результатов

In [17]:
met = ['Random_Forest','PCA','t-sne','NMF']

a512 = [ForestMF(df4096,df)*100,PCA_A(df4096,100,df)*100,T_SNE_A(df4096,100,df)*100,NMF_A(df4096,100,df)*100]
a1024 = [ForestMF(df2048,df)*100,PCA_A(df2048,100,df)*100,T_SNE_A(df2048,100,df)*100,NMF_A(df2048,100,df)*100]
a2048 = [ForestMF(df1024,df)*100,PCA_A(df1024,100,df)*100,T_SNE_A(df1024,100,df)*100,NMF_A(df1024,100,df)*100]
a4096 = [ForestMF(df512,df)*100,PCA_A(df512,100,df)*100,T_SNE_A(df512,100,df)*100,NMF_A(df512,100,df)*100]
df_final = pd.DataFrame({'Methods':met,'512':a512,'1024':a1024,'2048':a2048,'4096':a4096})

In [18]:
df_final

Unnamed: 0,Methods,512,1024,2048,4096
0,Random_Forest,56.546963,58.247201,59.29023,56.579341
1,PCA,56.979886,58.019152,61.046089,54.247764
2,t-sne,30.73867,28.186172,35.511978,29.472313
3,NMF,51.036191,53.225508,47.478637,49.815647
