In [1]:
from feature_engineering.Pipe import PipeLine,PipeUnion
from feature_engineering.data_process import *
from feature_engineering.feature_selection import *
from feature_engineering.feature_extraction import *
import sklearn.datasets as ds
import pandas as pd
import numpy as np

In [14]:
def get_x(path=None):
    """Load the molecular-descriptor table, indexed by SMILES.

    Parameters
    ----------
    path : str or path-like, optional
        Excel file to read. When ``None`` a hard-coded local default is used.

    Returns
    -------
    pandas.DataFrame
        The spreadsheet contents with the ``"SMILES"`` column moved into the index.
    """
    if path is None:
        # NOTE(review): machine-specific default; pass `path` explicitly elsewhere.
        path = r"D:\QQ\qqchatdata\295746332\FileRecv\第三次模拟\Molecular_Descriptor.xlsx"
    return pd.read_excel(path).set_index("SMILES")

def get_y(path=None):
    """Load the activity table and return its ``pIC50`` column indexed by SMILES.

    Parameters
    ----------
    path : str or path-like, optional
        Excel file to read. When ``None`` a hard-coded local default is used.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``"pIC50"``) whose index is the ``"SMILES"`` column.
    """
    if path is None:
        # NOTE(review): machine-specific default; pass `path` explicitly elsewhere.
        path = r"D:\QQ\qqchatdata\295746332\FileRecv\第三次模拟\ERα_activity.xlsx"
    activity = pd.read_excel(path)
    # Keep only the target and the key column before promoting the key to the index.
    wanted = activity.loc[:, ['pIC50', 'SMILES']]
    return wanted.set_index("SMILES")

def get_info(data):
    """Summarise every column of ``data`` in one small table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with three columns:
        ``unique_value`` (distinct non-null values), ``dtypes`` (dtype name
        as a string) and ``null`` (number of missing entries).
    """
    summaries = [
        data.nunique(),
        data.dtypes.apply(lambda dt: dt.name),
        data.isnull().sum(0),
    ]
    # `keys` names the resulting columns directly while concatenating side by side.
    return pd.concat(summaries, axis=1, keys=["unique_value", "dtypes", "null"])

In [4]:
# Load the descriptor table (features) and the pIC50 table (target) from
# their default locations; both are indexed by SMILES.
x = get_x()
y = get_y()

1974

In [5]:
# Most frequent value of every column; row 0 of `mode()` holds the first mode per column.
x.mode().loc[0]

nAcid       0.000000
ALogP       1.857900
ALogp2      3.451792
AMR       148.868200
apol       77.158583
             ...    
WTPT-5      0.000000
WPATH     694.000000
WPOL       35.000000
XLogP       2.701000
Zagreb    182.000000
Name: 0, Length: 729, dtype: float64

In [53]:
# Baseline step: a single DropNA stage with threshold 0.2.
# NOTE(review): presumably removes columns/rows whose missing ratio exceeds 20% —
# confirm against feature_engineering.data_process.DropNA.
pipe1 = PipeLine([DropNA(threshold=0.2)])

In [56]:
class DtypesSelector(BaseProcess):
    """Keep only the columns of a frame that match ``dtypes`` and cast them to it.

    ``dtypes`` is handed to both ``DataFrame.select_dtypes(include=...)`` and
    ``DataFrame.astype(dtype=...)``, so it has to be a value both calls accept
    (e.g. ``"float"`` or ``"int"``).
    """

    def __init__(self,dtypes):
        """Store the dtype specification used for selection and casting."""
        super().__init__()
        self.dtypes = dtypes

    def _transform(self,X,y=None):
        """Return the matching columns of ``X`` cast to ``self.dtypes``.

        When ``y`` is given it is passed through unchanged and the result is
        the tuple ``(selected, y)``; otherwise only the selected frame is returned.
        """
        selected = X.select_dtypes(include=self.dtypes).astype(dtype=self.dtypes)
        if y is None:
            return selected
        return (selected,y)

class CateDetectOutlier(BaseProcess):
    """Treat rarely occurring values of each column as outliers.

    For every column the value frequencies are counted; values whose count is
    strictly below the threshold are considered rare and are either replaced
    by NaN or have their rows removed.

    Parameters
    ----------
    threshold : float or int, default 0.01
        If ``0 <= threshold < 1`` it is a fraction of the number of rows of the
        frame passed to ``_transform`` (truncated to an int). Any other value
        is used directly as an absolute count.
    handle : {'drop', 'nan'}, default 'nan'
        ``'drop'`` removes the rows containing a rare value, ``'nan'``
        replaces the rare values with NaN.
    """

    def __init__(self,threshold=0.01,handle = 'nan'):
        assert handle in ['drop','nan'],'handle must be drop or nan'
        super().__init__()
        self.threshold = threshold
        self.handle = handle

    def _transform(self,X,y=None):
        """Apply the rare-value handling column by column.

        Returns the processed frame, or ``(frame, y.loc[frame.index])`` when
        ``y`` is given so the target stays aligned with the surviving rows.
        The input frame itself is never modified.

        Raises
        ------
        ValueError
            If ``self.handle`` is neither ``'drop'`` nor ``'nan'``.
        """
        # Validate once up front instead of on every column iteration.
        if self.handle not in ('drop','nan'):
            raise ValueError("handle must be drop or nan")

        # Fractional thresholds are relative to the row count of the incoming frame.
        if 0<=self.threshold<1:
            min_count = int(X.shape[0]*self.threshold)
        else:
            min_count = self.threshold

        if self.handle == 'nan':
            # Column assignment below would otherwise write into the caller's
            # DataFrame (or into a slice of it); work on a private copy.
            X = X.copy()

        for col in list(X.columns):
            counts = X[col].value_counts()
            rare_values = counts[counts<min_count].index
            is_rare = X[col].isin(rare_values)
            if self.handle == 'drop':
                # Rebinding keeps the input untouched; later columns are
                # counted on the already filtered rows.
                X = X[~is_rare]
            else:
                # Vectorized replacement of rare values by NaN.
                X[col] = X[col].mask(is_rare)
        return X if y is None else (X,y.loc[X.index])

# Float branch: keep float columns, fill missing values with the median,
# then apply the 'robust' scaler.
pipe_float = PipeLine([DtypesSelector(dtypes="float"),
                       FillNA(method='median'),Scaler(method='robust')])
# Int branch: keep int columns and fill missing values with the mode.
pipe_int = PipeLine([DtypesSelector(dtypes="int"),
                     FillNA(method='mode')])
# Outlier passes: flagged values are turned into NaN (handle='nan') and then
# re-filled with the same statistic used in the corresponding branch.
pipe_outlier_float = PipeLine([DetectOutlier(method='iqr',handle='nan'),FillNA(method='median')])
pipe_outlier_int = PipeLine([CateDetectOutlier(threshold=0.01,handle='nan'),FillNA(method='mode')])
# NOTE(review): `+` presumably appends the outlier steps after the branch steps —
# confirm against PipeLine.__add__. Note the float outlier pass runs after scaling.
pipe_float = pipe_float+pipe_outlier_float
pipe_int = pipe_int+pipe_outlier_int
# NOTE(review): PipeUnion presumably runs both branches on the same input and
# joins their outputs column-wise — confirm against feature_engineering.Pipe.
pipe2 = PipeUnion([('float_features',pipe_float),('int_features',pipe_int)])
# Full preprocessing: baseline NA handling followed by the per-dtype branches.
pipe3 = PipeLine([('baseline',pipe1),('parallel process',pipe2)])

In [59]:
# Run the full preprocessing pipeline on the descriptor table.
x1 = pipe3.fit_transform(x)
# Align the target with the rows of the processed features.
# NOTE(review): assumes x1 keeps the SMILES index of `x` — confirm.
y1 = y.loc[x1.index]
# Features and target side by side in one table.
data1 = pd.concat([x1,y1],axis=1)
# NOTE(review): hard-coded absolute local path; the file is overwritten on every run.
save_path = r"D:\QQ\qqchatdata\295746332\FileRecv\第三次模拟\preprocessed1.xlsx"
data1.to_excel(save_path)

In [61]:
# Display the assembled pipeline object via its notebook repr.
pipe3