In [342]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pickle

from collections import Counter
from imblearn.over_sampling import SMOTENC, SMOTE
from autogluon.tabular import TabularPredictor
from scipy import stats
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, ElasticNet, RidgeClassifierCV, RidgeClassifier
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_rows', None)


In [343]:
def read_data(path):
    """Load the CSV file at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [474]:
def data_exploration(df):
    """Print summary statistics and plot distributions and correlations of ``df``.

    The function prints ``describe()`` and ``info()``, draws one histogram per
    column (columns named 'id' and 'label' are skipped), draws a heatmap of the
    correlation matrix, and then prints:

    * the correlation of every column with the label column,
    * for each column that has at least one absolute correlation strictly
      between 0.7 and 1, those values followed by the column's full
      correlation vector,
    * the label correlations once more as a one-column frame.

    The label column is the one named 'label' when present; otherwise the last
    column of the correlation matrix is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to explore.

    Returns
    -------
    None
    """
    print(df.describe())
    # DataFrame.info() writes its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line.
    df.info()

    for col in df.columns:
        if col not in ['id', 'label']:
            sns.histplot(df[col])
            plt.show()

    # The correlation matrix is loop-invariant: compute it once and reuse it.
    corr = df.corr()
    if corr.shape[1] == 0:
        print('No columns available for correlation analysis.')
        return

    sns.heatmap(corr)
    plt.show()

    label_col = 'label' if 'label' in corr.columns else corr.columns[-1]

    print('*'*50,'\n','Label correlation')
    print(corr[label_col])
    print('*'*50, '\n', 'Highly correlated columns')

    abs_corr = corr.abs()
    for col in abs_corr.columns:
        # "< 1" excludes the self-correlation on the diagonal.
        strong = [value for value in abs_corr[col].values.tolist() if (value > 0.7) and (value < 1)]
        if strong:
            print(col, strong)
            print(corr[col])
    print('*'*50)
    print(corr[[label_col]])
    return

In [521]:
def _drop_incomplete_rows(df, label):
    """Drop rows of ``df`` containing NaN and keep ``label`` (may be None) aligned."""
    keep = df.notna().all(axis=1).to_numpy()
    df = df.loc[keep].reset_index(drop=True)
    if label is not None:
        label = label.loc[keep].reset_index(drop=True)
    return df, label


def _minmax_scale(df, fr):
    """Scale every column of ``df`` into the range [0, fr] with a freshly fitted MinMaxScaler."""
    if fr <= 0:
        raise ValueError(f"fr must be greater than 0 for min-max scaling, got {fr!r}")
    scaler = MinMaxScaler(feature_range=(0, fr))
    return pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)


def data_processing(df,  mode , hybrid, drop_cols=None, dropna=False, drop_outlier=0, normalize=None, fr=0, upsampling=False, aggregate=False):
    """Build a model-ready feature table from a raw frame.

    Steps run in this fixed order, each one optional: drop NaN rows, split off
    the 'label' column (train mode), drop columns, aggregate two columns, trim
    outliers, normalize, upsample, build hybrid features, re-append the label.
    Finally the columns are renamed to '0', '1', ... so that, in train mode,
    the label is always the last column.

    The caller's frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input table. Must contain a 'label' column when ``mode == 'train'``.
    mode : str
        'train' separates the label and re-appends it as the last column. Any
        other value treats the frame as unlabeled.
    hybrid : bool
        If true, the output holds log10(x + 1) of every feature followed by a
        min-max scaled copy of every feature (range [0, fr]).
    drop_cols : list or None
        Column names removed from the features.
    dropna : bool
        Drop rows with missing values before anything else.
    drop_outlier : float
        When non-zero, rows with NaN are dropped and only rows whose absolute
        z-score is below this threshold in every column are kept.
    normalize : {'log', 'minmax', 'log_nat', 'log2'} or None
        'log' -> log10(x + 1); 'log_nat' -> ln(x + 1); 'log2' -> log2(x + 1);
        'minmax' -> scaling into [0, fr]. Other values are reported and ignored.
    fr : float
        Upper bound of the min-max range; must be > 0 whenever scaling is used.
    upsampling : bool
        Balance classes with SMOTE. Only applied in train mode.
    aggregate : bool
        Replace 'CD3+T (cells/ul)' and 'CD45+ (cells/ul)' with their sum.

    Returns
    -------
    pandas.DataFrame
        Processed table with positional string column names.

    Raises
    ------
    ValueError
        If min-max scaling is requested with ``fr <= 0``.

    Notes
    -----
    The min-max scaler is fitted inside every call, so a train frame and a
    test frame processed by separate calls are scaled independently.
    """
    print('*'*50)
    print('Mode:', mode)

    # Work on a copy: later steps assign columns and rename them in place.
    df = df.copy()
    label = None

    if dropna:
        df = df.dropna().reset_index(drop=True)
        print('Dropped NA value')

    if mode == 'train':
        label = df['label'].copy()
        df = df.drop('label', axis=1)
        print('saved label', len(label))

    if drop_cols is not None:
        print('Dropped cols', drop_cols)
        df = df.drop(drop_cols, axis=1)

    if aggregate:
        df['aggregate'] = df['CD3+T (cells/ul)'] + df['CD45+ (cells/ul)']
        df = df.drop(['CD3+T (cells/ul)', 'CD45+ (cells/ul)'], axis=1)

    if drop_outlier:
        print('Before trim outlier, df shape:', df.shape)
        # Rows are removed from features and label together so that the label
        # re-appended at the end still belongs to the right row.
        df, label = _drop_incomplete_rows(df, label)
        z_scores = np.abs(stats.zscore(df.to_numpy(dtype=float)))
        keep = (z_scores < drop_outlier).all(axis=1)
        df = df.loc[keep].reset_index(drop=True)
        if label is not None:
            label = label.loc[keep].reset_index(drop=True)
        print('After trim outlier, df shape:', df.shape)

    if normalize is not None:
        if normalize == 'log':
            df = np.log10(df + 1)
            print('Normalized with log10.')
        elif normalize == 'minmax':
            df = _minmax_scale(df, fr)
            print('Normalized with minmax.')
        elif normalize == 'log_nat':
            # log1p already adds 1, matching the log10(x + 1) convention above.
            df = np.log1p(df)
            print('Normalized with natural log.')
        elif normalize == 'log2':
            df = np.log2(df + 1)
            print('Normalized with log2.')
        else:
            print('Unknown normalize option, skipped:', normalize)

    if upsampling:
        if label is None:
            # Synthetic samples need class labels, which only exist in train mode.
            print('Upsampling skipped: only applies to train mode.')
        else:
            df, label = _drop_incomplete_rows(df, label)
            sm = SMOTE(random_state=32)
            # The label was already split off, so every remaining column is a feature.
            X, label = sm.fit_resample(df, label)
            df = pd.DataFrame(X, columns=df.columns)
            print('Upsampled.')

    if hybrid:
        log_part = np.log10(df + 1)
        scaled_part = _minmax_scale(df, fr)
        # Column names repeat here; they are replaced by positions below.
        df = pd.concat([log_part, scaled_part], axis=1)
        print('Created with hybrid')

    if (mode == 'train'):
        print('Append Label.')
        # Features and label were filtered together, so they match row by row.
        df['label'] = np.asarray(label)
    print(df.columns)
    df.columns = [str(i) for i in range(df.shape[1])]

    print('*'*50)
    return df

In [522]:
def _automl_hyperparameters():
    """Return the per-model-family configuration passed to AutoGluon's ``fit``."""
    return {
        'NN_TORCH': {},
        'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}},
                {'ag_args_fit': {'num_gpus': 1}},
                'GBMLarge'],
        'CAT': {},
        'XGB': [
            {'booster':'gbtree', 'tree_method':'exact', 'eta':0.2, 'ag_args': {'name_suffix': '_GBT_EXACT_0_2'}},
            {'booster':'gblinear', 'tree_method':'exact', 'eta':0.2, 'ag_args': {'name_suffix': '_GBL_EXACT_0_2'}},
        ],
        'FASTAI': [{'layers':[200,100]},
                   {'layers':[128,64]},
                   {}],
        'RF': [{'criterion': 'gini', 'n_estimators':200, 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}},
               {'criterion': 'entropy', 'n_estimators':200, 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}},
               {'criterion': 'squared_error', 'n_estimators':200, 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}},],
        'XT': [{'criterion': 'gini', 'n_estimators':200, 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}, },
               {'criterion': 'entropy', 'n_estimators':200, 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}, },
               {'criterion': 'squared_error', 'n_estimators':200, 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression']}},],
        'LR': [{'penalty': 'L1', 'ag_args': {'name_suffix': '_L1'}},
               {'penalty': 'L2', 'ag_args': {'name_suffix': '_L2'}}],
    }


def _grid_search_classifier(estimator, param_grid, X, y):
    """Fit a 5-fold grid search (refit on accuracy), print and return it."""
    search = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring=['accuracy', 'f1', 'roc_auc'],
                          n_jobs=-1, cv=5, verbose=2, return_train_score=True, refit='accuracy')
    search.fit(X, y)
    print('Best estimator:', search.best_estimator_)
    return search


def _ridge_sweep(X, y):
    """Fit a RidgeClassifier for every parameter combination; return the most accurate one on (X, y)."""
    alphas = [0.001, 0.01, 0.1, 1, 10, 0.005, 0.05, 0.5, 5]
    solvers = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']
    class_weights = [None, 'balanced']

    best_acc = 0
    best_params = []
    best_model = None
    for alpha in alphas:
        for solver in solvers:
            for class_weight in class_weights:
                if solver in ['lbfgs', 'auto']:
                    # NOTE(review): with positive=True the 'auto' solver presumably
                    # resolves to 'lbfgs', making these two candidates identical -
                    # confirm whether that is intended.
                    candidate = RidgeClassifier(alpha=alpha, solver=solver, class_weight=class_weight, positive=True)
                else:
                    candidate = RidgeClassifier(alpha=alpha, solver=solver, class_weight=class_weight)
                candidate.fit(X, y)

                acc = accuracy_score(y, candidate.predict(X))
                print(f'Alpha {alpha:6} Solver {solver:13} Class weight {str(class_weight):10} ACC {acc}')
                if best_model is None or acc > best_acc:
                    best_acc = acc
                    best_params = [alpha, solver, class_weight]
                    best_model = candidate
    print('Best accuracy:', best_acc)
    print('Best parameter:', best_params)
    return best_model


def model_prediction(df, method=None):
    """Train a binary classifier on ``df`` and report its accuracy on that same data.

    Every column of ``df`` except the last is a feature; the last column is
    the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Training table with the label in the last column.
    method : str
        One of:
        'automl'   - AutoGluon TabularPredictor with bagging and stacking;
        'SVC'      - grid search over support vector classifiers;
        'linear'   - exhaustive sweep over RidgeClassifier settings, keeping
                     the candidate with the best training accuracy;
        'boosting' - a RandomForestClassifier with 200 trees (name kept as is);
        'bagging'  - grid search over AdaBoostClassifier (name kept as is).

    Returns
    -------
    The fitted model (or fitted GridSearchCV), exposing ``predict``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.

    Notes
    -----
    The printed accuracy is measured on the training data and is therefore an
    optimistic estimate. The 'automl' configuration requests one GPU.
    """
    supported = ('automl', 'SVC', 'linear', 'boosting', 'bagging')
    if method not in supported:
        raise ValueError(f"Unknown method {method!r}; expected one of {supported}")

    features = df.iloc[:, :-1]
    target = df.iloc[:, -1].to_numpy()

    if method == 'automl':
        # Take the label name from the frame instead of assuming positional names.
        model = TabularPredictor(label=df.columns[-1], problem_type='binary', eval_metric='accuracy')
        model.fit(
            df,
            presets='best_quality',
            hyperparameters=_automl_hyperparameters(),
            auto_stack=True,
            num_bag_folds=5,
            num_stack_levels=4,
            verbosity=2,
            ag_args_fit={'num_gpus': 1}
        )

        print(model.fit_summary())
        print(model.evaluate(df))
        print('Leader Board\n',model.leaderboard(df, silent=True))
        eval_inputs = features
    else:
        X = features.to_numpy()
        eval_inputs = X

        if method == 'SVC':
            param_grid = {
                'C': [0.001, 0.01, 0.1, 1, 10],
                'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
                'gamma': ['scale', 'auto'],
                'cache_size': [2048],
                'class_weight': [None, 'balanced'],
                'decision_function_shape':['ovo', 'ovr']
            }
            model = _grid_search_classifier(SVC(), param_grid, X, target)

        elif method == 'linear':
            model = _ridge_sweep(X, target)

        elif method == 'boosting':
            model = RandomForestClassifier(n_estimators=200).fit(X, target)

        else:  # 'bagging'
            # Newer scikit-learn releases name the weak-learner parameter
            # 'estimator'; older ones only know 'base_estimator'.
            if 'estimator' in AdaBoostClassifier().get_params():
                base_key = 'estimator'
            else:
                base_key = 'base_estimator'
            param_grid = {
                # Classifiers only: a regressor cannot serve as an AdaBoost weak learner.
                # NOTE(review): 'SAMME.R' presumably needs predict_proba, which
                # RidgeClassifier and the default SVC lack - confirm those
                # combinations are meant to be searched.
                base_key: [RidgeClassifier(), SVC(), RandomForestClassifier(), LogisticRegression()],
                'n_estimators' : [50,100,150,200,250,300,400,500,600,700],
                'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
                'algorithm': ['SAMME', 'SAMME.R']
            }
            model = _grid_search_classifier(AdaBoostClassifier(), param_grid, X, target)

    pred = model.predict(eval_inputs)
    acc = accuracy_score(target, pred)
    print('Accuracy on training dataset:',acc)

    return model

In [523]:
def predict_test_and_save(model, test_df):
    """Predict labels for ``test_df`` and write them to './submission.csv'.

    The file 'sample_submission.csv' in the working directory is used as the
    template: its 'label' column is overwritten with the predictions, row by
    row in order, and the result is saved without the index.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(test_df)``.
    test_df : pandas.DataFrame
        Feature table to predict on.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of template rows.
    """
    # Convert to a plain array so the assignment below is positional. A Series
    # would be aligned on its index and silently produce NaN for any row whose
    # index label is missing from the template.
    y_pred = np.asarray(model.predict(test_df)).astype('int64')
    submission = pd.read_csv("sample_submission.csv")
    if len(y_pred) != len(submission):
        raise ValueError(
            f"Got {len(y_pred)} predictions for {len(submission)} submission rows"
        )
    submission['label'] = y_pred
    submission.to_csv('./submission.csv', index=False)
    print('Prediction Saved')
    return

In [None]:

# Load the raw training and test tables.
df = read_data('train.csv')
df_test = read_data('test.csv')

# Candidate sets of columns to exclude from the feature matrix.
set_a = ['id', 'Age', 'Sex 0M1F', 'CD8+T (cells/ul)','CD4+T (cells/ul)', 'Mono CD64+MFI (cells/ul)' ]
set_b = ['id', 'Age', 'Sex 0M1F', 'CD8+T (cells/ul)','CD4+T (cells/ul)']
set_c = ['id', 'Age', 'Sex 0M1F', 'CD8+T (cells/ul)','CD4+T (cells/ul)', 'Mono CD64+MFI (cells/ul)', 'MO HLADR+ MFI (cells/ul)']

selected = set_a

# Preprocessing settings shared by the train and test passes, so the two
# frames always go through the same transformations.
processing_options = dict(
    hybrid=False,
    drop_cols=selected,
    drop_outlier=0,
    normalize='log',
    fr=3,
    upsampling=False,
    aggregate=False,
)

# Optional inspection step (disabled): data_exploration(df)

# Rows with missing values are dropped from the training frame only; the test
# frame keeps every row.
df = data_processing(df, mode='train', dropna=True, **processing_options)
df_test = data_processing(df_test, mode='test', dropna=False, **processing_options)

# Earlier experiment (disabled): on both frames, sum the renamed columns
# '1', '2' and '3' into a new column '5', drop the three originals and
# renumber the columns.

# Train with AutoGluon and write the submission file.
model_v3 = model_prediction(df, method='automl')

predict_test_and_save(model_v3, df_test)



No path specified. Models will be saved in: "AutogluonModels/ag-20220515_144033\"
Presets specified: ['best_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20220515_144033\"
AutoGluon Version:  0.4.1b20220507
Python Version:     3.8.13
Operating System:   Windows
Train Data Rows:    86
Train Data Columns: 6
Label Column: 6
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    86644.57 MB
	Train Data (Original)  Memory Usage: 0.0 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:

**************************************************
Mode: train
Dropped NA value
saved label 86
Dropped cols ['id', 'Age', 'Sex 0M1F', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)', 'Mono CD64+MFI (cells/ul)']
Normalized with log10.
Append Label.
Index(['MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'NK (cells/ul)', 'CD19+ (cells/ul)',
       'CD45+ (cells/ul)', 'label'],
      dtype='object')
**************************************************
**************************************************
Mode: test
Dropped cols ['id', 'Age', 'Sex 0M1F', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)', 'Mono CD64+MFI (cells/ul)']
Normalized with log10.
Index(['MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'NK (cells/ul)', 'CD19+ (cells/ul)',
       'CD45+ (cells/ul)'],
      dtype='object')
**************************************************


	0.9302	 = Validation score   (accuracy)
	13.87s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
	Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
	0.9302	 = Validation score   (accuracy)
	13.24s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
	0.8953	 = Validation score   (accuracy)
	0.33s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
	0.8953	 = Validation score   (accuracy)
	0.34s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
	Fitting 5 child models (S1F1 - S1F5) | Fitting with ParallelLocalFoldFittingStrategy
