In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu May  5 12:26:31 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd drive/MyDrive/5001kaggle/

/content/drive/MyDrive/5001kaggle


# 1.Data Processing

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report,roc_auc_score,accuracy_score

## 1.1 Read Data

In [5]:
train = pd.read_csv('train.csv')
train.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul),label
0,0,3556.0,2489.0,265.19,77.53,176.55,0.0,4.2,307.91,52,0,7515.0,1
1,1,1906.0,134.0,1442.61,551.9,876.07,112.1,168.15,1735.48,20,1,1756.0,0
2,2,1586.0,71.0,1332.74,684.2,655.26,244.95,216.52,1820.04,28,1,1311.0,0
3,3,683.0,94.0,419.23,255.8,162.17,72.05,44.68,538.22,55,1,1443.0,0
4,4,1032.0,71.0,1102.72,480.27,625.3,188.78,130.77,1427.97,28,1,1542.0,0


In [6]:
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,id,MO HLADR+ MFI (cells/ul),Neu CD64+MFI (cells/ul),CD3+T (cells/ul),CD8+T (cells/ul),CD4+T (cells/ul),NK (cells/ul),CD19+ (cells/ul),CD45+ (cells/ul),Age,Sex 0M1F,Mono CD64+MFI (cells/ul)
0,0,2843.0,156.0,1358.52,730.78,637.85,127.06,94.82,1588.62,45,1,3256.0
1,1,437.0,137.0,509.43,268.05,243.07,390.86,98.24,1002.76,51,1,491.0
2,2,826.0,82.0,1232.22,493.42,744.08,516.28,320.15,2200.58,32,0,1381.0
3,3,861.0,50.0,1512.86,925.51,590.07,380.25,25.8,1929.1,50,0,1377.0
4,4,1160.0,157.0,890.42,403.91,489.53,266.92,87.63,1251.52,43,0,1844.0


## 1.2 Deal with Missing Data

In [7]:
train.isnull().sum()

id                          0
MO HLADR+ MFI (cells/ul)    1
Neu CD64+MFI (cells/ul)     1
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    1
label                       0
dtype: int64

In [8]:
#Since the number of missing values in "train" dataset is only one, I just delete it.
train = train.dropna(how='any', axis = 0)

In [9]:
test.isnull().sum()

id                          0
MO HLADR+ MFI (cells/ul)    0
Neu CD64+MFI (cells/ul)     0
CD3+T (cells/ul)            0
CD8+T (cells/ul)            0
CD4+T (cells/ul)            0
NK (cells/ul)               0
CD19+ (cells/ul)            0
CD45+ (cells/ul)            0
Age                         0
Sex 0M1F                    0
Mono CD64+MFI (cells/ul)    0
dtype: int64

## 1.3 Deal with Imbalance of Train Data

In [10]:
y=train['label']
Counter(y)
#imbalance

Counter({0: 57, 1: 29})

In [11]:
smo = SMOTE(random_state=100)
X=train.drop('id', axis=1)
X=X.drop('label', axis=1)
Y=train['label']
train_X, train_Y = smo.fit_resample(X, Y)
print(Counter(train_Y))
#balance
test_X=test.drop('id', axis=1)

Counter({1: 57, 0: 57})


## 1.4 Data Normalization

In [12]:
preprocess = StandardScaler()
columns = ['MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)',
       'NK (cells/ul)', 'CD19+ (cells/ul)', 'CD45+ (cells/ul)', 'Mono CD64+MFI (cells/ul)']
train_X = preprocess.fit_transform(train_X[columns])
test_X=preprocess.fit_transform(test[columns])


In [13]:
train_X=pd.DataFrame(train_X)
test_X=pd.DataFrame(test_X)

# 2. Modeling

## 2.1 Split the train_data into train and valid parts

In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(train_X, train_Y, random_state=0, test_size=0.2)

## 2.2 Logistic Regression

In [15]:
classfier = LogisticRegression()
classfier.fit(X_train, Y_train)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression(multi_class='warn', solver='warn')

In [16]:
y_pred = classfier.predict(X_test)
y_pred

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
       0])

In [17]:
#present confusion_matrix
print('confusion_matrix')
print(confusion_matrix(Y_test,y_pred))

#present classification_report
print('classification_report')
print(classification_report(Y_test,y_pred))

#present accuracy and AUC
print('Accuracy:%f'%(accuracy_score(Y_test,y_pred)))
print('Area under the curve:%f'%(roc_auc_score(Y_test,y_pred)))

confusion_matrix
[[ 9  1]
 [ 2 11]]
classification_report
              precision    recall  f1-score   support

           0       0.82      0.90      0.86        10
           1       0.92      0.85      0.88        13

    accuracy                           0.87        23
   macro avg       0.87      0.87      0.87        23
weighted avg       0.87      0.87      0.87        23

Accuracy:0.869565
Area under the curve:0.873077


## 2.3 SVM

In [19]:
classifier1=svm.SVC(C=2,kernel='rbf',gamma=10,decision_function_shape='ovr') # ovr:一对多策略
classifier1.fit(X_train, Y_train.ravel())

SVC(C=2, gamma=10)

In [20]:
y_pred = classifier1.predict(X_test)
y_pred

array([1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

In [21]:
#present confusion_matrix
print('confusion_matrix')
print(confusion_matrix(Y_test,y_pred))

#present classification_report
print('classification_report')
print(classification_report(Y_test,y_pred))

#present accuracy and AUC
print('Accuracy:%f'%(accuracy_score(Y_test,y_pred)))
print('Area under the curve:%f'%(roc_auc_score(Y_test,y_pred)))

confusion_matrix
[[10  0]
 [ 9  4]]
classification_report
              precision    recall  f1-score   support

           0       0.53      1.00      0.69        10
           1       1.00      0.31      0.47        13

    accuracy                           0.61        23
   macro avg       0.76      0.65      0.58        23
weighted avg       0.79      0.61      0.57        23

Accuracy:0.608696
Area under the curve:0.653846


## 2.4 Grid Search + Random Forest

In [25]:
params = [{'n_estimators': [5,10,15,20,25,30,40,50,100], 'max_depth': [3,4,5,6,8,10,15,20]}]
rf_model = GridSearchCV( RandomForestClassifier(), params, cv=LeaveOneOut(),scoring='accuracy',n_jobs=-1)
rf_model.fit(train_X, train_Y) 
print("The best values of parameters：:", rf_model.best_params_)
print("Accuracy: ", rf_model.best_score_)

The best values of parameters：: {'max_depth': 5, 'n_estimators': 10}
Accuracy:%f 0.9210526315789473


# Predict

It is obvious that 'Grid Search + Random Forest' has the highest accuracy, thus we use it to predict the test_dataset.

In [26]:
Y_pred=rf_model.predict(test_X)
Y_pred=pd.DataFrame(Y_pred)


In [27]:
submission=pd.concat([test['id'],Y_pred],axis=1)
submission.columns=['id','label']
submission

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [50]:
submission.to_csv('submission.csv', index = False)

## No Data Augment and No normalization

In [28]:
X1=train.drop('id', axis=1)
X1=X1.drop('label', axis=1)
Y1=train['label']
test_X1=test.drop('id', axis=1)
params = [{'n_estimators': [5,10,15,20,25,30,40,50,100], 'max_depth': [3,4,5,6,8,10,15,20]}]
rf_model1 = GridSearchCV( RandomForestClassifier(), params, cv=LeaveOneOut(),scoring='accuracy',n_jobs=-1)
rf_model1.fit(X1, Y1) 
print("The best values of parameters：:", rf_model1.best_params_)
print("Accuracy: ", rf_model1.best_score_)


The best values of parameters：: {'max_depth': 8, 'n_estimators': 25}
Accuracy:  0.9534883720930233


In [29]:
Y_pred1=rf_model1.predict(test_X1)
Y_pred1=pd.DataFrame(Y_pred1)
submission1=pd.concat([test['id'],Y_pred1],axis=1)
submission1.columns=['id','label']
submission1

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [35]:
submission1.to_csv('submission.csv', index = False)

## No Data Augment

In [31]:
X2=train.drop('id', axis=1)
X2=X2.drop('label', axis=1)
Y2=train['label']
test_X2=test.drop('id', axis=1)
preprocess = StandardScaler()
columns = ['MO HLADR+ MFI (cells/ul)', 'Neu CD64+MFI (cells/ul)',
       'CD3+T (cells/ul)', 'CD8+T (cells/ul)', 'CD4+T (cells/ul)',
       'NK (cells/ul)', 'CD19+ (cells/ul)', 'CD45+ (cells/ul)', 'Mono CD64+MFI (cells/ul)']
train_X2 = preprocess.fit_transform(X2[columns])
test_X2=preprocess.fit_transform(test[columns])

params = [{'n_estimators': [5,10,15,20,25,30,40,50,100], 'max_depth': [3,4,5,6,8,10,15,20]}]
rf_model2 = GridSearchCV( RandomForestClassifier(), params, cv=LeaveOneOut(),scoring='accuracy',n_jobs=-1)
rf_model2.fit(train_X2, Y2) 
print("The best values of parameters：:", rf_model2.best_params_)
print("Accuracy: ", rf_model2.best_score_)


The best values of parameters：: {'max_depth': 3, 'n_estimators': 100}
Accuracy:  0.9302325581395349


In [34]:
Y_pred2=rf_model2.predict(test_X2)
Y_pred2=pd.DataFrame(Y_pred2)
submission2=pd.concat([test['id'],Y_pred2],axis=1)
submission2.columns=['id','label']
submission2

Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,0


In [None]:
submission2.to_csv('submission.csv', index = False)