In [74]:
import numpy as np
import pandas as pd
import pandas_profiling
import numpy as np

## 1. Reading training data

In [81]:
# Load the training set. header=0 discards the file's own header row so the
# explicit column names below are used instead.
train_path = "../input/msbd5001data/msbd5001-spring-2022/train.csv"
column_name = [
    "HLADR", "NeuCD64", "CD3", "CD8", "CD4", "NK",
    "CD19", "CD45", "Age", "Sex", "MonoCD64",
    "Label",
]
trainDF = pd.read_csv(filepath_or_buffer=train_path, names=column_name, header=0)

In [82]:
# Confirm the column names that were assigned when the CSV was read.
print(trainDF.columns)

In [83]:
# Preview the first five training rows.
trainDF.head(n=5)

## 2. Checking for missing values and replacing them with the mean

In [84]:
import missingno as msno
# Draw missingno's matrix plot of trainDF to inspect where values are missing.
msno.matrix(trainDF) 

In [87]:
# Fill the missing values of these three columns with their column mean.
# The result is assigned back to the column rather than calling an inplace
# method on the `trainDF[col]` selection: under pandas Copy-on-Write that
# selection is a temporary object, so an inplace fill on it never reaches
# trainDF (and recent pandas versions warn about exactly this pattern).
for col in ['HLADR', 'NeuCD64', 'MonoCD64']:
    trainDF[col] = trainDF[col].fillna(trainDF[col].mean())

In [88]:
# Count the remaining missing values per column.
trainDF.isna().sum()

In [89]:
# Summary statistics, transposed so that each column of trainDF becomes a row.
trainDF.describe().transpose()

## 3.Investigating distribution of label

In [90]:
import matplotlib.pyplot as plt
import seaborn as sns

# Class balance shown two ways: absolute counts (left) and percentage share (right).
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
count_ax, pie_ax = axs

sns.countplot(data=trainDF, x='Label', ax=count_ax)
count_ax.set_title("Frequency of each Class")

label_counts = trainDF['Label'].value_counts()
label_counts.plot(x=None, y=None, kind='pie', ax=pie_ax, autopct='%1.2f%%')
pie_ax.set_title("Percentage of each Class")

plt.show()

## 4.Investigating features (distribution under each label)

In [91]:
import matplotlib.gridspec as gridspec

# Compare every feature's distribution between the two classes.
# The first 11 columns are the features; the remaining column is the label.
v_feat = trainDF.iloc[:, 0:11].columns
plt.figure(figsize=(16, 11 * 4))
gs = gridspec.GridSpec(11, 1)
for i, cn in enumerate(v_feat):
    ax = plt.subplot(gs[i])
    # histplot(kde=True, stat="density") is the replacement for the deprecated
    # distplot: a density-normalised histogram with a KDE curve on top.
    for label_value in (1, 0):
        sns.histplot(
            trainDF.loc[trainDF["Label"] == label_value, cn],
            bins=50,
            kde=True,
            stat="density",
            alpha=0.4,  # keep both overlaid histograms visible
            label="Label = " + str(label_value),
            ax=ax,
        )
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
    # Without a legend the two overlaid classes cannot be told apart.
    ax.legend()

## 5.Drop 2 meaningless features

In [92]:
# Remove the Age and Sex columns from the training frame.
droplist = ['Age', 'Sex']
trainDF2 = trainDF.drop(columns=droplist)
trainDF2.shape  # inspect the dimensions of the data

In [93]:
# Preview the first five rows after dropping columns.
trainDF2.head(n=5)

In [94]:
# Separate the feature matrix from the target: every column except 'Label'
# is a feature.
x_feature = trainDF2.columns.tolist()
x_feature.remove('Label')
x_val = trainDF2.loc[:, x_feature]
y_val = trainDF2.loc[:, 'Label']

## 6. Building models (3 for the voting strategy)

In [95]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest reproducible: without it every run
# fits a different model, so the feature importances and the final vote
# change between runs.
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(x_val, y_val)

In [96]:
# Print each feature name together with the importance the fitted forest
# assigned to it, one (name, importance) tuple per line.
names = trainDF[x_feature].columns
for feature in zip(names, clf_rf.feature_importances_):
    print(feature)

In [97]:
from sklearn.linear_model import LogisticRegression
# Second voter: logistic regression using the liblinear solver.
clf_lr = LogisticRegression(solver='liblinear')
clf_lr.fit(X=x_val, y=y_val)

In [112]:
from sklearn.tree import DecisionTreeClassifier
# Third voter: a single decision tree. The fixed random_state keeps the
# fitted tree, and therefore its vote, identical from run to run.
clf_dt = DecisionTreeClassifier(random_state=42)
clf_dt.fit(x_val, y_val)

## 7.Reading test data

In [98]:
# Load the test set with the same column names as the training data, minus
# the label column.
test_path = '../input/msbd5001data/msbd5001-spring-2022/test.csv'
column_name = [
    "HLADR", "NeuCD64", "CD3", "CD8", "CD4", "NK",
    "CD19", "CD45", "Age", "Sex", "MonoCD64",
]
testDF = pd.read_csv(filepath_or_buffer=test_path, names=column_name, header=0)

In [99]:
# Preview the first five test rows.
testDF.head(n=5)

In [100]:
# Prepare the test features the same way as the training features: drop Age
# and Sex, then fill any missing value with the corresponding training-set
# column mean. If the test set has no missing values the fill is a no-op;
# if it does, this keeps NaNs from reaching the classifiers' predict calls.
droplist = ['Age', 'Sex']
testDF2 = testDF.drop(droplist, axis=1).fillna(x_val.mean())
testDF2.shape  # inspect the dimensions of the data

## 8. Predicting

In [104]:
# Random-forest predictions for the test features; the bare name on the last
# line shows them as the cell output.
y_pred_rf = clf_rf.predict(X=testDF2)
y_pred_rf

In [105]:
# Logistic-regression predictions for the test features; the bare name on the
# last line shows them as the cell output.
y_pred_lr = clf_lr.predict(X=testDF2)
y_pred_lr

In [113]:
# Decision-tree predictions for the test features; the bare name on the last
# line shows them as the cell output.
y_pred_dt = clf_dt.predict(X=testDF2)
y_pred_dt

In [119]:
def vote(rf,lr,dt):
    """Combine three aligned prediction sequences by majority vote.

    Args:
        rf: Predictions of the first classifier, one numeric label per sample.
        lr: Predictions of the second classifier, aligned with ``rf``.
        dt: Predictions of the third classifier, aligned with ``rf``.

    Returns:
        A list with one entry per position: 1 where the three predictions add
        up to at least 2, otherwise 0. Iteration stops at the shortest input.
    """
    return [1 if (r + l + d) >= 2 else 0 for r, l, d in zip(rf, lr, dt)]

In [121]:
# Majority vote across the three classifiers' test predictions.
result = vote(rf=y_pred_rf, lr=y_pred_lr, dt=y_pred_dt)

## 9.Exporting result

In [122]:
# Assemble the submission table: a running 0-based id per prediction plus the
# voted label.
row_ids = range(len(result))
resDF = pd.DataFrame(data={'id': row_ids, 'label': result})

In [123]:
# Write the submission file without the DataFrame index column.
resDF.to_csv(path_or_buf='./pred_vote.csv', index=False)