In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,f1_score

In [2]:
# splits data into x and y
# drops a class from x and y
# scales the dataset
def split(data, test_data, drop):
    X_train = data.copy()
    y_train = X_train.pop('target')
    X_test = test_data.copy()
    y_test = X_test.pop('target')

    X_train = X_train[y_train != drop]
    X_test = X_test[y_test != drop]

    y_train = y_train[y_train != drop]
    y_test = y_test[y_test != drop]

    std = StandardScaler()
    std.fit(X_train)
    X_train = pd.DataFrame(std.transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(std.transform(X_test), columns = X_test.columns, index = X_test.index)
    return X_train, y_train, X_test, y_test, std

In [3]:
# trains on a model and predicts on the test set
def train_test(model, X_train, y_train, X_test, y_test):
    model = model.fit(X_train, y_train)
    y_pred = pd.Series(model.predict(X_test),index=y_test.index)
    return model, y_pred

In [4]:
#read data
data = pd.read_csv('dataset/segment/train.csv', index_col='name')
test_data = pd.read_csv('dataset/segment/test.csv', index_col='name')

In [5]:
%%time
# initiaise models
models = {
    "Random Forest": RandomForestClassifier(
                    random_state=42,
                    max_features='auto',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy',),
    "ANN": MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42),
    "SVM": svm.SVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(
                    random_state = 42,
                    criterion = 'entropy',
                    max_depth = 2,
                    max_features = 'log2',
                    splitter = 'best',),
    "KNN": KNeighborsClassifier(
                    algorithm='auto',
                    leaf_size=30,
                    metric='minkowski',
                    metric_params=None,
                    n_jobs=10,
                    n_neighbors=2,
                    p=3,
                    weights='uniform'),
}
# initialise classes
classes = ['normal', 'fatty', 'cirrhosis']

# for each binary classifier, train single feature models on all models and record their accuracy

for drop in classes:
    results = pd.DataFrame()
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop)
    classes = np.unique(y_train)
    for col in X_train.columns:
        temp = {}
        for name in models.keys():
            model, y_pred = train_test(models[name], X_train[col].values.reshape(-1, 1),
                                       y_train, X_test[col].values.reshape(-1, 1), y_test)
            temp[f"{name} Accuracy"] = accuracy_score(y_pred,y_test)
        results = results.append(pd.DataFrame(temp,index=[col]))
    results.to_csv(f"dataset/segment/manual selection/{classes[0]}_{classes[1]}.csv")

Wall time: 22min 38s


## Visualization

In [None]:
files = ['fatty_normal', 'cirrhosis_fatty', 'cirrhosis_normal']
for name in files:
    data = pd.read_csv(f'dataset/segment/manual selection/{name}.csv', index_col = 0)
    for col in data.columns:
        model_accuracy = data[col].copy()
        model_accuracy.sort_values(ascending=False, inplace=True)
        model_accuracy.plot(title=f'{name}\n{col}',xlabel='Features',ylabel="Accuracy",kind="bar", legend=False, figsize=(25,3), fontsize=13, grid=True)
        plt.show()
    print('\n\n___________________________________________________________________________________________________________________\n\n')

## Selection

In [23]:
files = ['fatty_normal', 'cirrhosis_fatty', 'cirrhosis_normal']
features_acc={}
for name in files:
    features_acc[name] = pd.read_csv(f'dataset/segment/manual selection/{name}.csv', index_col = 0)

In [24]:
data = pd.read_csv('dataset/segment/train.csv', index_col='name')
test_data = pd.read_csv('dataset/segment/test.csv', index_col='name')

In [25]:
%%time
models = {
    "Random Forest": RandomForestClassifier(
                    random_state=42,
                    max_features='auto',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy',),
    "ANN": MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42),
    "SVM": svm.SVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(
                    random_state = 42,
                    criterion = 'entropy',
                    max_depth = 2,
                    max_features = 'log2',
                    splitter = 'best',),
#     "KNN": KNeighborsClassifier(
#                     algorithm='auto',
#                     leaf_size=30,
#                     metric='minkowski',
#                     metric_params=None,
#                     n_jobs=10,
#                     n_neighbors=2,
#                     p=3,
#                     weights='uniform'),
}
classes = ['normal', 'fatty', 'cirrhosis']

# for each binary classifier, train the best scoring n features on each model and record their accuracy
result = {col:pd.DataFrame() for col in features_acc.keys()}
for classifier in features_acc.keys():
    keep = classifier.split('_')
    drop = [cls for cls in classes if cls not in keep]
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop[0])
    print("Classifier: ", classifier)
    for model_acc in features_acc[classifier].columns:
        feat_imp = features_acc[classifier][model_acc].sort_values(ascending=False)
        model_name = model_acc[:len(model_acc) - 9]
        res = pd.DataFrame(columns=[model_name])
        for i in range(2,108):
            cols = feat_imp.index[0:i]
            model, y_pred = train_test(models[model_name], X_train[cols], y_train, X_test[cols], y_test)
            res.loc[i] = {model_name: accuracy_score(y_pred,y_test)}
            print(i)
        print(model_name)
        result[classifier] = pd.concat([result[classifier], res], axis=1)

Classifier:  fatty_normal
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
Random Forest
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
ANN
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
SV

In [26]:
for name in result.keys():
    result[name].to_excel(f"dataset/segment/manual selection/model acc/{name}.xlsx")

In [27]:
for key in features_acc.keys():
    print(name)
    for model_acc in features_acc[key].columns:
        print(model_acc)
        feat_imp = features_acc[key][model_acc].copy()
        feat_imp.sort_values(ascending=False,inplace=True)
        for i,j in zip(feat_imp[0:9],feat_imp.index[0:9]):
            print('\t',j,': ',i)

cirrhosis_normal
Random Forest Accuracy
	 TotalEnergy :  0.770664118
	 Median :  0.760152891
	 10Percentile :  0.760152891
	 contrast_d1_90 :  0.757286192
	 Maximum :  0.756808409
	 90Percentile :  0.756330626
	 Range :  0.756330626
	 contrast_d2_0 :  0.755852843
	 contrast_d3_90 :  0.755852843
ANN Accuracy
	 contrast_d3_135 :  0.756330626
	 contrast_d3_90 :  0.756330626
	 contrast_d2_0 :  0.755852843
	 contrast_d2_90 :  0.755852843
	 Range :  0.755852843
	 contrast_d1_135 :  0.75537506
	 contrast_d2_135 :  0.75537506
	 Maximum :  0.754897277
	 SumSquares :  0.754419494
SVM Accuracy
	 contrast_d2_0 :  0.754419494
	 DifferenceVariance :  0.754419494
	 contrast_d3_0 :  0.75394171
	 ClusterShade :  0.75394171
	 contrast_d1_0 :  0.75394171
	 Variance :  0.75394171
	 homogeneity_d1_0 :  0.75394171
	 10Percentile :  0.753463927
	 90Percentile :  0.753463927
Decision Tree Accuracy
	 Variance :  0.756330626
	 contrast_d1_0 :  0.755852843
	 contrast_d1_45 :  0.755852843
	 contrast_d2_0 :  0.755

# MultiClass Model


In [None]:
data = pd.read_csv('dataset/segment/train.csv', index_col='name')
test_data = pd.read_csv('dataset/segment/test.csv', index_col='name')

In [None]:
X_train = data.copy()
y_train = X_train.pop('target')
X_test = test_data.copy()
y_test = X_test.pop('target')

std = StandardScaler()
std.fit(X_train)
X_train = pd.DataFrame(std.transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(std.transform(X_test), columns = X_test.columns, index = X_test.index)

In [None]:
models = {
    "Random Forest": RandomForestClassifier(
                    random_state=42,
                    max_features='auto',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy',),
    "ANN": MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42),
    "SVM": svm.SVC(random_state=42),
    "Decision Tree": DecisionTreeClassifier(
                    random_state = 42,
                    criterion = 'entropy',
                    max_depth = 2,
                    max_features = 'log2',
                    splitter = 'best',),
    "KNN": KNeighborsClassifier(
                    algorithm='auto',
                    leaf_size=30,
                    metric='minkowski',
                    metric_params=None,
                    n_jobs=10,
                    n_neighbors=2,
                    p=3,
                    weights='uniform'),
}

results = pd.DataFrame()

for col in X_train.columns:
    temp = {}
    for name in models.keys():
        model, y_pred = train_test(models[name], X_train[col].values.reshape(-1, 1),
                                   y_train, X_test[col].values.reshape(-1, 1), y_test)
        temp[f"{name} Accuracy"] = accuracy_score(y_pred,y_test)
    results = results.append(pd.DataFrame(temp,index=[col]))

In [None]:
results.describe()

# Visualisation

In [None]:
for col in results.columns:
    model_accuracy = results[col].copy()
    model_accuracy.sort_values(ascending=False, inplace=True)
    model_accuracy.plot(title=f'{name}\n{col}',xlabel='Features',ylabel="Accuracy",kind="bar", legend=False, figsize=(25,3), fontsize=13, grid=True)
    plt.show()

# Feature Selection

In [None]:
%%time
result = pd.DataFrame()
for model_acc in results.columns:
    feat_imp = results[model_acc].sort_values(ascending=False)
    model_name = model_acc[:len(model_acc) - 9]
    res = pd.DataFrame(columns=[model_name])
    for i in range(2,108):
        cols = feat_imp.index[0:i]
        model, y_pred = train_test(models[model_name], X_train[cols], y_train, X_test[cols], y_test)
        res.loc[i] = {model_name: accuracy_score(y_pred,y_test)}
    print(model_name)
    result = pd.concat([result, res], axis=1)

In [None]:
result.to_excel("dataset/segment/manual selection/multiclass/model acc/multiclass.xlsx")

In [None]:
results.to_excel("dataset/segment/manual selection/multiclass/multiclass.xlsx")