In [1]:
import pandas as pd
from numpy import loadtxt
from numpy import unique
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [2]:
def evaluation(y_test,y_pred):
    confusion = confusion_matrix(y_test, y_pred)

    TP = confusion[1][1]
    FP = confusion[0][1]
    FN = confusion[1][0]
    TN = confusion[0][0]

    sensitivity = TP/(TP + FN) # diantara semua yang sebenarnya positive berapa diantaranya ditebak positive
    negative_predictive_value = TN/(TN+FN)
    specificity = TN/(TN+FP)
    precision = TP/(TP + FP) # diantara semua yang ditebak positif berapa yang benar
    accuracy = (TP + TN)/(TP + TN + FP + FN)


    print("Confusion Matrix:")
    print(confusion)
    print("\nSensitivity (Recall):", sensitivity)
    print("Precision:", precision)
    print("Negative Predictive Value:", negative_predictive_value)
    print("Specificity:", specificity)
    print("Accuracy:", accuracy)

### 1. Model tanpa pembersihan data

In [3]:
dataset = pd.read_csv('oil-spill.csv', header=None)
print(dataset.describe())
df = dataset.values

               0             1            2            3           4   \
count  937.000000    937.000000   937.000000   937.000000  937.000000   
mean    81.588047    332.842049   698.707086   870.992209   84.121665   
std     64.976730   1931.938570   599.965577   522.799325   45.361771   
min      1.000000     10.000000     1.920000     1.000000    0.000000   
25%     31.000000     20.000000    85.270000   444.200000   54.000000   
50%     64.000000     65.000000   704.370000   761.280000   73.000000   
75%    124.000000    132.000000  1223.480000  1260.370000  117.000000   
max    352.000000  32389.000000  1893.080000  2724.570000  180.000000   

                 5           6           7              8           9   ...  \
count  9.370000e+02  937.000000  937.000000     937.000000  937.000000  ...   
mean   7.696964e+05   43.242721    9.127887    3940.712914    0.221003  ...   
std    3.831151e+06   12.718404    3.588878    8167.427625    0.090316  ...   
min    7.031200e+04   21.2

In [4]:
X = df[:,:-1]
y = df[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=1)
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

In [5]:
y_pred = model.predict(X_test)

evaluation(y_test,y_pred)

Confusion Matrix:
[[173   3]
 [  6   6]]

Sensitivity (Recall): 0.5
Precision: 0.6666666666666666
Negative Predictive Value: 0.9664804469273743
Specificity: 0.9829545454545454
Accuracy: 0.9521276595744681


### 2.  Model dengan penghapusan single value – Oil Spill

In [6]:
dataset= pd.read_csv('oil-spill.csv', header=None)
print(dataset.nunique())

0     238
1     297
2     927
3     933
4     179
5     375
6     820
7     618
8     561
9      57
10    577
11     59
12     73
13    107
14     53
15     91
16    893
17    810
18    170
19     53
20     68
21      9
22      1
23     92
24      9
25      8
26      9
27    308
28    447
29    392
30    107
31     42
32      4
33     45
34    141
35    110
36      3
37    758
38      9
39      9
40    388
41    220
42    644
43    649
44    499
45      2
46    937
47    169
48    286
49      2
dtype: int64


In [7]:
counts = dataset.nunique()
# record columns to delete
to_del = [i for i,v in enumerate(counts) if v == 1]
print(to_del)
# drop useless columns
dataset.drop(to_del, axis=1, inplace=True)
display(dataset.head)

[22]


<bound method NDFrame.head of       0      1        2       3    4           5      6      7        8   \
0      1   2558  1506.09  456.63   90   6395000.0  40.88   7.89  29780.0   
1      2  22325    79.11  841.03  180  55812500.0  51.11   1.21  61900.0   
2      3    115  1449.85  608.43   88    287500.0  40.42   7.34   3340.0   
3      4   1201  1562.53  295.65   66   3002500.0  42.40   7.97  18030.0   
4      5    312   950.27  440.86   37    780000.0  41.43   7.03   3350.0   
..   ...    ...      ...     ...  ...         ...    ...    ...      ...   
932  200     12    92.42  364.42  135     97200.0  59.42  10.34    884.0   
933  201     11    98.82  248.64  159     89100.0  59.64  10.18    831.0   
934  202     14    25.14  428.86   24    113400.0  60.14  17.94    847.0   
935  203     10    96.00  451.30   68     81000.0  59.90  15.01    831.0   
936  204     11     7.73  235.73  135     89100.0  61.82  12.24    831.0   

       9   ...       40        41       42       43     4

In [8]:
X = dataset.drop(49, axis=1)
y = dataset[49]
X_train, X_test, y_train, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=1)
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

In [9]:
y_pred_2 = model.predict(X_test)

evaluation(y_test_2,y_pred_2)

Confusion Matrix:
[[174   2]
 [  6   6]]

Sensitivity (Recall): 0.5
Precision: 0.75
Negative Predictive Value: 0.9666666666666667
Specificity: 0.9886363636363636
Accuracy: 0.9574468085106383


In [10]:
print("y_test = ",y_test_2)
print("y_pred = ",y_pred_2)

y_test =  386    0
41     0
726    0
605    0
35     0
      ..
906    0
904    0
191    0
640    0
345    0
Name: 49, Length: 188, dtype: int64
y_pred =  [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


### 3. Model dengan penghapusan few values

In [11]:
data = loadtxt('oil-spill.csv', delimiter=',')
# summarize the number of unique values in each column
for i in range(data.shape[1]):
	num = len(unique(data[:, i]))
	percentage = float(num) / data.shape[0] * 100
	print('%d, %d, %.1f%%' % (i, num, percentage))

0, 238, 25.4%
1, 297, 31.7%
2, 927, 98.9%
3, 933, 99.6%
4, 179, 19.1%
5, 375, 40.0%
6, 820, 87.5%
7, 618, 66.0%
8, 561, 59.9%
9, 57, 6.1%
10, 577, 61.6%
11, 59, 6.3%
12, 73, 7.8%
13, 107, 11.4%
14, 53, 5.7%
15, 91, 9.7%
16, 893, 95.3%
17, 810, 86.4%
18, 170, 18.1%
19, 53, 5.7%
20, 68, 7.3%
21, 9, 1.0%
22, 1, 0.1%
23, 92, 9.8%
24, 9, 1.0%
25, 8, 0.9%
26, 9, 1.0%
27, 308, 32.9%
28, 447, 47.7%
29, 392, 41.8%
30, 107, 11.4%
31, 42, 4.5%
32, 4, 0.4%
33, 45, 4.8%
34, 141, 15.0%
35, 110, 11.7%
36, 3, 0.3%
37, 758, 80.9%
38, 9, 1.0%
39, 9, 1.0%
40, 388, 41.4%
41, 220, 23.5%
42, 644, 68.7%
43, 649, 69.3%
44, 499, 53.3%
45, 2, 0.2%
46, 937, 100.0%
47, 169, 18.0%
48, 286, 30.5%
49, 2, 0.2%


In [12]:
for i in range(data.shape[1]):
	num = len(unique(data[:, i]))
	percentage = float(num) / data.shape[0] * 100
	if percentage < 1:
		print('%d, %d, %.1f%%' % (i, num, percentage))

21, 9, 1.0%
22, 1, 0.1%
24, 9, 1.0%
25, 8, 0.9%
26, 9, 1.0%
32, 4, 0.4%
36, 3, 0.3%
38, 9, 1.0%
39, 9, 1.0%
45, 2, 0.2%
49, 2, 0.2%


In [13]:
df = pd.read_csv('oil-spill.csv', header=None)
print(df.shape)
# get number of unique values for each column
X = df.drop(49, axis=1)
y = df[49]
counts = X.nunique()
# record columns to delete
to_del = [i for i,v in enumerate(counts) if (float(v)/X.shape[0]*100) < 1]
print(to_del)
# drop useless columns
X.drop(to_del, axis=1, inplace=True)
print(X.shape)

(937, 50)
[21, 22, 24, 25, 26, 32, 36, 38, 39, 45]
(937, 39)


In [14]:
X_train, X_test, y_train, y_test_3 = train_test_split(X, y, test_size=0.2, random_state=1)
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

y_pred_3 = model.predict(X_test)

evaluation(y_test_3,y_pred_3)

Confusion Matrix:
[[174   2]
 [ 10   2]]

Sensitivity (Recall): 0.16666666666666666
Precision: 0.5
Negative Predictive Value: 0.9456521739130435
Specificity: 0.9886363636363636
Accuracy: 0.9361702127659575


In [15]:
print("y_test = ",y_test_3)
print("y_pred = ",y_pred_3)

y_test =  386    0
41     0
726    0
605    0
35     0
      ..
906    0
904    0
191    0
640    0
345    0
Name: 49, Length: 188, dtype: int64
y_pred =  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0]


### 4. Model dengan penghapusan low variance

In [16]:
df = pd.read_csv('oil-spill.csv', header=None)
# split data into inputs and outputs
data = df.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)

(937, 49) (937,)


In [17]:
df = pd.read_csv('oil-spill.csv', header=None)
data = df.values
X = data[:, :-1]
y = data[:, -1]
transform = VarianceThreshold(threshold=0.5)
X_sel = transform.fit_transform(X)
print(X_sel.shape)
X_train, X_test, y_train, y_test_4 = train_test_split(X_sel, y, test_size=0.2, random_state=1)
model = LogisticRegression(max_iter=3000)
model.fit(X_train, y_train)

y_pred_4 = model.predict(X_test)

evaluation(y_test_4,y_pred_4)

(937, 31)
Confusion Matrix:
[[174   2]
 [  6   6]]

Sensitivity (Recall): 0.5
Precision: 0.75
Negative Predictive Value: 0.9666666666666667
Specificity: 0.9886363636363636
Accuracy: 0.9574468085106383


In [18]:
print("y_test = ",y_test_4)
print("y_pred = ",y_pred_4)

y_test =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
y_pred =  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.

### 5. Model tanpa pembersihan data

In [19]:
def eval(y_test,y_pred):
    cm = confusion_matrix(y_test,y_pred)
    sens_per_class = []
    prec_per_class = []
    npv_per_class = []
    spec_per_class = []
    TP_total = 0

    for class_index in range(len(cm)):
        TP = cm[class_index, class_index]  # True Positives
        FN = sum(cm[class_index, :]) - TP  # False Negatives
        FP = sum(cm[:, class_index]) - TP  # False Positives
        TN = cm.sum() - (TP + FN + FP)  # True Negatives

        sensitivity = TP/(TP+FN)
        precision = TP/(TP+FP)
        npv = TN / (TN + FN)
        specificity = TN / (TN + FP)

        sens_per_class.append(sensitivity)
        prec_per_class.append(precision)
        npv_per_class.append(npv)
        spec_per_class.append(specificity)

        TP_total += TP 

    n = len(sens_per_class)

    sensitivity = sum(sens_per_class) / n
    precision = sum(prec_per_class) / n
    negative_predictive_value = sum(npv_per_class) / n
    specificity = sum(spec_per_class) / n
    accuracy = TP_total / cm.sum()

    print("Confusion Matrix:")
    print(cm)
    print("\nSensitivity (Recall):", sensitivity)
    print("Precision:", precision)
    print("Negative Predictive Value:", negative_predictive_value)
    print("Specificity:", specificity)
    print("Accuracy:", accuracy)

In [20]:
data = pd.read_csv('iris.csv',header=None)
data.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'iris.csv'

In [None]:
data.nunique()

0    35
1    23
2    43
3    22
4     3
dtype: int64

In [None]:
X = data.drop(4,axis=1)
y = data[4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

model = LogisticRegression(max_iter=3000)
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
eval(y_test,y_pred)

Confusion Matrix:
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]

Sensitivity (Recall): 0.9743589743589745
Precision: 0.9523809523809524
Negative Predictive Value: 0.9814814814814815
Specificity: 0.9861111111111112
Accuracy: 0.9666666666666667


In [None]:
print("y_test = ",y_test)
print("y_pred = ",y_pred)

y_test =  14         Iris-setosa
98     Iris-versicolor
75     Iris-versicolor
16         Iris-setosa
131     Iris-virginica
56     Iris-versicolor
141     Iris-virginica
44         Iris-setosa
29         Iris-setosa
120     Iris-virginica
94     Iris-versicolor
5          Iris-setosa
102     Iris-virginica
51     Iris-versicolor
78     Iris-versicolor
42         Iris-setosa
92     Iris-versicolor
66     Iris-versicolor
31         Iris-setosa
35         Iris-setosa
90     Iris-versicolor
84     Iris-versicolor
77     Iris-versicolor
40         Iris-setosa
125     Iris-virginica
99     Iris-versicolor
33         Iris-setosa
19         Iris-setosa
73     Iris-versicolor
146     Iris-virginica
Name: 4, dtype: object
y_pred =  ['Iris-setosa' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-setosa' 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa'
 'I

### 6. Model dengan penghapusan data terduplikasi

In [None]:
data = pd.read_csv('iris.csv',header=None)
dups = data.duplicated()
print(dups.any())
print(data[dups])
print(data.shape)
data.drop_duplicates(inplace=True)
print(data.shape)

True
       0    1    2    3               4
34   4.9  3.1  1.5  0.1     Iris-setosa
37   4.9  3.1  1.5  0.1     Iris-setosa
142  5.8  2.7  5.1  1.9  Iris-virginica
(150, 5)
(147, 5)


In [None]:
X = data.drop(4,axis=1)
y = data[4]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

model = LogisticRegression(max_iter=3000)
model.fit(X_train,y_train)


In [None]:
y_pred = model.predict(X_test)
eval(y_test,y_pred)

Confusion Matrix:
[[ 9  0  0]
 [ 0 12  1]
 [ 0  1  7]]

Sensitivity (Recall): 0.9326923076923078
Precision: 0.9326923076923078
Negative Predictive Value: 0.9652406417112299
Specificity: 0.9652406417112299
Accuracy: 0.9333333333333333
