In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,7900,1166,D-penicillamine,16839,F,N,N,N,N,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0,C
7901,7901,1492,Placebo,17031,F,N,Y,N,N,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0,C
7902,7902,1576,D-penicillamine,25873,F,N,N,Y,S,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0,D
7903,7903,3584,D-penicillamine,22960,M,N,Y,N,N,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0,D


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             7905 non-null   int64  
 1   N_Days         7905 non-null   int64  
 2   Drug           7905 non-null   object 
 3   Age            7905 non-null   int64  
 4   Sex            7905 non-null   object 
 5   Ascites        7905 non-null   object 
 6   Hepatomegaly   7905 non-null   object 
 7   Spiders        7905 non-null   object 
 8   Edema          7905 non-null   object 
 9   Bilirubin      7905 non-null   float64
 10  Cholesterol    7905 non-null   float64
 11  Albumin        7905 non-null   float64
 12  Copper         7905 non-null   float64
 13  Alk_Phos       7905 non-null   float64
 14  SGOT           7905 non-null   float64
 15  Tryglicerides  7905 non-null   float64
 16  Platelets      7905 non-null   float64
 17  Prothrombin    7905 non-null   float64
 18  Stage   

In [4]:
X = df.drop('Status', axis=1)
y = df['Status']

# X = df_zscore_filtered.drop('Status', axis=1)
# y = df_zscore_filtered['Status']

In [5]:
encoder = LabelEncoder()

for col in X.columns:
    if X[col].dtype == 'object':
        print(col, X[col].unique())
        X[col] = encoder.fit_transform(X[col])

for col in test.columns:
    if test[col].dtype == 'object':
        test[col] = encoder.fit_transform(test[col])

df

Drug ['D-penicillamine' 'Placebo']
Sex ['M' 'F']
Ascites ['N' 'Y']
Hepatomegaly ['N' 'Y']
Spiders ['N' 'Y']
Edema ['N' 'Y' 'S']


Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.80,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.50,58.0,1653.0,71.30,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,7900,1166,D-penicillamine,16839,F,N,N,N,N,0.8,309.0,3.56,38.0,1629.0,79.05,224.0,344.0,9.9,2.0,C
7901,7901,1492,Placebo,17031,F,N,Y,N,N,0.9,260.0,3.43,62.0,1440.0,142.00,78.0,277.0,10.0,4.0,C
7902,7902,1576,D-penicillamine,25873,F,N,N,Y,S,2.0,225.0,3.19,51.0,933.0,69.75,62.0,200.0,12.7,2.0,D
7903,7903,3584,D-penicillamine,22960,M,N,Y,N,N,0.7,248.0,2.75,32.0,1003.0,57.35,118.0,221.0,10.6,4.0,D


In [6]:
y_encoder = LabelEncoder()

y = y_encoder.fit_transform(y)

In [7]:
z_scores = np.abs(zscore(X))
threshold = 3
mask = (z_scores < threshold).all(axis=1)
X = X[mask]
y = y[mask]
X.shape, y.shape

((6148, 19), (6148,))

In [8]:
test_id = test['id']
test_id

0        7905
1        7906
2        7907
3        7908
4        7909
        ...  
5266    13171
5267    13172
5268    13173
5269    13174
5270    13175
Name: id, Length: 5271, dtype: int64

In [9]:
scaler = StandardScaler()

df_data = scaler.fit_transform(X)
df = pd.DataFrame(
    data=df_data,
    columns=df.drop('Status', axis=1).columns
)

test_data = scaler.transform(test)
test = pd.DataFrame(
    data=test_data,
    columns=test.columns
)

df = df.drop('id', axis=1)
test = test.drop('id', axis=1)

X = df.to_numpy()
test = test.to_numpy()

df

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0.381118,0.964834,0.324878,0.0,0.0,-0.898428,-0.502896,-0.206310,-0.408901,0.379173,-0.172059,-0.079479,-0.055410,0.650522,-0.493219,1.127253,0.782241,0.057895
1,0.383043,0.964834,0.107084,0.0,0.0,-0.898428,-0.502896,-0.206310,-0.566646,-0.585683,-0.304601,-0.182125,0.113212,-0.878359,-0.296084,0.005028,0.318478,0.057895
2,-1.337671,0.964834,-0.398018,0.0,0.0,1.113056,-0.502896,-0.206310,-0.303738,0.218363,0.192432,-0.079479,-0.260449,0.426783,-0.296084,0.358773,0.163891,1.217298
3,-1.419472,-1.036448,0.334128,0.0,0.0,1.113056,-0.502896,-0.206310,-0.566646,-0.844765,-0.437143,-0.674828,3.915696,-1.135057,-0.986058,-0.678066,1.555178,0.057895
4,-0.844939,0.964834,-0.105103,0.0,0.0,-0.898428,-0.502896,-0.206310,-0.356320,0.057554,-0.801634,-0.490064,0.132212,0.725102,-0.443935,0.273386,-1.072809,0.057895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6143,-0.736191,0.964834,1.834297,0.0,0.0,-0.898428,-0.502896,-0.206310,-0.198575,-0.532080,0.457516,-0.038421,0.423542,-0.430882,-0.739639,-1.068405,0.782241,0.057895
6144,-0.973896,-1.036448,-0.347283,0.0,0.0,-0.898428,-0.502896,-0.206310,-0.461483,-0.112189,-0.105788,-0.592711,0.094213,-0.691910,2.858083,0.919885,-0.918221,-1.101509
6145,-0.660164,0.964834,-0.293465,0.0,0.0,1.113056,-0.502896,-0.206310,-0.408901,-0.549948,-0.536550,-0.100008,-0.055410,0.822536,-0.739639,0.102612,-0.763634,1.217298
6146,-0.579325,-1.036448,2.184953,0.0,0.0,-0.898428,1.988484,4.847063,0.169496,-0.862633,-1.331803,-0.325830,-0.456780,-0.915649,-1.133909,-0.836641,3.410228,-1.101509


In [10]:
X

array([[ 0.38111847,  0.96483394,  0.32487787, ...,  1.12725287,
         0.78224072,  0.05789473],
       [ 0.3830432 ,  0.96483394,  0.10708422, ...,  0.00502765,
         0.31847834,  0.05789473],
       [-1.33767112,  0.96483394, -0.39801766, ...,  0.35877256,
         0.16389087,  1.21729799],
       ...,
       [-0.66016392,  0.96483394, -0.2934655 , ...,  0.10261245,
        -0.76363391,  1.21729799],
       [-0.57932499, -1.03644779,  2.18495337, ..., -0.83664127,
         3.4102276 , -1.10150853],
       [-0.19245298, -1.03644779,  0.32487787, ...,  0.82230037,
        -0.29987152,  0.05789473]])

In [11]:
y

array([0, 0, 0, ..., 0, 2, 0])

In [12]:
param_grid = {
    'C': [0.01, 0.05, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10],
    'decision_function_shape': ['ovo', 'ovr'],
}

In [13]:
estimator = SVC(
    kernel='rbf',
    verbose=1,
    random_state=42,
    probability=True,
)

In [14]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

In [15]:
grid_search.fit(X, y)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[LibSVM]

In [16]:
print("\nBest parameters:")
print(grid_search.best_params_)


Best parameters:
{'C': 1, 'decision_function_shape': 'ovo', 'gamma': 0.1}


In [17]:
print("\nBest cross-validation accuracy:")
print(grid_search.best_score_)


Best cross-validation accuracy:
0.8144118756077715


In [18]:
model = grid_search.best_estimator_

In [19]:
probs = model.predict_proba(test)
probs

array([[0.82275494, 0.0134928 , 0.16375225],
       [0.80741264, 0.10163282, 0.09095454],
       [0.43283582, 0.05042755, 0.51673662],
       ...,
       [0.90822485, 0.0143044 , 0.07747075],
       [0.89812565, 0.05395681, 0.04791754],
       [0.32239697, 0.0426562 , 0.63494683]])

In [20]:
y_pred = model.predict(X)
y_pred

array([0, 0, 0, ..., 0, 2, 0])

In [21]:
accuracy_score(y_pred=y_pred, y_true=y)

0.8651594014313598

In [22]:
y_encoder.classes_

array(['C', 'CL', 'D'], dtype=object)

In [23]:
probs_df = pd.DataFrame(probs, columns=[f'Status_{i}' for i in y_encoder.classes_])
probs_df

Unnamed: 0,Status_C,Status_CL,Status_D
0,0.822755,0.013493,0.163752
1,0.807413,0.101633,0.090955
2,0.432836,0.050428,0.516737
3,0.887527,0.015998,0.096475
4,0.792172,0.019507,0.188320
...,...,...,...
5266,0.876960,0.042353,0.080686
5267,0.918649,0.012546,0.068804
5268,0.908225,0.014304,0.077471
5269,0.898126,0.053957,0.047918


In [24]:
probs_df.insert(0, 'id', test_id.values)
probs_df

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.822755,0.013493,0.163752
1,7906,0.807413,0.101633,0.090955
2,7907,0.432836,0.050428,0.516737
3,7908,0.887527,0.015998,0.096475
4,7909,0.792172,0.019507,0.188320
...,...,...,...,...
5266,13171,0.876960,0.042353,0.080686
5267,13172,0.918649,0.012546,0.068804
5268,13173,0.908225,0.014304,0.077471
5269,13174,0.898126,0.053957,0.047918


In [25]:
probs_df.to_csv('predicted_probabilities.csv', index=False)

In [26]:
# 0.8354206198608476 -> No zscore
# 0.845640858815875 -> Zscore