In [1]:
import pandas as pd
import numpy as np
 
# Load the balance-scale data. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(
    'C:\\Users\\user\\Desktop\\balance-scale.data',
    header=None,
    names=['balance', 'var1', 'var2', 'var3', 'var4'],
)

# Preview the first few observations
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [2]:
# Count how many observations fall into each balance class
df.balance.value_counts()

L    288
R    288
B     49
Name: balance, dtype: int64

In [3]:
# Binarise the target: 1 for the balanced class 'B', 0 for everything else.
# The already-encoded value 1 is accepted as well so that this cell is
# idempotent: comparing against 'B' alone would turn every label into 0 if
# the cell were executed a second time on the converted column.
df['balance'] = df['balance'].isin(['B', 1]).astype('int64')

# Show the resulting class counts
df['balance'].value_counts()

0    576
1     49
Name: balance, dtype: int64

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# Split the frame into the target column (y) and the var1-var4 features (X)
y = df['balance']
X = df.drop('balance', axis=1)

# Fit a logistic regression on the full, imbalanced data
clf_0 = LogisticRegression()
clf_0.fit(X, y)

# Predict on the same rows the model was trained on
pred_y_0 = clf_0.predict(X)

In [6]:
# Training-set accuracy of the baseline model. Arguments follow the
# (y_true, y_pred) order of accuracy_score, as in the later cells.
print( accuracy_score(y, pred_y_0) )

0.9216


In [7]:
# List the distinct labels the baseline model produced
print(np.unique(pred_y_0))

[0]


In [8]:
# Up-sample: randomly duplicate minority-class samples to strengthen their signal
from sklearn.utils import resample

In [9]:
# Split the rows by class label
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Up-sample the minority class by drawing rows with replacement until it is
# as large as the majority class. The target size is read from the data
# instead of being hard-coded, so the cell stays correct if the class counts
# change.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match the majority class size
                                 random_state=123)            # fixed seed: the same rows are drawn on every run

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.balance.value_counts()

1    576
0    576
Name: balance, dtype: int64

In [10]:
# Split the up-sampled frame into target (y) and features (X)
y = df_upsampled['balance']
X = df_upsampled.drop('balance', axis=1)

# Fit a fresh logistic regression on the balanced data
clf_1 = LogisticRegression()
clf_1.fit(X, y)

# Score the same rows the model was trained on
pred_y_1 = clf_1.predict(X)

# Check whether more than one class is predicted now
print(np.unique(pred_y_1))

# Training-set accuracy
print(accuracy_score(y, pred_y_1))

[0 1]
0.513888888889


In [11]:
# Down-sample: randomly remove majority-class samples
# Split the rows by class label
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Down-sample the majority class without replacement until it is as small
# as the minority class. The target size is read from the data instead of
# being hard-coded, so the cell stays correct if the class counts change.
df_majority_downsampled = resample(df_majority,
                                 replace=False,               # sample without replacement
                                 n_samples=len(df_minority),  # match the minority class size
                                 random_state=123)            # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled.balance.value_counts()

1    49
0    49
Name: balance, dtype: int64

In [12]:
# Split the down-sampled frame into target (y) and features (X)
y = df_downsampled['balance']
X = df_downsampled.drop('balance', axis=1)

# Fit a fresh logistic regression on the balanced subset
clf_2 = LogisticRegression()
clf_2.fit(X, y)

# Score the same rows the model was trained on
pred_y_2 = clf_2.predict(X)

# Check that both classes appear among the predictions
print(np.unique(pred_y_2))

# Training-set accuracy (0.581632653061 in the recorded run)
print(accuracy_score(y, pred_y_2))


[0 1]
0.581632653061


In [13]:
from sklearn.metrics import roc_auc_score

In [17]:
# ROC AUC needs scores rather than hard labels, so ask the model for class
# probabilities with predict_proba() and keep only the second column, i.e.
# the probability of the positive class.
prob_y_2 = list(clf_2.predict_proba(X)[:, 1])

# Peek at the first five probabilities
prob_y_2[:5]


[0.45419197226479618,
 0.48205962213283882,
 0.46862327066392495,
 0.47868378832689129,
 0.58143856820159712]

In [18]:
# Area under the ROC curve for the down-sampled model, on its training rows
print(roc_auc_score(y_true=y, y_score=prob_y_2))

0.568096626406


In [19]:
# Penalize Algorithms
from sklearn.svm import SVC

In [20]:
# Separate input features (X) and target variable (y) from the original,
# imbalanced frame
y = df.balance
X = df.drop('balance', axis=1)

# Train a class-weighted linear SVM.
# probability=True fits the probability estimates on internally shuffled
# data, so a fixed random_state (same seed as the resampling cells) is needed
# for the AUROC below to be reproducible across runs.
clf_3 = SVC(kernel='linear',          # kernel type used by the algorithm
            class_weight='balanced',  # weight classes inversely to their frequency
            probability=True,         # required for predict_proba()
            random_state=123)

clf_3.fit(X, y)

# Predict on training set
pred_y_3 = clf_3.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_3 ) )

# How's our accuracy?
print( accuracy_score(y, pred_y_3) )

# What about AUROC?
# NOTE(review): the scikit-learn docs warn that SVC.predict_proba may be
# inconsistent with predict; decision_function(X) is an alternative score
# for roc_auc_score if that matters here.
prob_y_3 = clf_3.predict_proba(X)
prob_y_3 = [p[1] for p in prob_y_3]  # keep only the positive class
print( roc_auc_score(y, prob_y_3) )

[0 1]
0.688
0.4694763322


In [111]:
from sklearn.ensemble import RandomForestClassifier

In [112]:
# Separate input features (X) and target variable (y) from the original,
# imbalanced frame
y = df.balance
X = df.drop('balance', axis=1)

# Train model. A random forest is randomised (bootstrap samples, feature
# sub-sampling), so the seed is fixed to make the metrics below reproducible
# when the notebook is re-run.
clf_4 = RandomForestClassifier(random_state=123)
clf_4.fit(X, y)

# Predict on training set (the same rows the model was fitted on, so the
# scores below are training scores, not held-out estimates)
pred_y_4 = clf_4.predict(X)

# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )

# How's our accuracy?
print( accuracy_score(y, pred_y_4) )

# What about AUROC?
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 = [p[1] for p in prob_y_4]  # keep only the positive class
print( roc_auc_score(y, prob_y_4) )

[0 1]
0.9712
0.998635912698
