In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the balance-scale data. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
df = pd.read_csv(
    'balance-scale.data',
    header=None,
    names=['balance', 'var1', 'var2', 'var3', 'var4'],
)

# Preview the first few observations
df.head()

Unnamed: 0,balance,var1,var2,var3,var4
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [3]:
"""
Meaning of the `balance` labels:
  R - right-heavy: var3 * var4 > var1 * var2
  L - left-heavy:  var3 * var4 < var1 * var2
  B - balanced:    var3 * var4 = var1 * var2
"""
# Number of observations per class
df.balance.value_counts()


L    288
R    288
B     49
Name: balance, dtype: int64

In [4]:
# Transform into binary classification: 1 = balanced ('B'), 0 = left- or right-heavy.
# The column is overwritten in place, so only encode while it still holds the raw
# string labels; without this guard a second run of the cell would find no 'B'
# values and silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(df['balance']):
    # Vectorized comparison instead of a Python-level loop over the rows
    df['balance'] = (df['balance'] == 'B').astype('int64')

df['balance'].value_counts()
# Recorded run: 576 zeros and 49 ones, i.e. about 8% of observations are balanced



0    576
1     49
Name: balance, dtype: int64

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Split the frame into the target column (y) and the remaining feature columns (X)
y = df['balance']
X = df.drop(labels=['balance'], axis='columns')

# Fit a logistic regression with default settings on the original (imbalanced) data
clf_0 = LogisticRegression()
clf_0.fit(X, y)

# Predict on the same rows the model was trained on
pred_y_0 = clf_0.predict(X)


In [10]:
# Should we be excited? First check which classes the baseline model ever predicts.
print( np.unique( pred_y_0 ) )
# Score against df['balance'] (the target clf_0 was fit on) rather than the global
# `y`, which a later cell rebinds to the up-sampled target; re-running this cell
# after that one would otherwise fail on mismatched lengths. Arguments follow
# accuracy_score's (y_true, y_pred) order.
print('Accuracy_score = ' +  str(accuracy_score(df['balance'], pred_y_0)) )
# In the recorded run only class 0 is predicted, so the 92% overall accuracy
# simply mirrors the class imbalance.

[0]
Accuracy_score = 0.9216


In [11]:
"""
Up-sample the minority class so that both classes are equally represented:
1) Separate the observations of each class into their own DataFrame.
2) Resample the minority class with replacement until it has as many rows as the majority class.
3) Combine the up-sampled minority DataFrame with the original majority DataFrame.
"""
from sklearn.utils import resample

df_majority = df[df.balance==0]
df_minority = df[df.balance==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match the majority class (576 rows here) without hard-coding it
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts; both classes should now have the same number of rows
df_upsampled.balance.value_counts()

1    576
0    576
Name: balance, dtype: int64

In [12]:
# Features (X) and target (y) now come from the up-sampled frame
y = df_upsampled['balance']
X = df_upsampled.drop(labels=['balance'], axis='columns')

# Fit a fresh logistic regression with default settings on the balanced data
clf_1 = LogisticRegression()
clf_1.fit(X, y)

# Predict on the same rows used for fitting
pred_y_1 = clf_1.predict(X)

# Does the model still collapse to a single class? (recorded run: [0 1])
print( np.unique( pred_y_1 ) )

# Training-set accuracy on the up-sampled data (recorded run: about 0.51)
print( accuracy_score(y, pred_y_1) )


[0 1]
0.5147569444444444
