In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column names are passed explicitly, so the first line of the file is read as data.
df = pd.read_csv(
    'balance-scale.data',
    names=['balanced', 'left_wt', 'left_dt', 'right_wt', 'right_dt'],
)
df.head()

Unnamed: 0,balanced,left_wt,left_dt,right_wt,right_dt
0,B,1,1,1,1
1,R,1,1,1,2
2,R,1,1,1,3
3,R,1,1,1,4
4,R,1,1,1,5


In [3]:
# Number of rows per class label.
df['balanced'].value_counts()

R    288
L    288
B     49
Name: balanced, dtype: int64

In [4]:
# Binary target: 1 where the label is 'B', 0 for any other label.
# The comparison is vectorized (no per-row Python loop); the int64 cast keeps
# the column dtype identical to what the list-based version produced.
df['target'] = (df['balanced'] == 'B').astype('int64')
df.head()

Unnamed: 0,balanced,left_wt,left_dt,right_wt,right_dt,target
0,B,1,1,1,1,1
1,R,1,1,1,2,0
2,R,1,1,1,3,0
3,R,1,1,1,4,0
4,R,1,1,1,5,0


In [6]:
# Share of each target value, as fractions that sum to 1.
df['target'].value_counts(normalize=True)

0    0.9216
1    0.0784
Name: target, dtype: float64

In [7]:
# The four predictor columns; `target` is the binary label built earlier.
features = ['left_wt', 'left_dt', 'right_wt', 'right_dt']
X = df.loc[:, features]
y = df['target']

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [11]:
# Baseline: fit on the full, unresampled data and predict on the very same rows.
lgr_0 = LogisticRegression().fit(X, y)
pred_y_0 = lgr_0.predict(X)

LogisticRegression()

In [12]:
# Accuracy on the rows the model was fit on (there is no held-out split).
accuracy_score(y_true=y, y_pred=pred_y_0)

0.9216

In [14]:
# Distinct class labels the baseline model actually predicted.
set(np.unique(pred_y_0))

{0}

## Sampling

In [16]:
from sklearn.utils import resample

### Upsampling

In [18]:
# Split rows by target value: 0 is the majority class, 1 the minority.
majority = df.loc[df['target'] == 0]
minority = df.loc[df['target'] == 1]
majority.shape[0], minority.shape[0]

(576, 49)

In [19]:
# Draw minority rows with replacement until there are as many as majority rows.
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=majority.shape[0],
    random_state=123,
)


In [20]:
# Stack the untouched majority rows on top of the resampled minority rows.
upsampled = pd.concat([majority, minority_upsampled], axis=0)

In [21]:
# Confirm both classes now have the same number of rows.
upsampled['target'].value_counts()

1    576
0    576
Name: target, dtype: int64

In [23]:
# Refit logistic regression on the upsampled frame and score it on those same rows.
X, y = upsampled[features], upsampled['target']
lgr_up = LogisticRegression().fit(X, y)
pred_y_up = lgr_up.predict(X)
accuracy_score(y_true=y, y_pred=pred_y_up)

LogisticRegression()

0.5147569444444444

In [24]:
# Distinct class labels predicted after upsampling.
set(np.unique(pred_y_up))

{0, 1}

## Downsampling


In [25]:
# Rebuild the class split from the original frame before downsampling.
majority = df.loc[df['target'] == 0]
minority = df.loc[df['target'] == 1]
majority.shape[0], minority.shape[0]

(576, 49)

In [26]:
# Display the predictor column names reused by the model cells.
features

['left_wt', 'left_dt', 'right_wt', 'right_dt']

In [27]:
# Draw majority rows without replacement, keeping only as many as there are minority rows.
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=minority.shape[0],
    random_state=123,
)

In [29]:
# Combine the reduced majority sample with every minority row.
downsampled = pd.concat([majority_downsampled, minority], axis=0)

In [30]:
# Refit logistic regression on the downsampled frame and score it on those same rows.
X, y = downsampled[features], downsampled['target']
lgr_down = LogisticRegression().fit(X, y)
pred_y_down = lgr_down.predict(X)
accuracy_score(y_true=y, y_pred=pred_y_down)

LogisticRegression()

0.5612244897959183

## Try a different algorithm

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Random forest on the original (unresampled) data.
X = df[features]
y = df['target']
# The forest is randomized (bootstrap rows, random feature subsets); seeding it
# makes the cell reproducible on Restart & Run All. The seed matches the one
# already used by the resampling cells.
rf = RandomForestClassifier(random_state=123)
rf.fit(X, y)
pred_y_rf = rf.predict(X)
# NOTE: this is accuracy on the rows the forest was fit on, not on held-out
# data, so it does not measure how well the model generalizes.
accuracy_score(y, pred_y_rf)

RandomForestClassifier()

1.0