In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
!pip install numpy pandas matplotlib scikit-learn

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm
from sklearn.impute import SimpleImputer

In [None]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
df = df.drop("customerID", axis=1)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dtypes
df.head()

In [None]:
nan_v = [i for i in range(len(df.TotalCharges)) if np.isnan(df.TotalCharges[i])]
for i in nan_v:
    df.at[i, 'TotalCharges']=  0

df.loc[nan_v]

In [None]:
df.head()
df.isna().sum()
df.nunique()
df.dtypes
# df.columns
# col_list = [df_col_list[i] for i in range(len(df_col_list)) if uniq_list[i] > 2 and uniq_list[i] < 5]
# tuple(zip(list(df.columns), list(df.nunique())))
df.describe()

In [None]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X.head()
y.head()

In [None]:
# X = X.values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [3]:
def convert(x):
    if x > 0:
        return 1
    return -1

In [None]:
y = np.array([convert(yi) for yi in y])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
def binarize(df, col_list):
    for col in col_list:
        uniq = df[col].unique()
        
        df[col] = df[col].apply(lambda x: 0 if x == uniq[0] else 1)

In [4]:
def normalize(df, col_list):
    scaler = MinMaxScaler()
#     scaler = StandardScaler()
    df.loc[:, col_list] = scaler.fit_transform(df.loc[:, col_list])

In [5]:
def transform_data(df, bin_columns, onehot_columns, value_columns):
    binarize(df, bin_columns)
    df = pd.get_dummies(df, columns=onehot_columns)
    normalize(df, value_columns)
    return df

In [None]:
bin_columns = [col for col in X if X[col].nunique() == 2]
onehot_columns = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract','PaymentMethod']
value_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']

X_train = transform_data(X_train, bin_columns, onehot_columns, value_columns)
X_test = transform_data(X_test, bin_columns, onehot_columns, value_columns)

In [None]:
X_train = X_train.values
X_test = X_test.values
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [None]:
def logloss(y, y_hat):
    return -np.mean(y * np.log(y_hat) + (1-y) * np.log(1-y_hat))

In [4]:
def meanloss(y, y_hat):
    return np.mean((y - y_hat) ** 2)

In [5]:
def h(X, w):
    return np.tanh(X @ w)
#     return 1.0/(1 + np.exp(-X @ w))

In [None]:
def grad(X, y, y_hat, w):
#     m = X.shape[0]
#     eps = 0.00001
#     y_div = np.copy(y_hat)
#     y_div[y_hat == 0] = eps
#     dw = (1/m) * ( X.T @ ((1 / y_div) * (y - y_hat) * (1 + y_hat)) )
    
#     return dw

    # X --> Input.
    # y --> true/target value.
    # y_hat --> hypothesis/predictions.
    # w --> weights (parameter).
    # b --> bias (parameter).
    
    # m-> number of training examples.
    m = X.shape[0]
    
    # Gradient of loss w.r.t weights.
    dw = (1/m) * (X.T @ (y_hat - y))
    
    return dw

In [6]:
def gradient(X, y, y_hat, w):
    m = X.shape[0]
    dw = -(1/m) * ( X.T @ ((y - y_hat) * (1 - y_hat**2)) )
    return dw

In [7]:
def logistic_regression(X, y, bs, epochs, lr):
    # X --> Input.
    # y --> true/target value.
    # bs --> Batch Size.
    # epochs --> Number of iterations.
    # lr --> Learning rate.
    m, n = X.shape
    
    # Initializing weights to zeros.
    w = np.zeros((n+1,1))
#     w = np.full((n+1, 1), 1/m)
    
    X_train = np.concatenate((np.ones((m, 1)), X), axis=1)
    
    # Empty list to store losses.
    losses = []
    
    for epoch in range(epochs):
        for i in range((m-1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            
            Xb = X_train[start_i:end_i]
            yb = y[start_i:end_i]
            
            y_hat = h(Xb, w)
            
            dw = gradient(Xb, yb, y_hat, w)
            
            w -= lr * dw
            
#             print(f"Epoch {epoch}: [{start_i}:{end_i}]: {w.T}")
        
        # Calculating loss and appending it in the list.
        l = meanloss(y, h(X_train, w))
        losses.append(l)
    
    return w, losses

In [None]:
w, losses = logistic_regression(X_train, y_train, X_train.shape[0], 5, 0.3)

In [8]:
def predict(X_test, w, returns_prob=False):
    m, n = X_test.shape
    X_test = np.concatenate((np.ones((m, 1)), X_test), axis=1)
    pred = h(X_test, w)
    
    if returns_prob:
        return pred
    
#     pred = np.squeeze(pred)
    return np.array([convert(pi) for pi in pred.reshape(-1)]).reshape(-1, w.shape[1])

In [9]:
def accuracy(y, y_hat):
    return np.sum(y == y_hat) / len(y)

In [None]:
pred = predict(X_test, w)

np.sum(y_test == pred) / len(y_test)
np.sum(pred == 1)
np.sum(pred == -1)

In [None]:
w
losses

In [10]:
def adaboost(X_train, y_train, K):
    m, n = X_train.shape
    pw = np.array([1/m] * m)
    ws = np.zeros((K, (n+1)))
    z = np.zeros(K)
    
    examples = np.arange(m)

    for k in range(K):
        data = np.random.choice(examples, (m,), p=pw)
        Xk_train = X_train[data, :]
        yk_train = y_train[data, :]
        
        w, losses = logistic_regression(Xk_train, yk_train, 100, 20, 0.1)
        yk_pred = predict(Xk_train, w)
        print(f"k={k}, accuracy={accuracy(yk_train, yk_pred) * 100}%")
        
        ws[k] = np.squeeze(w)
        
        y_hat = predict(Xk_train, w)
        
        mask = np.squeeze((y_train != y_hat))
        error = np.sum(pw[mask])
        
        if error > 0.5:
            continue
            
        mask = ~mask
        pw[mask] = pw[mask] * error / (1-error)
        
        pw = pw / np.sum(pw)
        
        z[k] = np.log((1-error) / error)
    
    return ws, z
#     print(ws.shape)
#     preds = predict(X_train, ws.T)
#     print(preds.shape, z.shape)
#     ahypo = preds @ z.reshape(-1, 1)
#     return ahypo, z

In [11]:
def weighted_majority(X, ws, z):
    preds = predict(X, ws.T)
    y_hat = preds @ z.reshape(-1, 1)
    return np.array([convert(pi) for pi in y_hat.reshape(-1)]).reshape(-1, 1)

In [12]:
def report(y, y_pred):
    acc = accuracy(y, y_pred)
    [[TN, FP], [FN, TP]] = cm(y, y_pred)
    recall = TP / (TP + FN)
    specificity = TN / (TN + FP)
    precision = TP / (TP + FP)
    false_discovery_rate = FP / (FP + TP)
    f1 = 2*TP / (2*TP + FP + FN)
    
    print(f"Accuracy: {acc}")
    print(f"True positive rate (sensitivity, recall, hit rate): {recall}")
    print(f"True negative rate (specificity): {specificity}")
    print(f"Positive predictive value (precision): {precision}")
    print(f"False discovery rate: {false_discovery_rate}")
    print(f"F1 score: {f1}")
    print()

In [None]:
ws, z = adaboost(X_train, y_train, 20)

In [None]:
weighted_majority(X_train, y_train, ws, z)
weighted_majority(X_test, y_test, ws, z)

In [None]:
# Using 1 and -1
y_pred = weighted_majority(X_train, ws, z)
report(y_train, y_pred)

y_pred = weighted_majority(X_test, ws, z)
report(y_test, y_pred)

In [15]:
df_train = pd.read_csv("adult.csv")
df_train.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [None]:
df.shape

In [16]:
df_test = pd.read_csv("adult.test.csv")
df_test.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [17]:
df = pd.concat([df_train, df_test])

In [None]:
df.shape

In [None]:
df["income"].unique()

In [18]:
df["income"].replace([' <=50K', ' >50K', ' <=50K.', ' >50K.'], [-1, 1, -1, 1], inplace=True)

In [20]:
df.isin([' ?']).sum(axis=0)
df["workclass"].value_counts()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

In [19]:
df.replace(" ?", np.nan, inplace=True)
simputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
df.loc[:, df.columns] = pd.DataFrame(simputer.fit_transform(df), columns=df.columns, index=df.index)

In [None]:
# df.isna().sum()
# ddf.isna().sum()
# df.nunique()
# ddf.nunique()
# df["workclass"].value_counts()
# ddf["workclass"].value_counts()
# df.head()

In [20]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X.head()
y.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States


0    -1
1    -1
2    -1
3    -1
4    -1
Name: income, dtype: object

In [None]:
df["sex"].value_counts()
df.nunique()
df.dtypes

In [21]:
X.drop("education-num", axis=1, inplace=True)
X["sex"].replace([' Male', ' Female'], [0, 1], inplace=True)
# onehot_columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
# X = pd.get_dummies(X, columns=onehot_columns)

In [22]:
onehot_columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
X = pd.get_dummies(X, columns=onehot_columns)
X.isna().sum()
X.head()

age                                0
fnlwgt                             0
sex                                0
capital-gain                       0
capital-loss                       0
                                  ..
native-country_ Thailand           0
native-country_ Trinadad&Tobago    0
native-country_ United-States      0
native-country_ Vietnam            0
native-country_ Yugoslavia         0
Length: 103, dtype: int64

Unnamed: 0,age,fnlwgt,sex,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,50,83311,0,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38,215646,0,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,53,234721,0,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,28,338409,1,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,37,284582,1,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [23]:
X_train = X.iloc[:df_train.shape[0]]
y_train = y.iloc[:df_train.shape[0]]
X_test = X.iloc[df_train.shape[0]:]
y_test = y.iloc[df_train.shape[0]:]

In [24]:
value_columns = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
normalize(X_train, value_columns)
normalize(X_test, value_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [25]:
X_test.head()
X_train.head()

Unnamed: 0,age,fnlwgt,sex,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.287671,0.051677,0,0.0,0.0,0.5,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.150685,0.219011,0,0.0,0.0,0.397959,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.369863,0.099418,0,0.076881,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.013699,0.060942,1,0.0,0.0,0.295918,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.232877,0.125398,0,0.0,0.0,0.295918,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


Unnamed: 0,age,fnlwgt,sex,capital-gain,capital-loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native-country_ Portugal,native-country_ Puerto-Rico,native-country_ Scotland,native-country_ South,native-country_ Taiwan,native-country_ Thailand,native-country_ Trinadad&Tobago,native-country_ United-States,native-country_ Vietnam,native-country_ Yugoslavia
0,0.452055,0.048238,0,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.287671,0.138113,0,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0.493151,0.151068,0,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,0.150685,0.221488,1,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0.273973,0.184932,1,0.0,0.0,0.397959,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [26]:
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values.astype(int).reshape(-1, 1)
y_test = y_test.values.astype(int).reshape(-1, 1)

In [None]:
w, losses = logistic_regression(X_train, y_train, X_train.shape[0], 5, 0.3)

In [None]:
pred = predict(X_test, w)

np.sum(y_test == pred) / len(y_test)
np.sum(pred == 1)
np.sum(pred == -1)

In [27]:
ws, z = adaboost(X_train, y_train, 20)

k=0, accuracy=84.81572481572482%
k=1, accuracy=82.2512285012285%
k=2, accuracy=82.07002457002457%
k=3, accuracy=81.44041769041769%
k=4, accuracy=81.55405405405406%
k=5, accuracy=81.50798525798525%
k=6, accuracy=81.1916461916462%
k=7, accuracy=81.6891891891892%
k=8, accuracy=81.237714987715%
k=9, accuracy=81.2899262899263%
k=10, accuracy=81.3943488943489%
k=11, accuracy=81.35135135135135%
k=12, accuracy=81.83353808353809%
k=13, accuracy=81.47420147420148%
k=14, accuracy=81.4004914004914%
k=15, accuracy=81.71068796068795%
k=16, accuracy=81.28071253071253%
k=17, accuracy=81.73525798525799%
k=18, accuracy=81.27149877149877%
k=19, accuracy=82.0485257985258%


In [28]:
# Using 1 and -1
y_pred = weighted_majority(X_train, ws, z)
report(y_train, y_pred)

y_pred = weighted_majority(X_test, ws, z)
report(y_test, y_pred)

Accuracy: 0.8336302211302211
True positive rate (sensitivity, recall, hit rate): 0.7214640989669685
True negative rate (specificity): 0.8692099194951252
Positive predictive value (precision): 0.6363329583802024
False discovery rate: 0.3636670416197975
F1 score: 0.6762297531528301

Accuracy: 0.8313267813267813
True positive rate (sensitivity, recall, hit rate): 0.7142485699427977
True negative rate (specificity): 0.8675406144442657
Positive predictive value (precision): 0.6251706873008648
False discovery rate: 0.37482931269913516
F1 score: 0.666747572815534



In [66]:
df = pd.read_csv("creditcard.csv")
df.shape
df.head()

(284807, 31)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


df.Class.value_counts()
df.dtypes
df.nunique()

In [67]:
df = df.sample(frac=1).reset_index(drop=True)
df_sub = pd.concat([df.loc[df["Class"] == 1], df.loc[df["Class"] == 0].iloc[:10000, :]])
df_sub = df_sub.sample(frac=1).reset_index(drop=True)
del df

In [68]:
X = df_sub.iloc[:, :-1]
y = df_sub.iloc[:, -1]
y.replace(0, -1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [69]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [56]:
X_train[:10, :]

array([[3.09007982e-01, 9.40981556e-01, 6.69837582e-01, 9.55993645e-01,
        1.81667897e-01, 5.72751275e-01, 4.04225589e-01, 5.89819591e-01,
        6.73414140e-01, 4.97136862e-01, 7.01265974e-01, 3.21166264e-01,
        8.05724809e-01, 6.33358270e-01, 7.15104322e-01, 4.76237703e-01,
        6.83509750e-01, 7.52271447e-01, 7.69632195e-01, 5.46790953e-01,
        4.95878618e-01, 4.54858608e-01, 5.26743723e-01, 5.84920360e-01,
        4.30363532e-01, 4.71309026e-01, 4.12976533e-01, 5.53803720e-01,
        3.58729948e-01, 4.07130207e-03],
       [9.03958465e-01, 9.90243036e-01, 6.60209793e-01, 8.60322129e-01,
        2.47166831e-01, 5.83816660e-01, 3.67965015e-01, 5.89837522e-01,
        6.70511364e-01, 6.04064466e-01, 6.78714264e-01, 3.37172192e-01,
        8.00683317e-01, 3.64757717e-01, 6.95927952e-01, 3.84934188e-01,
        7.65555604e-01, 7.77054495e-01, 6.66402775e-01, 5.26172580e-01,
        4.99965938e-01, 4.59867000e-01, 5.37936329e-01, 5.90988760e-01,
        4.24562759e-01,

In [70]:
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

In [78]:
ws, z = adaboost(X_train, y_train, 20)

k=0, accuracy=97.84344096270702%
k=1, accuracy=94.03073990229953%
k=2, accuracy=93.94733706660313%
k=3, accuracy=93.97116644823068%
k=4, accuracy=94.28094840938877%
k=5, accuracy=93.1967115453354%
k=6, accuracy=92.8869295841773%
k=7, accuracy=93.06564994638389%
k=8, accuracy=93.00607649231503%
k=9, accuracy=93.07756463719767%
k=10, accuracy=94.12605742880972%
k=11, accuracy=93.48266412486595%
k=12, accuracy=94.10222804718218%
k=13, accuracy=93.13713809126654%
k=14, accuracy=93.68521386870012%
k=15, accuracy=93.1967115453354%
k=16, accuracy=92.86310020254975%
k=17, accuracy=93.75670201358275%
k=18, accuracy=93.99499582985823%
k=19, accuracy=93.70904325032765%


In [79]:
# Using 1 and -1
y_pred = weighted_majority(X_train, ws, z)
report(y_train, y_pred)
    
y_pred = weighted_majority(X_test, ws, z)
report(y_test, y_pred)

Accuracy: 0.9791492910758965
True positive rate (sensitivity, recall, hit rate): 0.5653266331658291
True negative rate (specificity): 0.9997498436522827
Positive predictive value (precision): 0.9911894273127754
False discovery rate: 0.00881057268722467
F1 score: 0.72

Accuracy: 0.9804668889947594
True positive rate (sensitivity, recall, hit rate): 0.574468085106383
True negative rate (specificity): 0.9995012468827931
Positive predictive value (precision): 0.9818181818181818
False discovery rate: 0.01818181818181818
F1 score: 0.7248322147651006



In [80]:
cm(y_test, y_pred)

array([[2004,    1],
       [  40,   54]])

In [30]:
df = pd.read_csv('bank-additional-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
df.nunique()

age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64

In [35]:
df.default.unique()

array(['no', 'unknown', 'yes'], dtype=object)

In [8]:
df.contact.unique()

array(['telephone', 'cellular'], dtype=object)

In [9]:
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [31]:
df.y.replace(['no', 'yes'], [-1, 1], inplace=True)

In [32]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [33]:
X.head()
y.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0


0   -1
1   -1
2   -1
3   -1
4   -1
Name: y, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [19]:
df.nunique()

age                 78
job                 12
marital              4
education            8
default              3
housing              3
loan                 3
contact              2
month               10
day_of_week          5
duration          1544
campaign            42
pdays               27
previous             8
poutcome             3
emp.var.rate        10
cons.price.idx      26
cons.conf.idx       26
euribor3m          316
nr.employed         11
y                    2
dtype: int64

In [24]:
df['poutcome'].unique()

array(['nonexistent', 'failure', 'success'], dtype=object)

In [34]:
cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
X = pd.get_dummies(X, columns=cols)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [38]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [41]:
y_train = y_train.values.reshape(-1, 1)
y_test = y_test.values.reshape(-1, 1)

In [53]:
w, losses = logistic_regression(X_train, y_train, 50, 10, 0.1)

In [54]:
pred = predict(X_test, w)
accuracy(y_test, pred)

0.9021607186210245

In [55]:
ws, z = adaboost(X_train, y_train, 20)

k=0, accuracy=90.57056145675266%
k=1, accuracy=85.38998482549317%
k=2, accuracy=86.11532625189682%
k=3, accuracy=86.19119878603946%
k=4, accuracy=84.77086494688922%
k=5, accuracy=85.17147192716237%
k=6, accuracy=85.30500758725341%
k=7, accuracy=86.78603945371776%
k=8, accuracy=86.10015174506829%
k=9, accuracy=85.34142640364189%
k=10, accuracy=85.49924127465857%
k=11, accuracy=84.7587253414264%
k=12, accuracy=86.33687405159333%
k=13, accuracy=85.61760242792108%
k=14, accuracy=85.91502276176024%
k=15, accuracy=86.16995447647952%
k=16, accuracy=85.86039453717754%
k=17, accuracy=85.8391502276176%
k=18, accuracy=86.46737481031866%
k=19, accuracy=86.66767830045524%


In [56]:
# Using 1 and -1
y_pred = weighted_majority(X_train, ws, z)
report(y_train, y_pred)
    
y_pred = weighted_majority(X_test, ws, z)
report(y_test, y_pred)

Accuracy: 0.9033080424886191
True positive rate (sensitivity, recall, hit rate): 0.23809523809523808
True negative rate (specificity): 0.987352156969987
Positive predictive value (precision): 0.704
False discovery rate: 0.296
F1 score: 0.3558431055398302

Accuracy: 0.9009468317552805
True positive rate (sensitivity, recall, hit rate): 0.23622881355932204
True negative rate (specificity): 0.9869755963805867
Positive predictive value (precision): 0.7012578616352201
False discovery rate: 0.29874213836477986
F1 score: 0.35340729001584786



In [80]:
df = pd.read_csv("online_adult.csv")
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

In [19]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,-1
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,-1
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,-1
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,-1
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,-1


In [32]:
df.nunique()

age                  73
workclass             8
fnlwgt            21647
education            16
education-num        16
marital-status        7
occupation           14
relationship          6
race                  5
sex                   2
capital-gain        119
capital-loss         92
hours-per-week       94
native-country       41
income                2
dtype: int64

In [81]:
df["income"].replace([' <=50K', ' >50K'], [-1, 1], inplace=True)

In [82]:
df.replace(" ?", np.nan, inplace=True)

In [83]:
simputer = SimpleImputer(missing_values = np.nan, strategy='most_frequent')
df = pd.DataFrame(simputer.fit_transform(df), columns=df.columns, index=df.index)

In [84]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [87]:
# X = X.drop(["education-num", "native-country"], axis=1)
# X["sex"].replace([' Male', ' Female'], [0, 1], inplace=True)
onehot_columns=['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
X = pd.get_dummies(X, columns=onehot_columns)

In [88]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

In [89]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [90]:
y_train = y_train.values.astype(int).reshape(-1, 1)
y_test = y_test.values.astype(int).reshape(-1, 1)

In [91]:
w, losses = logistic_regression(X_train, y_train, 50, 10, 0.1)

In [92]:
pred = predict(X_test, w)
accuracy(y_test, pred)

0.8439803439803439

In [93]:
ws, z = adaboost(X_train, y_train, 20)

k=0, accuracy=84.27518427518427%
k=1, accuracy=82.05023205023205%
k=2, accuracy=82.13213213213213%
k=3, accuracy=81.84206934206935%
k=4, accuracy=81.5997815997816%
k=5, accuracy=80.17676767676768%
k=6, accuracy=81.54859404859405%
k=7, accuracy=80.93093093093093%
k=8, accuracy=80.44635544635544%
k=9, accuracy=80.72276822276822%
k=10, accuracy=81.31313131313132%
k=11, accuracy=81.52470652470653%
k=12, accuracy=81.41891891891892%
k=13, accuracy=81.40185640185639%
k=14, accuracy=81.006006006006%
k=15, accuracy=81.14591864591864%
k=16, accuracy=80.56238056238057%
k=17, accuracy=81.22440622440622%
k=18, accuracy=81.43939393939394%
k=19, accuracy=79.74337974337973%


In [94]:
# Using 1 and -1
y_pred = weighted_majority(X_train, ws, z)
report(y_train, y_pred)
    
y_pred = weighted_majority(X_test, ws, z)
report(y_test, y_pred)

Accuracy: 0.8390322140322141
True positive rate (sensitivity, recall, hit rate): 0.6794980259447264
True negative rate (specificity): 0.8899693859175221
Positive predictive value (precision): 0.6634999311579237
False discovery rate: 0.3365000688420763
F1 score: 0.6714036920933473

Accuracy: 0.836916461916462
True positive rate (sensitivity, recall, hit rate): 0.6582109479305741
True negative rate (specificity): 0.8903071400079776
Positive predictive value (precision): 0.6419270833333334
False discovery rate: 0.3580729166666667
F1 score: 0.6499670402109426



In [98]:
np.sum(y_train == 1)
np.sum(y_train == -1)

7092

22212