In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math

from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm

%matplotlib inline

In [2]:
AdvWorksTest = pd.read_csv('AW_test.csv')
AdvWorksTrain = pd.read_csv('AdvWorksCusts.csv')
AW_BikeBuyer = pd.read_csv('AW_BikeBuyer.csv')

AdvWorksTest.columns = [str.replace('-', '_') for str in AdvWorksTest.columns]
AdvWorksTrain.columns = [str.replace('-', '_') for str in AdvWorksTrain.columns]

for col in AdvWorksTest.columns:
    if AdvWorksTest[col].dtype == object:
        count = 0
        count = [count + 1 for x in AdvWorksTest[col] if x == '?']
        #print(col + ' ' + str(sum(count)))
        
for col in AdvWorksTrain.columns:
    if AdvWorksTrain[col].dtype == object:
        count = 0
        count = [count + 1 for x in AdvWorksTrain[col] if x == '?']
        #print(col + ' ' + str(sum(count)))    
        
AdvWorksTrain = pd.merge(AdvWorksTrain, AW_BikeBuyer)        
        
print(AdvWorksTest.shape)  
print(AdvWorksTrain.shape)

(500, 23)
(16749, 24)


In [3]:
from datetime import datetime

def age_customer(dl):
    Db = datetime.strptime(dl, '%m/%d/%Y')
    dl = datetime.strptime('1998-01-01', '%Y-%m-%d').year - Db.year
 
    if dl > 45: return 'Above 55'
    elif 45 < dl <= 55: return 'Between 45 and 55'
    elif 25 < dl <= 45: return 'Between 25 and 45'
    elif 0 < dl <= 25: return 'Under 25'
    else: return 'None'
    
def age_customer_train(dl):
    Db = datetime.strptime(dl, '%Y-%m-%d')
    dl = datetime.strptime('1998-01-01', '%Y-%m-%d').year - Db.year
 
    if dl > 45: return 'Above 55'
    elif 45 < dl <= 55: return 'Between 45 and 55'
    elif 25 < dl <= 45: return 'Between 25 and 45'
    elif 0 < dl <= 25: return 'Under 25'
    else: return 'None'    

AdvWorksTest['GroupCustomers']  = AdvWorksTest.BirthDate.apply (lambda row: age_customer(row))  
AdvWorksTrain['GroupCustomers'] = AdvWorksTrain.BirthDate.apply (lambda row: age_customer_train(row))  

AdvWorksTest.drop('Title', axis = 1, inplace = True)
AdvWorksTest.drop('Suffix', axis = 1, inplace = True)
AdvWorksTest.drop('AddressLine2', axis = 1, inplace = True)
AdvWorksTest.drop('MiddleName', axis = 1, inplace = True)
AdvWorksTrain.drop('Title', axis = 1, inplace = True)
AdvWorksTrain.drop('Suffix', axis = 1, inplace = True)
AdvWorksTrain.drop('AddressLine2', axis = 1, inplace = True)
AdvWorksTrain.drop('MiddleName', axis = 1, inplace = True)

print(AdvWorksTrain.shape)
print(AdvWorksTest.shape)

(16749, 21)
(500, 20)


In [4]:
AdvWorksTrain = AdvWorksTrain.apply(lambda x:x.fillna(x.value_counts().index[0]))
AdvWorksTest = AdvWorksTest.apply(lambda x:x.fillna(x.value_counts().index[0]))

In [5]:
labels = np.array(AdvWorksTrain['BikeBuyer'])
print(labels)

[0 1 0 ... 1 0 0]


In [6]:
def encode_string(cat_features):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['Occupation','Gender','MaritalStatus']

Features = encode_string(AdvWorksTrain['GroupCustomers'])
for col in categorical_columns:
    temp = encode_string(AdvWorksTrain[col])
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:2, :])   

(16749, 12)
[[0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0.]
 [0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1.]]


In [7]:
FeaturesTrain = np.concatenate([Features, np.array(AdvWorksTrain[['NumberCarsOwned','NumberChildrenAtHome'
                                                              ,'YearlyIncome','TotalChildren']])], axis = 1)
print(FeaturesTrain.shape)
print(FeaturesTrain[:2, :])   

(16749, 16)
[[0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.37947e+05 2.00000e+00]
 [0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 3.00000e+00 1.01141e+05 3.00000e+00]]


In [8]:
## Randomly sample cases to create independent training and test data
nr.seed(9988)
indx = range(FeaturesTrain.shape[0])
indx = ms.train_test_split(indx, test_size = 300)

X_train = FeaturesTrain[indx[0],:]
y_train = np.ravel(labels[indx[0]])
X_test = FeaturesTrain[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [9]:
logistic_mod = linear_model.LogisticRegression() 
logistic_mod.fit(X_train, y_train) 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
y_pred = logistic_mod.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=['Y', 'N']))

In [11]:
probabilities = logistic_mod.predict_proba(X_test)
print(probabilities[:15,:])

[[0.75626405 0.24373595]
 [0.6314472  0.3685528 ]
 [0.63778531 0.36221469]
 [0.6443431  0.3556569 ]
 [0.60207408 0.39792592]
 [0.51562483 0.48437517]
 [0.63637515 0.36362485]
 [0.71187281 0.28812719]
 [0.70187592 0.29812408]
 [0.66281141 0.33718859]
 [0.50203852 0.49796148]
 [0.53107537 0.46892463]
 [0.66041148 0.33958852]
 [0.64212884 0.35787116]
 [0.66695967 0.33304033]]


In [12]:
def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])
scores = score_model(probabilities, 0.50)
print(np.array(scores[:15]))
print(y_test[:15])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0 0 0 0 0 0 0 1 0 0 1 0]


In [13]:
def print_metrics(labels, scores):
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion matrix')
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy  %0.2f' % sklm.accuracy_score(labels, scores))
    print(' ')
    print('           Positive      Negative')
    print('Num case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])

  
print_metrics(y_test, scores)     

                 Confusion matrix
                 Score positive    Score negative
Actual positive       203                11
Actual negative        54                32

Accuracy  0.78
 
           Positive      Negative
Num case      214            86
Precision    0.79          0.74
Recall       0.95          0.37
F1           0.86          0.50


In [14]:
def encode_string(cat_features):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['Occupation','Gender','MaritalStatus']

FeaturesTest = encode_string(AdvWorksTest['GroupCustomers'])
for col in categorical_columns:
    temp = encode_string(AdvWorksTest[col])
    Features = np.concatenate([FeaturesTest, temp], axis = 1)

print(FeaturesTest.shape)
print(FeaturesTest[:2, :])   

(500, 3)
[[1. 0. 0.]
 [0. 1. 0.]]


In [15]:
FeaturesTest = np.concatenate([FeaturesTest, np.array(AdvWorksTest[['NumberCarsOwned','NumberChildrenAtHome'
                                                              ,'YearlyIncome','TotalChildren']])], axis = 1)
print(FeaturesTest.shape)
print(FeaturesTest[:2, :])   

(500, 7)
[[1.00000e+00 0.00000e+00 0.00000e+00 2.00000e+00 0.00000e+00 8.69310e+04
  5.00000e+00]
 [0.00000e+00 1.00000e+00 0.00000e+00 2.00000e+00 2.00000e+00 1.00125e+05
  4.00000e+00]]


In [16]:
FeaturesTest[:11]

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, 2.00000e+00, 0.00000e+00,
        8.69310e+04, 5.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 2.00000e+00, 2.00000e+00,
        1.00125e+05, 4.00000e+00],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 2.00000e+00, 0.00000e+00,
        1.03985e+05, 4.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.27161e+05, 4.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 2.00000e+00,
        2.18760e+04, 2.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        4.44670e+04, 1.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 2.00000e+00, 2.00000e+00,
        7.77020e+04, 4.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 3.00000e+00, 4.00000e+00,
        9.94180e+04, 4.00000e+00],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 4.00000e+00,
        1.35220e+04, 4.00000e+00],
       [0.00000e+00, 0.00000

In [17]:
#logistic_mod.predict(FeaturesTest[:11])