In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from IPython.display import display

df = pd.read_csv("bank-full.csv", delimiter=";")
del df["default"]
del df["loan"]

In [111]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# Q1 What is the most frequent mode for the column education?

In [112]:
df.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

### Answer: secondary

# Question 2
## Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

In [113]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [114]:
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

In [115]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical = ['job', 'marital', 'education', 'balance', 'housing', 'contact',
        'month', 'poutcome']
df[numerical].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [116]:
# we didn't learn this ... I'm sure there is an easy thing we could append to the .corr() to get the biggest. I found it by inspection.

### Answer: pdays and previous

In [117]:
df.y = (df.y == "yes").astype(int)

# Prepare the train/val/test

In [118]:
fullTrainData, testData = train_test_split(df, test_size=0.2, random_state=42)
trainData, valData = train_test_split(fullTrainData, test_size=0.25, random_state=42)

In [119]:
yTrain = trainData.y
ytest = testData.y
del testData["y"]
yVal = valData.y
del valData["y"]

# Q3
- Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
- Round the scores to 2 decimals using round(score, 2).

In [120]:
def mutualInfoScore(series):
    return mutual_info_score(series, trainData.y)

In [121]:
trainData[categorical].apply(mutualInfoScore).sort_values(ascending=False).round(2)

balance      0.12
poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

### Which of these variables has the biggest mutual information score?
### Answer: poutcome

In [122]:
del trainData["y"]

# Q4
- Now let's train a logistic regression.
- Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
  - model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
- Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [123]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()

In [124]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
xTrainDicts = trainData.to_dict(orient="records")
xTrain = dv.fit_transform(xTrainDicts)
model.fit(xTrain, yTrain)

In [125]:
valDicts = valData.to_dict(orient="records")
xVal = dv.transform(valDicts)

In [137]:
yPred = model.predict(xVal) # in the lesson we used model.predictProb but the question isn't giving a a value to assess (like >= 0.5)
accuracyArray = (yVal == yPred).astype(int)
originalAccuracy = accuracyArray.mean()
originalAccuracy

np.float64(0.9009068790090687)

### What accuracy did you get
### Answer: 0.9

# Q5, feature elimination (probably the hard way)

In [138]:
elimFeatures = ['age', 'balance', 'marital', 'previous']
for feature in elimFeatures:
    tData = trainData.copy()
    del tData[feature]
    xTrainDicts = tData.to_dict(orient="records")
    xTrain = dv.fit_transform(xTrainDicts)
    model.fit(xTrain, yTrain)
    vData = valData.copy()
    del vData[feature]
    valDicts = vData.to_dict(orient="records")
    xVal = dv.transform(valDicts)
    yPred = model.predict(xVal)
    accuracyArray = (yVal == yPred).astype(int)
    accuracy = accuracyArray.mean()
    difference = abs(originalAccuracy - accuracy)
    display(feature + " Accuracy: " + str(accuracy) + " Difference: " + str(difference))

'age Accuracy: 0.9013492590134926 Difference: 0.00044238000442387015'

'balance Accuracy: 0.9010174740101747 Difference: 0.0001105950011059953'

'marital Accuracy: 0.9009068790090687 Difference: 0.0'

'previous Accuracy: 0.9009068790090687 Difference: 0.0'

### Q: Which had the least difference in accuracy?
### A: I get two that had no difference, maritial and previous

# Q6. use regularized logistic regression ... OK, this wasn't covered in the learning

In [147]:
cVals = [0.01, 0.1, 1, 10, 100]
for c in cVals:
    xTrainDicts = trainData.to_dict(orient="records")
    xTrain = dv.fit_transform(xTrainDicts)
    model.fit(xTrain, yTrain, c) # I'm guessing here that this is the c value
    valDicts = valData.to_dict(orient="records")
    xVal = dv.transform(valDicts)
    yPred = model.predict(xVal)
    accuracyArray = (yVal == yPred).astype(int)
    accuracy = accuracyArray.mean()
    display(str(c) + " : " + str(accuracy))

'0.01 : 0.8979208139792081'

'0.1 : 0.9007962840079629'

'1 : 0.9009068790090687'

'10 : 0.9009068790090687'

'100 : 0.9006856890068569'

### Q: Which c value produces the best accuracy?
### A: both 1 and 10, so we are asked to choose the smallest c: 1