# Predicting the Outcome of Bank Marketing
## Author: Bryan Flood

### Import libraries

In [94]:
import pandas as pd
import numpy as np

### Provide functions for cleaning and saving results

In [95]:
def clean_query_data(dataset):
    """Return a cleaned, model-ready copy of a bank-marketing frame.

    The input frame is not modified. The steps are:

    * drop the ``contact``, ``marital`` and ``poutcome`` columns;
    * relabel ``'unknown'`` as ``'other'`` in ``job`` and ``education``;
    * one-hot encode ``job``, ``education`` and ``month`` (in that order,
      so the dummy columns are appended in that order);
    * map the ``'yes'``/``'no'`` columns ``deposit``, ``housing``,
      ``default`` and ``loan`` to 1/0 (any other value becomes NaN).
    """
    # Columns the model does not use
    cleaned = dataset.drop(['contact', 'marital', 'poutcome'], axis=1)

    # Fold 'unknown' into the 'other' category
    relabelled = ['job', 'education']
    cleaned[relabelled] = cleaned[relabelled].replace(to_replace=['unknown'], value='other')

    # One dummy column per category value
    for categorical in ('job', 'education', 'month'):
        cleaned = pd.get_dummies(cleaned, columns=[categorical])

    # yes/no strings -> 1/0
    yes_no = {'yes': 1, 'no': 0}
    for flag in ('deposit', 'housing', 'default', 'loan'):
        cleaned[flag] = cleaned[flag].map(yes_no)

    return cleaned

def clean_training_data(dataset):
    """Clean the training frame.

    Rows whose ``poutcome`` is ``'other'`` are discarded (by index label),
    then the result goes through the same cleaning as ``clean_query_data``.
    A new frame is returned; ``dataset`` itself is not modified.
    """
    other_outcome_labels = dataset.loc[dataset['poutcome'] == 'other'].index
    without_other = dataset.drop(index=other_outcome_labels)
    return clean_query_data(without_other)



def save_predictions(predictions, path="./data/predictions.csv"):
    """Write predictions to a CSV file, one ``TEST<n>,<yes|no>`` row each.

    Args:
        predictions: Iterable of predicted labels. A truthy value is
            written as ``yes`` and a falsy value as ``no``.
        path: Destination file. Defaults to ``./data/predictions.csv``;
            an existing file is overwritten.

    Rows are numbered from ``TEST1`` in iteration order and each row ends
    with a newline. No header row is written.
    """
    # Build all rows first instead of growing one string with `+=`.
    rows = [
        f"TEST{number},{'yes' if value else 'no'}\n"
        for number, value in enumerate(predictions, start=1)
    ]

    # The context manager closes the file even if writing fails.
    with open(path, "w") as csv_file:
        csv_file.writelines(rows)

### Load in and Clean data

In [96]:
# Column names, in file order; passing them via `names=` means the first
# row of each CSV is read as data rather than as a header.
feature_names = [
    'id', 'age', 'job', 'marital', 'education', 'default', 'balance',
    'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign',
    'previous', 'poutcome', 'deposit',
]

# Training set: read, then discard the id column
dataset = pd.read_csv(
    "./data/trainingset.csv", sep=',', names=feature_names
).drop(columns="id")

# Query set: same layout, same treatment
query_set = pd.read_csv(
    "./data/queries.csv", sep=',', names=feature_names
).drop(columns="id")

# Run both sets through their cleaning functions
temp_dataset = clean_training_data(dataset)
query_set = clean_query_data(query_set)

# Separate the target column from the feature columns
target = temp_dataset['deposit']
features = temp_dataset.drop(columns="deposit")

### Train on training set

In [97]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Hold-out fraction, and the seed shared by the split and the classifier
split = 0.50
seed_value = 8
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=split, random_state=seed_value
)

# Seed the tree as well: scikit-learn randomly permutes the features at each
# split, so an unseeded tree can report a different accuracy on every run.
decision_tree = DecisionTreeClassifier(random_state=seed_value)
decision_tree.fit(features_train, target_train)

# Score the tree on the held-out portion of the training data
predictions = decision_tree.predict(features_test)
print(f"Test Ratio {split}")
print(f"Accuracy Score of: {accuracy_score(target_test, predictions)}")

Test Ratio 0.5
Accuracy Score of: 0.8266986959505834


### Make Actual Prediction

In [98]:
# Align the query columns with the columns the tree was trained on. The two
# sets were one-hot encoded separately, so a category missing from one of
# them would otherwise hand predict() mismatched columns. Reindexing also
# removes the deposit column (it is not in `features`), adds any missing
# dummy column as all zeros, and makes this cell safe to re-run.
query_set = query_set.reindex(columns=features.columns, fill_value=0)

predictions = decision_tree.predict(query_set)

save_predictions(predictions)

# Documentation
## How the problem was solved
### 1. Data
There were a few issues with the data.
Previous and Balance had 19752 (81.3%) and 1890 (7.8%) zero values respectively.

The target variable skews heavily towards `no`, which occurs 88.4% of the time.
We have to assume that the training data is representative of the whole dataset and query set.
Without the actual results of the query set, all testing had to be done on the training set.

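I found that a lot of the rows and columns weren't that useful and ended up dropping them: the `contact`, `marital` and `poutcome` columns, and the training rows where `poutcome` was `other`.
I consolidated the `unknown` values in `job` and `education` into a single `other` category.
Lastly, I converted the binary yes/no string columns (`default`, `housing`, `loan` and `deposit`) to 1s and 0s.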
I found good performance with the rest as dummy variables as they all have a distance-based relationship for their values.

I considered using more advanced techniques for cleaning that took into consideration the standard deviation and similar metrics. I decided against it due to my correlation findings. Most of the different formulas for detecting correlations found nothing between deposit and the other columns.
Using Phi-K (φK) correlation, I was able to see that month and campaign had by far the most influence on predicting the deposit value.


### 2. Classifier
For the classifier I used sklearn's DecisionTreeClassifier.
After I had completed a quick clean of the data, I tested it on a variety of different classifiers provided by scikit-learn, and the decision tree gave consistently better results regardless of how the data was split.

One of the biggest issues with decision trees is how easily they overfit. In our case I did not believe this to be much of an issue, as the data is so skewed (88.4%) towards no deposit.


### 3. Testing
For testing I used a train and test split of the training data.
I used a wide variety of ratios and seed values to ensure accuracy.