In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [2]:
# Read the raw campaign export. low_memory=False makes pandas infer every
# column's dtype from the whole file instead of chunk by chunk, which prevents
# mixed-type columns (and the DtypeWarning that chunked inference emits).
email_data = pd.read_csv('task_data_set.csv', low_memory=False)
# Keep the freshly loaded frame untouched; the cleaning step works on this copy.
data = email_data.copy()

  interactivity=interactivity, compiler=compiler, result=result)


# Clean Dataset

In [3]:
def _encode_ordinal(series, codes, default):
    """Map the labels in `series` to integers via `codes`; anything else becomes `default`."""
    # Series.map yields NaN for missing values and for labels absent from
    # `codes`, so a single fillna covers both cases.
    return series.map(codes).fillna(default).astype(int)


def transform_data(df):
    """Convert the raw campaign frame into an all-numeric modelling table.

    The input frame is not modified; every step works on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Risk, Frequency, HasChildren,
        RegistrationDate (text in day/month/year form), ProductsPurchased,
        AgeBracket, HHIncome, Gender, RegistrationCode and IsClick.
        A Zip column is dropped if present.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: ProductsPurchased, Risk, Frequency,
        HasChildren, Days_Since_Registration, Age, Income, Gender_A, Gender_B,
        Gender_C, Gender_F, Gender_I, Gender_M, Gender_U,
        RegistrationCodeUsed, IsClick.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    ValueError
        If a RegistrationDate value does not match the day/month/year format,
        or if no row has a usable RegistrationDate (the day count then cannot
        be cast to int).

    Notes
    -----
    Missing ages, incomes and registration dates are imputed with the mean of
    the frame that is passed in, so the result depends on which rows are
    included in the call.
    """
    # Start from a copy so the caller's frame is left untouched. Zip is not
    # used as a feature.
    df = df.drop(columns=['Zip'], errors='ignore')

    # Ordinal encodings. Missing or unrecognised labels get the code 0, the
    # same code as the explicit 'Unknown' / 'Never' label.
    risk_codes = {'Big': 3, 'Med': 2, 'Small': 1, 'Unknown': 0}
    frequency_codes = {'High': 3, 'Low': 2, 'OneTime': 1, 'Never': 0}
    children_codes = {'Y': 2, 'N': 1, 'Unknown': 0}
    df['Risk'] = _encode_ordinal(df['Risk'], risk_codes, risk_codes['Unknown'])
    df['Frequency'] = _encode_ordinal(df['Frequency'], frequency_codes, frequency_codes['Never'])
    df['HasChildren'] = _encode_ordinal(df['HasChildren'], children_codes, children_codes['Unknown'])

    # Tenure: whole days between each registration and the latest registration
    # in the frame; rows without a date get the mean tenure.
    df['Date'] = pd.to_datetime(df['RegistrationDate'], format='%d/%m/%Y')
    days_since = (df['Date'].max() - df['Date']).dt.days
    days_since = days_since.fillna(days_since.mean())
    df['Days_Since_Registration'] = days_since.round(0).astype(int)

    df['ProductsPurchased'] = df['ProductsPurchased'].fillna(0)

    # Each age bracket is replaced by one representative number; missing or
    # unrecognised brackets get the column mean.
    age_values = {'35-39': 37, '19-24': 22, '55-59': 57, '40-44': 42, '60-64': 62, '80-84': 82,
                  '65-69': 67, '50-54': 52, '25-29': 27, '45-49': 47, '70-74': 72, '30-34': 32,
                  '75-79': 77, '85-89': 87, '90-94': 92, '95-99': 97}
    age = df['AgeBracket'].map(age_values)
    df['Age'] = age.fillna(age.mean())

    # Same treatment for household income brackets.
    # NOTE(review): the numbers look like thousands of dollars - confirm.
    income_values = {'$60-69': 65, '$80-89': 85, '$40-49': 45, '$10-19': 15, '$30-39': 35,
                     '$50-59': 55, '$90-99': 95, 'Under $10K': 9, '$20-29': 25, '$100-149': 125,
                     '$250K+': 300, '$70-79': 75, '$175-199': 187, '$200-249': 225,
                     '$150-174': 162}
    income = df['HHIncome'].map(income_values)
    df['Income'] = income.fillna(income.mean())

    # One-hot encode Gender, then make sure every expected indicator exists so
    # a frame that lacks some gender code still yields the full column set.
    df = pd.get_dummies(df, columns=['Gender'])
    gender_columns = ['Gender_A', 'Gender_B', 'Gender_C', 'Gender_F',
                      'Gender_I', 'Gender_M', 'Gender_U']
    for column in gender_columns:
        if column not in df.columns:
            df[column] = 0

    # 1.0 when a registration code is present, 0.0 otherwise.
    df['RegistrationCodeUsed'] = df['RegistrationCode'].notnull().astype(float)

    feature_columns = (
        ['ProductsPurchased', 'Risk', 'Frequency', 'HasChildren',
         'Days_Since_Registration', 'Age', 'Income']
        + gender_columns
        + ['RegistrationCodeUsed', 'IsClick']
    )
    return df[feature_columns]

In [4]:
# Build the numeric modelling table from the working copy of the raw data.
mapped_data = transform_data(df=data)

# Split Data

In [5]:
import sklearn
from sklearn.model_selection import train_test_split

# Separate predictors from the target by column name rather than by position,
# so the split stays correct even if the column order of mapped_data changes.
y = mapped_data['IsClick']
X = mapped_data.drop(columns='IsClick')

# Hold out 15% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

### Always predicting the majority class (a click) would get us a baseline accuracy of

In [6]:
# Share of rows with IsClick == 1, expressed as a percentage. Scale to percent
# first and round last: rounding the fraction and then multiplying by 100 can
# reintroduce floating-point noise in the printed value.
percent_clicked = round(100 * float(mapped_data['IsClick'].eq(1).mean()), 2)
print('Baseline % = {:.2f}%'.format(percent_clicked))

Baseline % = 54.63%


# Score Models

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

# Candidate models; a seed is fixed where the estimator accepts one.
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
clf4 = KNeighborsClassifier(n_neighbors=6)

# Hard-voting ensemble over the four models above.
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4)],
    voting='hard',
)

# 5-fold cross-validated accuracy on the training split: mean and standard
# deviation across folds for every model.
print("Training Scores")
for label, clf in (
    ('Logistic Regression', clf1),
    ('Random Forest', clf2),
    ('naive Bayes', clf3),
    ('KNN', clf4),
    ('Ensemble', eclf),
):
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (np.mean(scores), np.std(scores), label))

Training Scores
Accuracy: 0.55 (+/- 0.00) [Logistic Regression]
Accuracy: 0.55 (+/- 0.00) [Random Forest]
Accuracy: 0.55 (+/- 0.00) [naive Bayes]
Accuracy: 0.54 (+/- 0.00) [KNN]
Accuracy: 0.55 (+/- 0.00) [Ensemble]


# Test Models

In [8]:
# Fit every model on the full training split and report its accuracy on the
# held-out test split.
print('Testing Scores')
for label, clf in (
    ('Logistic Regression', clf1),
    ('Random Forest', clf2),
    ('naive Bayes', clf3),
    ('KNN', clf4),
    ('Ensemble', eclf),
):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("Accuracy: %0.2f [%s]" % (round(score, 2), label))

Testing Scores
Accuracy: 0.55 [Logistic Regression]
Accuracy: 0.55 [Random Forest]
Accuracy: 0.55 [naive Bayes]
Accuracy: 0.55 [KNN]
Accuracy: 0.55 [Ensemble]
