# Lab Assignment 7: Evaluation and Multi-Layer Perceptron
## Rupal Sanghavi, Omar Roa, Van Tran
# Business Case

This dataset contains the survey responses of students in a Statistics class at the Faculty of Social and Economic Sciences, Comenius University in Bratislava, Slovakia, and of their friends (ages 15-30, henceforth referred to as "young people"). The survey covered a mix of topics:

* Music preferences (19 items)
* Movie preferences (12 items)
* Hobbies & interests (32 items)
* Phobias (10 items)
* Health habits (3 items)
* Personality traits, views on life, & opinions (57 items)
* Spending habits (7 items)
* Demographics (10 items)

The dataset can be found here: https://www.kaggle.com/miroslavsabo/young-people-survey

Our target is to predict how likely a young person would spend money on gadgets. 

We wanted to find a classifier that would interest advertisers. Many of the questions asked in the survey cover information that may not be readily available or that cannot be scraped from social media. These include phobias and spending habits other than the likelihood of spending money on gadgets.

In [1]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
%load_ext memory_profiler
from sklearn.metrics import make_scorer
from scipy.special import expit
import time
import math
import random
from memory_profiler import memory_usage
from sklearn import metrics as mt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from scipy import interp

# Load the raw survey responses from the comma-separated export.
df = pd.read_csv('responses.csv', sep=",")
# Name of the column the rest of the notebook uses as the prediction target.
target_classifier = 'Spending on gadgets'

In [2]:
# Columns kept from the survey, listed in their original order and grouped
# here by survey section for readability.
desired_features = [
    # music preferences
    "Music", "Dance", "Folk", "Country", "Classical music", "Musical",
    "Pop", "Rock", "Metal or Hardrock", "Punk", "Hiphop, Rap",
    "Reggae, Ska", "Swing, Jazz", "Rock n roll", "Alternative", "Latino",
    "Techno, Trance", "Opera",
    # movie preferences
    "Movies", "Horror", "Thriller", "Comedy", "Romantic", "Sci-fi", "War",
    "Fantasy/Fairy tales", "Animated", "Documentary", "Western", "Action",
    # hobbies & interests
    "History", "Psychology", "Politics", "Mathematics", "Physics",
    "Internet", "PC", "Economy Management", "Biology", "Chemistry",
    "Reading", "Geography", "Foreign languages", "Medicine", "Law", "Cars",
    "Art exhibitions", "Religion", "Countryside, outdoors", "Dancing",
    "Musical instruments", "Writing", "Passive sport", "Active sport",
    "Gardening", "Celebrities", "Shopping", "Science and technology",
    "Theatre", "Fun with friends", "Adrenaline sports", "Pets",
    # health habits
    "Smoking", "Alcohol", "Healthy eating",
    # spending habits (the prediction target)
    "Spending on gadgets",
    # demographics
    "Age", "Height", "Weight", "Number of siblings", "Gender",
    "Left - right handed", "Education", "Village - town",
    "House - block of flats",
]

# Restrict the frame to the selected columns only.
df = df[desired_features]

Our list of included features. Most of these measure interest in a certain topic.

In [3]:
# Remove rows whose target classifier value is NaN: without a label they
# cannot be used for supervised training.
df_cleaned_classifier = df[np.isfinite(df[target_classifier])]
# Replace missing numeric answers with the floored column mean.
# numeric_only=True restricts the mean to numeric columns; recent pandas
# versions raise a TypeError when asked to average string-valued columns.
numeric_means = np.floor(df_cleaned_classifier.mean(numeric_only=True))
df_imputed = df_cleaned_classifier.fillna(numeric_means)
# Names of the string-valued (categorical) columns.
object_features = list(df_cleaned_classifier.select_dtypes(include=['object']).columns)
# Categorical gaps were not filled above, so drop the rows that still have NaNs.
df_imputed = df_imputed.dropna()
print(object_features)

['Smoking', 'Alcohol', 'Gender', 'Left - right handed', 'Education', 'Village - town', 'House - block of flats']


In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# One fitted LabelEncoder per categorical column, keyed by column name.
encoders = dict()
categorical_headers = object_features

for col in categorical_headers:
    # Strip surrounding whitespace before encoding so that padded and
    # unpadded spellings of the same answer share one code.
    df_imputed[col] = df_imputed[col].str.strip()
    if col == "Spending on gadgets":
        # The target, if it were categorical, is encoded in place.
        tmp = LabelEncoder()
        df_imputed[col] = tmp.fit_transform(df_imputed[col])
    else:
        # Features get their integer codes in a new '<name>_int' column;
        # the original string column is left in the frame.
        encoders[col] = LabelEncoder()
        df_imputed[col + '_int'] = encoders[col].fit_transform(df_imputed[col])

numeric_headers = [feature for feature in desired_features if feature not in categorical_headers]

for col in numeric_headers:
    # Use the builtin int: the np.int alias was deprecated in NumPy 1.20 and
    # removed in NumPy 1.24, where it raises AttributeError.
    df_imputed[col] = df_imputed[col].astype(int)
#     Feature scaling is currently disabled:
#     ss = StandardScaler()
#     df_imputed[col] = ss.fit_transform(df_imputed[col].values.reshape(-1, 1))

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

# Build the label vector y and the feature matrix X. The guard skips this on
# a re-run, when the target column has already been removed from the frame.
if target_classifier in df_imputed:
    y = df_imputed[target_classifier].values  # the labels we want to predict
    del df_imputed[target_classifier]  # keep the class label out of the features
    # Keep only numeric columns: the raw string versions of the categorical
    # features are still in the frame next to their '<name>_int' encodings
    # and would otherwise turn X into an object array containing strings.
    X = df_imputed.select_dtypes(include=[np.number]).values

num_cv_iterations = 3
num_instances = len(y)
# Stratified shuffle splits with 20% of the rows held out for testing.
cv_object = StratifiedShuffleSplit(n_splits=num_cv_iterations, test_size=0.2)

print(cv_object)

StratifiedShuffleSplit(n_splits=3, random_state=None, test_size=0.2,
            train_size=None)


In [6]:
# Walk through every split produced by the splitter. Each pass overwrites the
# arrays from the previous one, so after the loop the variables hold the
# final split.
for train_indices, test_indices in cv_object.split(X, y):
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

In [12]:
# Inspect the training labels left over from the last pass of the split loop.
print(y_train)

[1 2 3 2 3 1 4 2 2 5 1 4 3 1 1 1 2 3 1 2 1 5 3 1 4 1 3 2 1 2 3 3 2 2 2 2 2
 4 4 3 1 2 1 4 1 2 4 2 1 3 2 5 2 4 1 5 2 5 5 2 3 5 5 2 5 1 3 1 3 3 4 4 1 2
 5 2 5 3 4 4 3 3 1 3 5 2 1 3 3 3 2 1 3 3 2 3 1 3 1 1 5 3 3 3 1 1 4 2 2 4 2
 2 2 1 3 1 4 2 5 5 4 2 1 1 4 2 4 5 5 2 2 1 3 4 1 1 2 4 2 4 5 4 5 1 5 2 1 1
 2 5 1 3 2 3 2 2 3 3 4 2 1 2 2 2 4 5 3 4 5 3 4 4 2 3 4 2 5 1 2 3 2 3 4 3 1
 2 2 3 2 2 1 4 3 3 1 5 3 5 2 1 4 2 2 3 4 2 2 2 4 3 3 2 2 1 3 4 3 4 2 3 2 3
 2 3 4 2 5 1 1 2 2 4 5 2 3 3 2 3 3 2 1 1 1 4 2 4 2 5 3 3 3 2 5 4 1 4 4 2 2
 4 5 4 4 4 4 3 3 3 3 4 4 3 1 1 2 3 3 3 1 5 2 3 3 3 3 5 1 3 5 3 3 2 5 4 4 1
 5 4 5 3 2 5 4 3 4 2 2 3 4 3 5 4 3 3 3 2 2 1 2 5 1 2 4 2 3 4 5 3 3 2 1 4 4
 2 4 1 5 2 5 3 5 4 3 5 3 1 5 1 3 5 2 5 5 4 3 5 4 3 3 1 4 5 2 2 1 5 1 3 2 3
 2 3 3 3 4 3 3 4 5 5 2 4 1 1 2 3 4 5 3 5 5 3 2 2 2 5 3 4 2 2 3 2 1 4 1 3 2
 5 1 1 3 2 3 2 1 3 1 3 4 2 1 2 4 5 2 5 4 1 5 2 3 4 2 4 2 4 1 5 1 5 4 2 1 4
 4 2 4 3 3 3 2 5 5 5 4 2 2 2 2 4 3 3 2 1 3 2 1 1 4 2 5 3 2 2 3 2 5 1 2 3 3
 3 2 3 2 1 2 3 3 2 3 3 3 

In [None]:
# NOTE(review): tensorflow.contrib and tf.logging look like TensorFlow 1.x-only
# APIs (not shipped with TensorFlow 2.x) — confirm the installed version
# before running these cells.
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib import layers
from tensorflow.contrib.learn.python import SKCompat
from tensorflow.contrib.learn.python.learn.estimators import model_fn as model_fn_lib
tf.logging.set_verbosity(tf.logging.WARN) # control the verbosity of tensor flow

In [None]:
# we need to tell tensorflow how many inputs to expect and what the data types will be
# for this early example, everything is just numeric, real valued
features_tf = [layers.real_valued_column('', dimension=X_train.shape[1])]

# DNNClassifier defaults to binary classification and expects integer labels
# in [0, n_classes). The target here is a multi-valued rating that does not
# start at 0, so size the output layer to cover the largest label value.
n_classes = int(y.max()) + 1

clf = SKCompat(# wrap with SKCompat for easy usage like sklearn
            learn.DNNClassifier(hidden_units=[50],
                                feature_columns=features_tf,
                                n_classes=n_classes)
        )

# Train for a fixed number of steps on the training split.
clf.fit(X_train,y_train,steps=100)

In [None]:
from sklearn import metrics as mt

# The prediction output needs some interpretation, as it is not the same as
# sklearn's: the predicted labels are looked up under the 'classes' key.
yhat = clf.predict(X_test)['classes']

# Compare the predictions with the held-out labels.
confusion = mt.confusion_matrix(y_test, yhat)
accuracy = mt.accuracy_score(y_test, yhat)
print(confusion, accuracy)