In [None]:
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline 
%load_ext memory_profiler
from sklearn.metrics import make_scorer
from scipy.special import expit
import time
import math
import random
from memory_profiler import memory_usage
from sklearn import metrics as mt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from scipy import interp

# Name of the survey column that the models in this notebook predict.
target_classifier = 'Spending on gadgets'
# Load the raw survey answers; the path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='responses.csv', sep=',')

In [None]:
# Survey columns kept for modelling. The order of this list is the column order
# of the frame produced below, so entries must not be re-sorted.
desired_features = [
    # --- music-related columns ---
    "Music", "Dance", "Folk", "Country", "Classical music", "Musical",
    "Pop", "Rock", "Metal or Hardrock", "Punk", "Hiphop, Rap", "Reggae, Ska",
    "Swing, Jazz", "Rock n roll", "Alternative", "Latino", "Techno, Trance",
    "Opera",
    # --- movie-related columns ---
    "Movies", "Horror", "Thriller", "Comedy", "Romantic", "Sci-fi", "War",
    "Fantasy/Fairy tales", "Animated", "Documentary", "Western", "Action",
    # --- subjects, hobbies and interests ---
    "History", "Psychology", "Politics", "Mathematics", "Physics", "Internet",
    "PC", "Economy Management", "Biology", "Chemistry", "Reading", "Geography",
    "Foreign languages", "Medicine", "Law", "Cars", "Art exhibitions",
    "Religion", "Countryside, outdoors", "Dancing", "Musical instruments",
    "Writing", "Passive sport", "Active sport", "Gardening", "Celebrities",
    "Shopping", "Science and technology", "Theatre", "Fun with friends",
    "Adrenaline sports", "Pets",
    # --- habits ---
    "Smoking", "Alcohol", "Healthy eating",
    # --- prediction target (see target_classifier) ---
    "Spending on gadgets",
    # --- demographics ---
    "Age", "Height", "Weight", "Number of siblings", "Gender",
    "Left - right handed", "Education", "Village - town",
    "House - block of flats",
]

# Keep only the selected columns; a name missing from the CSV raises KeyError here.
df = df[desired_features]

In [None]:
# Remove the rows whose target classifier value is missing: without a label
# they can be used neither for training nor for scoring.
df_cleaned_classifier = df[np.isfinite(df[target_classifier])]
# Replace missing numeric answers with the column mean. numeric_only=True keeps
# the text columns out of the computation: without it pandas >= 2.0 raises a
# TypeError on object columns (older releases silently skipped them).
df_imputed = df_cleaned_classifier.fillna(df.mean(numeric_only=True))
# Text (object dtype) columns are treated as the categorical features.
object_features = list(df_cleaned_classifier.select_dtypes(include=['object']).columns)
# The means only fill numeric gaps, so rows that still miss a categorical
# answer are dropped here.
df_imputed = df_imputed.dropna()
print(object_features)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# One fitted LabelEncoder per categorical feature, kept so that the learned
# classes can be reused when the model's feature columns are built.
encoders = dict()
categorical_headers = object_features

for col in categorical_headers:
    # strip surrounding whitespace so that otherwise equal answers share one label
    df_imputed[col] = df_imputed[col].str.strip()
    if col == target_classifier:
        # the target is encoded in place; its encoder is not stored
        df_imputed[col] = LabelEncoder().fit_transform(df_imputed[col])
    else:
        encoders[col] = LabelEncoder()
        df_imputed[col+'_int'] = encoders[col].fit_transform(df_imputed[col])

# The target is left out on purpose: it is the label, so it must be neither
# standard-scaled (that would destroy the class values) nor listed among the
# numeric input features (that would leak the answer into the model).
numeric_headers = [
    feature for feature in desired_features
    if feature not in categorical_headers and feature != target_classifier
]

for col in numeric_headers:
    # builtin float instead of np.float: the alias was removed in NumPy 1.24
    df_imputed[col] = df_imputed[col].astype(float)
    ss = StandardScaler()
    # the scaler works on a 2-D (n, 1) array; flatten the result back to a column
    df_imputed[col] = ss.fit_transform(df_imputed[col].values.reshape(-1, 1)).ravel()

NameError: name 'object_features' is not defined

In [None]:
# Split the cleaned and encoded frame into a training part and a held-out part.
# The raw `df` must not be used here: it still contains NaNs and has none of
# the '<column>_int' columns. Pointing both names at one frame would also score
# the model on the very rows it was trained on.
# 20% of the rows are held out; the seed is fixed so the split is reproducible.
df_test = df_imputed.sample(frac=0.2, random_state=42)
df_train = df_imputed.drop(df_test.index)

In [None]:
# NOTE(review): tf.contrib and tf.logging are TensorFlow 1.x APIs; this cell is
# not expected to import under TensorFlow 2.x -- confirm the pinned version.
import tensorflow as tf
from tensorflow.contrib import learn
from tensorflow.contrib import layers
from tensorflow.contrib.learn.python import SKCompat
# raise the log threshold to WARN to control the verbosity of TensorFlow
tf.logging.set_verbosity(tf.logging.WARN) # control the verbosity of tensor flow

In [None]:
# '<column>_int' name of the integer-encoded twin of every categorical column
categorical_headers_ints = [header + '_int' for header in categorical_headers]

# # we will forego one-hot encoding right now and instead just scale all inputs
# feature_columns = categorical_headers_ints+numeric_headers
# X_train =  ss.fit_transform(df_train[feature_columns].values).astype(np.float32)
# X_test =  ss.transform(df_test[feature_columns].values).astype(np.float32)

# y_train = df_train['Spending on gadgets'].values.astype(np.int)
# y_test = df_test['Spending on gadgets'].values.astype(np.int)

# print(feature_columns)

In [None]:
# update the model to take input features as a dictionary
def my_model(dict_features,targets):
    """Build a one-hidden-layer network with a sigmoid output unit.

    The function follows the (features, targets) -> (predictions, loss,
    train_op) prototype noted by the original author.

    Args:
        dict_features: mapping from column name to tensor. It is indexed with
            every name in the module-level `numeric_headers` and
            `categorical_headers_ints`. The tensors are concatenated along
            axis 1, so they are presumably rank-2 and share one dtype --
            TODO confirm against the input function.
        targets: label tensor compared with the sigmoid output by the MSE loss
            (presumably 0/1 values -- TODO confirm).

    Returns:
        A tuple (predictions, loss, train_op): a dict whose 'incomes' entry is
        the boolean tensor `output > 0.5`, the mean-squared-error loss tensor,
        and the Adagrad training op.
    """
    #=======DECODE FEATURES================
    # collect the input tensors in a fixed order: numeric columns first, then
    # the integer-encoded (not one-hot) categorical columns
    features = [dict_features[col] for col in numeric_headers]
    features += [dict_features[col] for col in categorical_headers_ints]

    # combine all the features into one tensor
    features = tf.concat(values=features,axis=1)

    # =====SETUP ARCHITECTURE=====
    # one hidden layer of 50 units with relu activation
    hidden = layers.relu(features, num_outputs=50)
    # fully_connected defaults to activation_fn=tf.nn.relu. Left in place, the
    # relu clamps the value fed to the sigmoid at 0, which pins every
    # prediction to [0.5, 1); the output layer is therefore made linear.
    logits = layers.fully_connected(hidden, num_outputs=1, activation_fn=None)
    # squash the single output unit into (0, 1)
    output_layer = tf.sigmoid(logits)
    # reshape the output to be one dimensional
    predictions = tf.reshape(output_layer, [-1])

    # =====LOSS=======
    # mean squared error between the labels and the sigmoid output
    loss_mse = tf.losses.mean_squared_error(targets, predictions)

    # =====OPTIMIZER PARAMS========
    train_op = layers.optimize_loss(
        loss=loss_mse,
        global_step=tf.contrib.framework.get_global_step(),
        optimizer='Adagrad', # adaptive gradient, so that the learning rate is not SO important
        learning_rate=0.1)

    # format of the output of clf.predict: a boolean per row
    predictions_out = predictions>0.5

    # the 'incomes' key is kept unchanged in case callers look it up by name
    return {'incomes':predictions_out}, loss_mse, train_op

In [None]:
# build the feature columns of the wide-and-deep model
def setup_wide_deep_columns(numeric_boundaries=(-1.5, -0.5, 0.5, 1.5)):
    """Create the wide and deep feature columns for the combined classifier.

    Reads the module-level `categorical_headers`, `numeric_headers`,
    `encoders` and `target_classifier`.

    Args:
        numeric_boundaries: bucket edges used to discretise a numeric column
            before it takes part in a crossed column. The defaults are z-score
            edges, which assumes the numeric columns were standard-scaled.

    Returns:
        A tuple (wide_columns, deep_columns). `wide_columns` holds one sparse
        column per encoded categorical feature followed by the crossed
        columns; `deep_columns` holds one embedding per encoded categorical
        feature followed by one real-valued column per numeric feature.

    Raises:
        KeyError: if a name used in a cross is neither an encoded categorical
            feature nor a numeric feature.
    """
    boundaries = sorted(float(edge) for edge in numeric_boundaries)

    def _crossable_column(name):
        """Return a column for `name` of a kind that crossed_column accepts."""
        if name in encoders:
            return layers.sparse_column_with_keys(name, keys=encoders[name].classes_)
        if name in numeric_headers:
            # numeric columns have no encoder (and no classes_); they have to
            # be bucketized before they can be crossed
            return layers.bucketized_column(
                layers.real_valued_column(name), boundaries=boundaries)
        raise KeyError(
            'cross column %r is neither an encoded categorical feature '
            'nor a numeric feature' % (name,))

    wide_columns = []
    deep_columns = []
    # add each categorical column to both the wide and the deep features
    for col in categorical_headers:
        if col not in encoders:
            # only feature columns get a stored encoder; a categorical target
            # is encoded in place and is the label, not an input
            continue
        sparse_col = layers.sparse_column_with_keys(col, keys=encoders[col].classes_)
        wide_columns.append(sparse_col)

        # embedding size grows with log2 of the class count; forced to a plain
        # int of at least 1 so a one-class column cannot request dimension 0
        dim = max(1, int(round(np.log2(len(encoders[col].classes_)))))
        deep_columns.append(
            layers.embedding_column(sparse_col, dimension=dim)
        )

    # specific crossed columns; every name must match a dataset column exactly
    cross_columns = [
        ('Age', 'Gender', 'Folk', 'Country', 'Western'),
        ('Age', 'Gender', 'Classical music', 'Swing, Jazz', 'Opera'),
        ('Age', 'Gender', 'Musical', 'Pop'),
        ('Age', 'Gender', 'Rock', 'Metal or Hardrock', 'Rock n roll',
         'Hiphop, Rap', 'Reggae, Ska'),
        ('Age', 'Gender', 'Alternative'),
        ('Age', 'Gender', 'Latino'),
        ('Age', 'Gender', 'Techno, Trance'),
        ('Horror', 'Thriller'),
        ('Comedy', 'Romantic'),
        ('Sci-fi', 'Fantasy/Fairy tales', 'Animated'),
        ('War', 'Action'),
        ('History', 'Geography', 'Foreign languages'),
        ('Psychology', 'Biology', 'Medicine'),
        ('Mathematics', 'Physics', 'Science and technology'),
        # 'Voting' was listed here but is not among the selected columns
        ('Economy Management', 'Politics', 'Law'),
        ('Biology', 'Physics', 'Chemistry'),
        ('Reading', 'Writing'),
        ('Art exhibitions', 'Theatre', 'Musical instruments'),
        ('Countryside, outdoors', 'Dancing', 'Active sport', 'Adrenaline sports'),
        ('Passive sport', 'Gardening'),
        ('Dancing', 'Musical instruments'),
        ('Smoking', 'Alcohol'),
        ('Age', 'Height', 'Weight'),
        ('Village - town', 'House - block of flats'),
    ]
    for tup in cross_columns:
        wide_columns.append(
            layers.crossed_column([_crossable_column(name) for name in tup],
                                  hash_bucket_size=int(1e4)))

    # add the regular dense features
    for col in numeric_headers:
        if col == target_classifier:
            # never feed the label to the network as an input
            continue
        deep_columns.append(
            layers.real_valued_column(col)
        )

    return wide_columns, deep_columns

In [None]:
%%time

# NOTE(review): input_wrapper, output_wrapper and y_test are not defined in the
# cells visible here -- confirm they exist before this cell runs.
# NOTE(review): n_classes is left at the classifier's default -- confirm it
# matches the number of distinct target values.
wide_columns, deep_columns = setup_wide_deep_columns()
clf = learn.DNNLinearCombinedClassifier(
                        linear_feature_columns=wide_columns,
                        dnn_feature_columns=deep_columns,
                        dnn_hidden_units=[100, 50])


clf.fit(input_fn=input_wrapper, steps=2500)

# predict() hands back an iterable, so materialise it before scoring
yhat = list(clf.predict(input_fn=output_wrapper))
# confusion_matrix / accuracy_score are not imported by name in the visible
# import cells; they are reached through the `mt` alias of sklearn.metrics
print(mt.confusion_matrix(y_test,yhat),mt.accuracy_score(y_test,yhat))