# Deep Learning Model

In [29]:
#if necessary import the model data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Modeling Imports - GridSearch
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import TruncatedSVD 
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

#CNN Model
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, DenseFeatures
from keras.layers import Conv1D, MaxPooling2D, MaxPooling1D
from keras import regularizers, optimizers
from keras.models import Sequential
from tensorflow import feature_column
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import keras

#validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [30]:
#A summary of missing variables represented as a percentage of the total missing content. 
def null_summary(df, print_log=False, sort='ascending'):
    s = df.isnull().sum()*100/df.isnull().count()
    
    if sort.lower() == 'ascending':
        s = s.sort_values(ascending=True)
    elif sort.lower() == 'descending':
        s = s.sort_values(ascending=False)  
    if print_log: 
        print('Percentage of null values: \n', s)
  
    return pd.Series(s)

In [31]:
#get the data
talkdata = pd.read_csv('/content/drive/MyDrive/Thinkful/Data/talkdata.csv')
#Quick look at the head.
talkdata.head()

Unnamed: 0,timestamp,longitude,latitude,coordinates,is_active,phone_brand,device_model,category,gender,age,group
0,2016-05-01 14:23:37,0.0,0.0,"(0.0, 0.0)",1.0,小米,MI 2,1 free,M,35,M32-38
1,2016-05-01 14:23:37,0.0,0.0,"(0.0, 0.0)",1.0,小米,MI 2,Cozy 1,M,35,M32-38
2,2016-05-01 14:23:37,0.0,0.0,"(0.0, 0.0)",1.0,小米,MI 2,Industry tag,M,35,M32-38
3,2016-05-01 14:23:37,0.0,0.0,"(0.0, 0.0)",1.0,小米,MI 2,Property Industry 2.0,M,35,M32-38
4,2016-05-01 14:23:37,0.0,0.0,"(0.0, 0.0)",1.0,小米,MI 2,music,M,35,M32-38


In [32]:
#The size and shape of this dataset before categorical changing.
talkdata.shape

(7832282, 11)

In [33]:
#Set the timestamp column to a datetime datatype.[Shame on me fo ridding myself of hte epoch timesetamp before I started the Deep learning]
talkdata['timestamp'] = pd.to_datetime(talkdata.timestamp)
talkdata.timestamp.dtype

#create the categorical time columns
talkdata['hour'] = talkdata.timestamp.dt.hour
talkdata['weekday'] = talkdata.timestamp.dt.dayofweek
#get the day of the week as an integer. 
talkdata['week'] = talkdata.timestamp.dt.isocalendar().week.astype(int)

#concatenate the phone brand to the device model. 
talkdata['device_type'] = talkdata.phone_brand + " "+ talkdata.device_model

In [34]:
null_summary(talkdata)

timestamp       0.0
longitude       0.0
latitude        0.0
coordinates     0.0
is_active       0.0
phone_brand     0.0
device_model    0.0
category        0.0
gender          0.0
age             0.0
group           0.0
hour            0.0
weekday         0.0
week            0.0
device_type     0.0
dtype: float64

In [35]:
#drop the non-categorical columnsand the columns that are no longer in use.
df = talkdata.drop(columns=['timestamp', 'longitude', 'latitude', 'phone_brand', 'device_model', 'age', 'group'])
df.shape

(7832282, 8)

# Create the labels
Export the labels to a directory. 


In [36]:
#I have a lot of data. Work with a much smaller subset for training. 
data, die = train_test_split(df, test_size=0.80)
y = pd.get_dummies(data.gender, drop_first=True)
X = data.drop(columns=['gender'])

In [37]:
#The test set will be 20% of the training data. 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y)
print(X_train.shape, X_test.shape)

(783228, 7) (783228, 7)


In [38]:
enc = OneHotEncoder(handle_unknown='ignore')
X_enc = enc.fit_transform(X_train)
X_enc_test = enc.transform(X_test)

In [39]:
skb = SelectKBest(chi2, k=100)
X_skb = skb.fit_transform(X_enc, y_train)
X_skb_test = skb.transform(X_enc_test)

In [40]:
X_dense = X_skb.todense()
X_dense_test = X_skb_test.todense()

In honor of this [life-saving explanation](http://www.jussihuotari.com/2017/12/20/spell-out-convolution-1d-in-cnns/#:~:text=1D%20Convolution%20in%20Numpy&text=The%201D%20convolution%20slides%20a,having%20one%20channel%20(feature). on convolutional models, I have opted to keep the names toyX and toyY for this assignment. 

In [41]:
X_dense.shape

(783228, 100)

In [42]:
from keras import backend as K
from keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import Conv2D
K.clear_session()
toyX = np.array(X_dense).reshape(1, X_dense.shape[0], X_dense.shape[1])
testX = np.array(X_dense_test).reshape(1, X_dense_test.shape[0], X_dense_test.shape[1])
toyY = np.array(y_train).reshape(1, X_dense.shape[0], 1)
testY = np.array(y_test).reshape(1, X_dense_test.shape[0], 1)
 
input_shape = toyX.shape[1], toyX.shape[2]


In [43]:
model = Sequential()

model.add(Conv1D(184, 1, padding='same',
                  input_shape=input_shape))

model.add(Dense(128, activation='relu'))

model.add(Dense(128, activation='relu'))

model.add(Dropout(.25))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(toyX, toyY, epochs=10, validation_data=(testX, testY))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efb734a2320>

In [44]:
pred=model.predict(testX)
predicted_class_indices=(pred > 0.5).astype("int32") #binary class identification.

In [45]:
print(classification_report(testY.reshape(-1), predicted_class_indices.reshape(-1), zero_division=1))

              precision    recall  f1-score   support

           0       1.00      0.00      0.00    226491
           1       0.71      1.00      0.83    556737

    accuracy                           0.71    783228
   macro avg       0.85      0.50      0.42    783228
weighted avg       0.79      0.71      0.59    783228

