# HW05: Deep Learning

Remember that these homework work as a completion grade. **You can skip one section without losing credit.**

In [1]:
#Import the AG news dataset (same as hw01)
#Download them from here 
# !wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

import pandas as pd
import nltk
df = pd.read_csv('train.csv')

df.columns = ["label", "title", "lead"]
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
	return label_map[x]
df["label"] = df["label"].apply(replace_label) 
df["text"] = df["title"] + " " + df["lead"]
df = df.sample(n=10000) # # only use 10K datapoints
df.head()

Unnamed: 0,label,title,lead,text
23445,business,Qualcomm to pay \$170 million to acquire Iridigm,Qualcomm said yesterday that it would pay \$17...,Qualcomm to pay \$170 million to acquire Iridi...
85939,business,Anthem merger could be back on fast track,California insurance commissioner expected to ...,Anthem merger could be back on fast track Cali...
73944,business,World oil trade to double in next 25 years,LONDON - World trade in oil will double over t...,World oil trade to double in next 25 years LON...
41626,sport,Bengals Struggling to Stop Opposing Backs (AP),"AP - Jamal Lewis, 180 yards. Marshall Faulk, 1...",Bengals Struggling to Stop Opposing Backs (AP)...
45606,business,General Mills Cereals Going Whole Grain,NEW YORK (Reuters) - General Mills Inc. on Th...,General Mills Cereals Going Whole Grain NEW Y...


In [2]:
##TODO create a new variable "business" that takes value 1 if the label is business and 0 otherwise
df['business'] = df['label'] == 'business'
df.head()

Unnamed: 0,label,title,lead,text,business
23445,business,Qualcomm to pay \$170 million to acquire Iridigm,Qualcomm said yesterday that it would pay \$17...,Qualcomm to pay \$170 million to acquire Iridi...,True
85939,business,Anthem merger could be back on fast track,California insurance commissioner expected to ...,Anthem merger could be back on fast track Cali...,True
73944,business,World oil trade to double in next 25 years,LONDON - World trade in oil will double over t...,World oil trade to double in next 25 years LON...,True
41626,sport,Bengals Struggling to Stop Opposing Backs (AP),"AP - Jamal Lewis, 180 yards. Marshall Faulk, 1...",Bengals Struggling to Stop Opposing Backs (AP)...,False
45606,business,General Mills Cereals Going Whole Grain,NEW YORK (Reuters) - General Mills Inc. on Th...,General Mills Cereals Going Whole Grain NEW Y...,True


In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')
from sklearn.feature_extraction.text import CountVectorizer

##TODO pre-process text as you did in HW02
##TODO vectorize the pre-processed text using CountVectorizer
##Alternatively, use the output from HW02 if you saved it

In [4]:
dfs = df.sample(50)

def tokenize(x):
    return [w.lemma_.lower() for w in nlp(x) if not w.is_stop and not w.is_punct and not w.is_digit]
dfs["tokens"] = dfs["text"].apply(lambda x: tokenize(x))
dfs["preprocessed"] = dfs['tokens'].apply(lambda x: ' '.join(x))

vec = CountVectorizer(min_df=0.01, # at min 1% of docs
                        max_df=.9,
                        max_features=1000,
                        stop_words='english',
                        ngram_range=(1,3))
X = vec.fit_transform(dfs['preprocessed']).toarray()

## MLP

Your goal here is to use features from the Vectorized text to predict whether the snippet is from a business article.

In [5]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

## TODO build a MLP model with at least 2 hidden layers with ReLU activation, followed by dropout and an output layer with sigmoid activation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, dfs['business'], test_size=0.33, random_state=42)

In [6]:
model = Sequential()
model.add(Dense(32,activation='relu'))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
## TODO compile the model
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
## TODO fit the model using early stopping to predict the business label
early_stopping = EarlyStopping(patience = 5,monitor='val_accuracy')
model.fit(X_train, y_train, callbacks=[early_stopping], epochs = 1000,validation_data=(X_test,y_test))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000


<tensorflow.python.keras.callbacks.History at 0x7fdf193c91d0>

## Autoencoders

In [7]:
print(X.shape)
print(X_train.shape)

(50, 1000)
(33, 1000)


In [8]:
from keras import backend as K

def r2(y_true, y_pred):
    SS_res =  K.sum(K.square( y_true-y_pred )) 
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) ) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [27]:
##TODO build a simple autoencoder with two compression layers and two econstruction layers using ReLu
model = Sequential()
model.add(Dense(32,activation='relu'))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16,activation='relu'))
model.add(Dense(1000,activation='relu'))

In [28]:
##TODO compile and fit the model minimizing "mean_squared_error"
model.compile(loss="mean_squared_error", optimizer='Adam', metrics=[r2])

In [29]:
##report r_squared during training (the function r2 defined above)
early_stopping = EarlyStopping(patience = 5,monitor='val_r2',mode='min')
model.fit(X_train, X_train, callbacks=[early_stopping], epochs = 1000,validation_data=(X_test,X_test))

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000


<tensorflow.python.keras.callbacks.History at 0x7fdf049161d0>

In [12]:
import keras
from keras.losses import MeanSquaredError
##TODO compress the vectorized text (X.todense())

## Embeddings

In [13]:
from keras.preprocessing.text import text_to_word_sequence

##TODO tokenize the text using text_to_word_sequence
dfs['tokenized'] = dfs['text'].apply(lambda x: text_to_word_sequence(x))

In [14]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

length_vocab = 1000
max_seq_length = 100

#TODO create a one_hot representation for each word and truncate/pad the sequences such that they are all of the same length
dfs['one_hot'] = dfs['text'].apply(lambda x: pad_sequences([one_hot(x, n=length_vocab)],maxlen=100)[0])

In [31]:
from keras.layers import Embedding
model = Sequential()
model.add(Embedding(1000,64, input_length=max_seq_length))
##TODO create a sequential model with just one embedding layer and show the model summary
model.compile()

In [32]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 64)           64000     
Total params: 64,000
Trainable params: 64,000
Non-trainable params: 0
_________________________________________________________________


## LSTM

In [33]:
from keras.layers import LSTM
model.add(LSTM(32))
model.add(Dense(32,activation='relu'))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
##TODO create a sequential model with an embedding layer, a LSTM layer and two hidden layers with ReLu activation function, followed by dropout
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [34]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 64)           64000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense_18 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_19 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_5 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 17        
Total params: 78,017
Trainable params: 78,017
Non-trainable params: 0
__________________________________________________

In [18]:
##TODO compile the model and fit it to predict the business label

early_stopping = EarlyStopping(patience = 5,monitor='accuracy', mode='max')
model.fit(dfs['one_hot'], dfs['business'], callbacks=[early_stopping], epochs = 1000)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [19]:
print(len(dfs['one_hot'].to_list()))

50


In [20]:
a = dfs['one_hot'].to_numpy()
print(a.shape)

(50,)


In [21]:
import numpy as np


model.fit( np.vstack(dfs['one_hot'].to_numpy()), dfs['business'].astype(int), callbacks=[early_stopping], epochs = 1000)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000


<tensorflow.python.keras.callbacks.History at 0x7fdf1af4a400>