# News Sarcasm Prediction

- RNN Modeling
- LSTM Modeling
- GPU Experiments

# Primary Steps

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# List the current working directory to see which files are available.
! ls

sample_data


## Load Data

- Drive Mount (Loading from drive)

In [13]:
# Colab-specific: mount Google Drive at /content/drive so the dataset CSV
# stored there can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
# Confirm the mount: a `drive` entry should now be listed.
! ls

drive  sample_data


In [15]:
# Read the sarcasm dataset from the mounted Drive; the CSV's 'Unnamed: 0'
# column is used as the DataFrame index.
data = pd.read_csv(
    'drive/MyDrive/Dataset/sarcasm.csv',
    index_col='Unnamed: 0',
)
data.head()  # preview the first rows

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


## Basic Information about Dataset

In [16]:
# Summarise column dtypes and non-null counts (quick check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 894.3+ KB


In [17]:
# Column labels available for selecting the input text and the target.
data.columns

Index(['is_sarcastic', 'headline', 'article_link'], dtype='object')

## Data Splitting

- Separation
- Splitting

### Separation - A

In [18]:
# Separate the model input (headline text) from the target column
# (`is_sarcastic`) as two Series.
headline = data['headline']
labels = data['is_sarcastic']

### Splitting - B

In [19]:
# Hold out 30% of the headlines for evaluation; random_state pins the shuffle
# so the split is reproducible across runs.
# NOTE: the unused `rand` import from the deprecated private module
# `scipy.sparse.construct` was dropped; nothing in this notebook used it.
from sklearn.model_selection import train_test_split

train_headline, test_headline, train_labels, test_labels = train_test_split(
    headline, labels, test_size=0.3, random_state=42
)

# Secondary Steps

## Step 1 : Tokenization

- Initialize Tokenizer
- Fit the Data
- Verify x or train_data
- Generate Sequence
- Verify Sequence or Word

### Initialize Tokenizer - i

In [20]:
# Word-level tokenizer: num_words caps how many of the most frequent words are
# used when encoding text, and any word not in the fitted vocabulary is mapped
# to the '<OOV>' token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token='<OOV>',
    num_words=50000,
)

### Fit the Data - ii

In [21]:
# Learn the vocabulary from the training headlines only, so that no test-set
# words leak into the word index.
tokenizer.fit_on_texts(train_headline)

# Preview a handful of (word, index) pairs instead of dumping the whole
# vocabulary into the cell output.
list(tokenizer.word_index.items())[:10]



### Verify x / trained_data conversion - iii

In [22]:
# Inspect the raw training headlines; the left-hand labels are the original
# row indices carried over from `data` by the split.
train_headline

12170    american express to offer 5 months of paternit...
28552    watch: dolphin knocks stand-up paddleboarder o...
6883             man who enjoys thing informed he is wrong
28387    jonathan lipnicki to star as young 'dark helme...
12932    publicist worried kanye west's support of trum...
                               ...                        
21575    turnout lower than expected for gala central a...
5390     retreating clinton campaign torches iowa town ...
860      national weather service to give hurricanes fu...
15795            christ returns for some of his old things
23654    loophole in curse lets archaeologist off the hook
Name: headline, Length: 20033, dtype: object

### Generate Sequence - iv

In [23]:
# Encode each training headline as a list of integer word indices using the
# fitted tokenizer.
train_sequence = tokenizer.texts_to_sequences(train_headline)
train_sequence[:3]  # first three encoded headlines

[[109, 3798, 2, 1409, 84, 513, 3, 7029, 9, 7030, 592],
 [115, 3799, 5733, 787, 24, 7031, 61, 33, 1036],
 [14, 36, 2491, 240, 4906, 30, 11, 514]]

### Verify Sequence & Word - v

In [24]:
train_headline[:2]  # Sequence verification: compare these two headlines with the first two encoded sequences

12170    american express to offer 5 months of paternit...
28552    watch: dolphin knocks stand-up paddleboarder o...
Name: headline, dtype: object

In [25]:
tokenizer.word_index['the']  # Word verification: look up the integer index assigned to 'the'

4

## Step 2: Padding

- Train padded sequence
- Test padded sequence

### Train Padded Sequence - i

In [26]:
# Bring every encoded training headline to exactly 50 tokens: shorter ones get
# zeros appended at the end, longer ones are cut at the end.
train_padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequence,
    maxlen=50,
    padding='post',
    truncating='post',
)
train_padded_sequence

array([[  109,  3798,     2, ...,     0,     0,     0],
       [  115,  3799,  5733, ...,     0,     0,     0],
       [   14,    36,  2491, ...,     0,     0,     0],
       ...,
       [  245,  1963,   518, ...,     0,     0,     0],
       [ 1953,   846,     6, ...,     0,     0,     0],
       [11674,     5,  4929, ...,     0,     0,     0]], dtype=int32)

### Test Padded Sequence - ii

In [27]:
# The test headlines go through the same two steps as the training data:
# encode to integer sequences with the already-fitted tokenizer, then pad or
# truncate to 50 tokens with identical settings.
test_sequence = tokenizer.texts_to_sequences(test_headline)
test_padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
    test_sequence,
    maxlen=50,
    padding='post',
    truncating='post',
)
test_padded_sequence

array([[  545,  3131,     2, ...,     0,     0,     0],
       [ 1442,  1763,  3587, ...,     0,     0,     0],
       [   41,   583,     3, ...,     0,     0,     0],
       ...,
       [ 1345,  3290,  1368, ...,     0,     0,     0],
       [11279,     9,  3723, ...,     0,     0,     0],
       [ 3998,  3360,  1498, ...,     0,     0,     0]], dtype=int32)

## Step 3 : Word Embedding

In [28]:
# Trainable lookup table mapping each of up to 50,000 token ids to a
# 228-dimensional vector; input_length matches the padded sequence length (50).
embedding_layer = tf.keras.layers.Embedding(
    input_dim=50000,
    output_dim=228,
    input_length=50,
)

## Step 4 : Modeling

- Model Building
- Model Compile
- Fit the model

### Model Building - i

In [29]:
# Simple RNN classifier: embedding -> SimpleRNN -> dense head -> sigmoid.
# return_sequences is left at its default (False) so the RNN passes only its
# final output to the Dense layers and the model emits one probability per
# headline. With return_sequences=True the Dense head was applied to every
# timestep, producing a (batch, 50, 1) output, i.e. one prediction per token
# position (padding positions included) instead of one per headline.
rnn_model = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.SimpleRNN(228, activation='leaky_relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

### Model Compilation - ii

In [30]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
rnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Fit the Model - iii

In [31]:
# Train for 4 epochs; the held-out split is passed as validation data so that
# val_loss / val_accuracy are reported after every epoch.
rnn_model.fit(
    x=train_padded_sequence,
    y=train_labels,
    validation_data=(test_padded_sequence, test_labels),
    epochs=4,
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f2e3c184e80>

# LSTM Modeling (Alternative to the Simple RNN)

## Model Building LSTM - i

In [32]:
# Bidirectional LSTM classifier with its own embedding layer (not shared with
# the RNN model), followed by the same dense head and sigmoid output.
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Embedding(50000, 228, input_length=50),   # same vocabulary size, vector width and input length as embedding_layer
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, activation='tanh')  # tanh is the LSTM default activation; return_sequences defaults to False, so only the final output reaches the Dense layers
    ),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

## Model Compilation - ii

In [33]:
# Same training configuration as the RNN model: binary cross-entropy loss,
# Adam optimizer, accuracy metric.
lstm_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Fit the Model - iii

In [34]:
# Train the bidirectional LSTM for 4 epochs, validating on the held-out split
# after each epoch.
lstm_model.fit(
    x=train_padded_sequence,
    y=train_labels,
    validation_data=(test_padded_sequence, test_labels),
    epochs=4,
    verbose=1,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f2e3a8551f0>

# GPU Training

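- A machine can expose several devices; TensorFlow addresses them by index, e.g. `/device:GPU:0`, `/device:GPU:1`, `/device:CPU:0`. The cell below targets the first GPU.
- dropout: 0.5 is a commonly used starting rate; the best value depends on the model and the data.
- dropout: during training, each input unit is randomly set to zero with probability equal to the rate (here 0.5), regardless of its value; nothing is dropped at prediction time.
- dropout: sometimes dropout can lower performance; in that case, try `recurrent_dropout` (dropout on the recurrent connections) instead.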

In [40]:
# Build, compile and train the dropout-regularised LSTM inside a single device
# scope so that the whole workflow is requested on the first GPU. Previously
# only the model construction was inside the `with` block; compile() and fit()
# sat outside it at column 0.
with tf.device('/device:GPU:0'):
    lstm_model2 = tf.keras.Sequential([
        tf.keras.layers.Embedding(50000, 228, input_length=50),
        tf.keras.layers.Bidirectional(
            # dropout=0.5 randomly zeroes half of the LSTM's input units at
            # each training step to help control overfitting.
            tf.keras.layers.LSTM(128, activation='tanh', dropout=0.5)
        ),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    lstm_model2.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    # 10 epochs, validating on the held-out split after each one; the per-epoch
    # metrics remain available afterwards via lstm_model2.history.history.
    lstm_model2.fit(
        train_padded_sequence, train_labels,
        epochs=10,
        verbose=1,
        validation_data=(test_padded_sequence, test_labels),
    )


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f2e5e4cac40>

# Result History

In [43]:
# Per-epoch loss/accuracy for training and validation, as recorded by the
# most recent fit() call on lstm_model2.
lstm_model2.history.history

{'loss': [0.3914262354373932,
  0.16849760711193085,
  0.06733658909797668,
  0.027946265414357185,
  0.016444070264697075,
  0.010162525810301304,
  0.006530864629894495,
  0.006903843954205513,
  0.008079794235527515,
  0.00542078772559762],
 'accuracy': [0.8138071894645691,
  0.9351070523262024,
  0.9776368737220764,
  0.9904657602310181,
  0.9944591522216797,
  0.9967054128646851,
  0.9979533553123474,
  0.998103141784668,
  0.9970049262046814,
  0.9982029795646667],
 'val_loss': [0.3172915577888489,
  0.3277316093444824,
  0.4646774232387543,
  0.6264989376068115,
  0.7104353308677673,
  0.6917237639427185,
  0.796895444393158,
  0.9054146409034729,
  0.835746705532074,
  0.988009512424469],
 'val_accuracy': [0.8603540658950806,
  0.8644304871559143,
  0.854414165019989,
  0.8453295826911926,
  0.8457954525947571,
  0.8476589918136597,
  0.8427672982215881,
  0.8365944623947144,
  0.8400884866714478,
  0.8406708836555481]}

In [44]:
# The same metrics as a table (one row per epoch), which makes it easier to
# compare training and validation values side by side.
pd.DataFrame(lstm_model2.history.history)

Unnamed: 0,loss,accuracy,val_loss,val_accuracy
0,0.391426,0.813807,0.317292,0.860354
1,0.168498,0.935107,0.327732,0.86443
2,0.067337,0.977637,0.464677,0.854414
3,0.027946,0.990466,0.626499,0.84533
4,0.016444,0.994459,0.710435,0.845795
5,0.010163,0.996705,0.691724,0.847659
6,0.006531,0.997953,0.796895,0.842767
7,0.006904,0.998103,0.905415,0.836594
8,0.00808,0.997005,0.835747,0.840088
9,0.005421,0.998203,0.98801,0.840671


# Prediction / Creating Function for Prediction

In [45]:
def predict(data: list, lstm_model2):
    """Return the model's sarcasm score for each headline in ``data``.

    The headlines are encoded with the notebook-level ``tokenizer`` and then
    padded/truncated with the same settings used for the training data
    (maxlen=50, post-padding, post-truncation) before being passed to the
    model.

    Args:
        data: Headlines to score. A single string is treated as one headline.
        lstm_model2: Trained model exposing a ``predict`` method.

    Returns:
        Whatever ``lstm_model2.predict`` returns for the padded batch.
    """
    if isinstance(data, str):
        # A bare string would otherwise be iterated character by character.
        data = [data]

    # Encode the raw input text (not the pre-computed test sequences).
    sequence = tokenizer.texts_to_sequences(data)
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
        sequence,
        maxlen=50,           # must match the length used for training
        padding='post',
        truncating='post',   # was 'pre'; training truncates from the end
    )
    return lstm_model2.predict(padded_sequence)

## Starts Prediction

In [46]:
headline # show all headlines; pick one from this output to pass to predict()

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [69]:
headline[5] # look up a single headline by index; copy the displayed text into a predict() call

'my white inheritance'

## Prediction Process

In [55]:
# Single-headline prediction: the headline is still wrapped in a list.
predict(
    ['inclement weather prevents liar from getting to work'],
    lstm_model2,
)



array([[0.99999976]], dtype=float32)

In [68]:
# Multiple-headline prediction: one score is returned per headline, in the
# same order as the input list.
predict(
    [
        'thirtysomething scientists unveil doomsday clock of hair loss',
        'dem rep. totally nails why congress is falling short on gender, racial equality',
        'my white inheritance',
    ],
    lstm_model2,
)



array([[1.0000000e+00],
       [1.9464697e-08],
       [3.4621348e-06]], dtype=float32)