In [158]:
import math
import time
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Layer
from tensorflow.keras import Model
from wordcloud import WordCloud

print(tf.__version__)

2.10.0


In [159]:
tf.config.list_physical_devices('GPU')

[]

In [182]:
tf.config.run_functions_eagerly(True)

In [160]:
data_news_headlines = pd.read_json("../shared_data/x1.json")

# Adjust news headline data
data_news_headlines = data_news_headlines.drop(columns='article_link', axis=1)
data_news_headlines = data_news_headlines.rename(columns ={'headline':'text', 'is_sarcastic':'label'})
data_news_headlines = data_news_headlines.reindex(columns=['text','label'])
data_news_headlines.head()

Unnamed: 0,text,label
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [161]:
data_tweets = pd.read_csv("../shared_data/dataset_csv.csv")

# Adjust tweets data
data_tweets = data_tweets.rename(columns={'tweets':'text'})
data_tweets.head()

Unnamed: 0,text,label
0,I love working midnights tweet,1
1,I hate when I buy a bag of air and there's chi...,1
2,my grandad always sounds so ill when i speak t...,0
3,"I realize I'm annoying to everyone, so I won't...",0
4,I love when I find these dudes on vine!! #Foll...,1


In [162]:
data_sitcoms = pd.read_csv("../shared_data/mustard++_text.csv")

# Adjust sitcom data
data_sitcoms = data_sitcoms.drop(columns=['SCENE','KEY','END_TIME','SPEAKER','SHOW','Sarcasm_Type','Implicit_Emotion','Explicit_Emotion','Valence','Arousal'], axis=1)
data_sitcoms = data_sitcoms.rename(columns={'SENTENCE':'text','Sarcasm':'label'})

# remove empty label rows
for index, row in data_sitcoms.iterrows():
    if math.isnan(row['label']):
        data_sitcoms = data_sitcoms.drop(index, axis='index')

data_sitcoms.head()

Unnamed: 0,text,label
5,"And of those few months, how long have you bee...",0.0
14,"Let the dead man talk. So, why do you think that?",0.0
18,"What else? Sell it on eBay as ""slightly used.""",0.0
24,"Good idea, sit with her. Hold her, comfort her...",1.0
31,"Well, now that I've given up string theory, I'...",0.0


In [163]:
data_reddit = pd.read_csv("../shared_data/train-balanced-sarcasm.csv")

# Adjust reddit data
data_reddit = data_reddit.drop(columns=['author','subreddit','score','ups','downs','date','created_utc','parent_comment'], axis=1)
data_reddit = data_reddit.rename(columns={'comment':'text'})
data_reddit = data_reddit.reindex(columns=['text','label'])

data_reddit.head()

Unnamed: 0,text,label
0,NC and NH.,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools.,0


In [164]:
# Combine all 4 datasets
data = pd.concat([data_news_headlines,data_tweets,data_sitcoms,data_reddit], ignore_index=True)

# remove non string (nan) rows
for index, row in data.iterrows():
    if not type(row['text']) == str:
        data = data.drop(index, axis='index')

# Shuffle the rows
data = data.sample(frac=1).reset_index(drop=True)

data.head()

Unnamed: 0,text,label
0,Install Windows,1.0
1,"Not sure how you guys all feel, but they shoul...",0.0
2,With a long niqab,1.0
3,Calm down sjw.,1.0
4,xd,0.0


In [165]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


In [166]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


Set the variables for the model and training/testing processes

In [241]:
subset_size = 400
training_size = int(subset_size * 0.2)
shuffle_size = subset_size - training_size

data_batch_size = 32
image_size = (64, 64)
word_cloud_font_path = '../shared_data/font/DroidSansMono.ttf'

EPOCHS = 5

Randomly shuffle the data and select the top subset size

In [242]:
data = data.sample(frac=1).reset_index(drop=True)
data = data.head(subset_size)

Create tensorflow dataset tensor objects and split the data between training and testing

In [243]:
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        data['text'][training_size:], 
        data['label'][training_size:]
    )
).shuffle(shuffle_size).batch(data_batch_size)

test_ds = tf.data.Dataset.from_tensor_slices(
    (
        data['text'][:training_size],
        data['label'][:training_size]
    )
).batch(data_batch_size)

Build a custom layer that takes a sentence text tensor and returns a wordcloud of that sentence as an image tensor

In [244]:
class Sentence2WordCloud(Layer):
    def __init__(self) -> None:
        super(Sentence2WordCloud, self).__init__()
        self.trainable = False
    
    def call(self, inputs):
        output = []
        for tensor in inputs:
            new_tensor = tf.convert_to_tensor(self.__sentence2wordcloud__(tensor), dtype_hint=tf.float32)
            output.append(new_tensor)
        return tf.convert_to_tensor(output)
    
    def __sentence2wordcloud__(self, tensor):
        frequencies = self.__freqcount__(tensor)
        cloud = WordCloud(width=image_size[0], height=image_size[1], stopwords=[''], min_word_length=1, repeat=True, normalize_plurals=False, include_numbers=True, font_path=word_cloud_font_path, min_font_size=1)
        image = cloud.generate_from_frequencies(frequencies)
        return image.to_array()
    
    def __freqcount__(self, tensor):
        words = tf.get_static_value(tensor).decode().split()
        freq_count = [words.count(k) for k in words]
        return dict(zip(words, freq_count))

In [245]:
""""
test model for troubleshooting layer

class MyModel(Model):
    def __init__(self) -> None:
        super(MyModel, self).__init__()
        self.l1 = Sentence2WordCloud()
    
    def call(self, x):
        return self.l1(x)

my_model = MyModel()
for sentences, labels in test_ds:
    print(my_model(sentences))
"""

'"\ntest model for troubleshooting layer\n\nclass MyModel(Model):\n    def __init__(self) -> None:\n        super(MyModel, self).__init__()\n        self.l1 = Sentence2WordCloud()\n    \n    def call(self, x):\n        return self.l1(x)\n\nmy_model = MyModel()\nfor sentences, labels in test_ds:\n    print(my_model(sentences))\n'

Build a custom convolution neural network model

In [246]:
class CloudyCNN(Model):
    def __init__(self) -> None:
        super(CloudyCNN, self).__init__()
        self.sentence2wordcloud = Sentence2WordCloud()
        self.conv1 = Conv2D(164, 3, activation='relu')
        self.flatten = Flatten()
        self.d1 = Dense(512, activation='relu')
        self.d2 = Dense(100, activation='relu')
        self.d3 = Dense(10, activation='relu')
        self.d4 = Dense(1, activation='sigmoid')

    def call(self, x):
        x = self.sentence2wordcloud(x)
        x = self.conv1(x)
        x = self.flatten(x)
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        return self.d4(x)

model = CloudyCNN()

Select metrics to measure the loss and the accuracy of the model. These metrics accumulate the values over epochs and then print the overall result.

In [247]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

Select metrics to measure the loss and the accuracy of the model. These metrics accumulate the values over epochs and then print the overall result.

In [255]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.BinaryAccuracy(name='test_accuracy')

Setup the training function: using `tf.GradientTape `to train the model

In [259]:
@tf.function
def train_step(sentences, labels):
    with tf.GradientTape() as tape:
        # training=True is only needed if there are layers with different
        # behavior during training versus inference (e.g. Dropout).
        predictions = model(sentences, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)

Setup the testing function

In [260]:
@tf.function
def test_step(sentences, labels):
    # training=False is only needed if there are layers with different
    # behavior during training versus inference (e.g. Dropout).
    predictions = model(sentences, training=False)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

Train and Test the CloudyCNN Model

In [261]:
for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()

    print(f'Epoch {epoch + 1}:', end=" ")
    start_time = time.time()

    for sentences, labels in train_ds:
        train_step(sentences, labels)
    
    train_time = time.time()

    print(
        f'Loss: {train_loss.result()}, '
        f'Accuracy: {train_accuracy.result()}, '
        f'Train Time: {round(train_time - start_time)}s',
        end=" "
    )

    for test_sentences, test_labels in test_ds:
        test_step(test_sentences, test_labels)

    test_time = time.time()

    print(
        f'Test Loss: {test_loss.result()}, '
        f'Test Accuracy: {test_accuracy.result()}, '
        f'Test Time: {round(test_time - train_time)}s, '
        f'Epoch Time: {round(test_time - start_time)}s'
    )    

    

Epoch 1: Loss: 0.6931471824645996, Accuracy: 0.4781250059604645, Train Time: 54s Test Loss: 0.6931471824645996, Test Accuracy: 0.512499988079071, Test Time: 11s, Epoch Time: 64s
Epoch 2: Loss: 0.6931471824645996, Accuracy: 0.4781250059604645, Train Time: 60s Test Loss: 0.6931471824645996, Test Accuracy: 0.512499988079071, Test Time: 11s, Epoch Time: 71s
Epoch 3: 

KeyboardInterrupt: 

In [None]:
new_sentences = ["Now we know why some animals eat their own children.", "game of thrones season finale showing this sunday night","Please, keep talking. I always yawn when I am interested."]
new_dataset = tf.data.Dataset.from_tensor_slices(new_sentences)
for sentence in new_dataset:
    print(round(model.predict(sentence)))

AttributeError: in user code:

    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_fileeyr78_pg.py", line 10, in tf__call
        x = ag__.converted_call(ag__.ld(self).sentence2wordcloud, (ag__.ld(x),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 24, in tf__call
        ag__.for_stmt(ag__.ld(inputs), None, loop_body, get_state, set_state, (), {'iterate_names': 'tensor'})
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 20, in loop_body
        new_tensor = ag__.converted_call(ag__.ld(tf).convert_to_tensor, (ag__.converted_call(ag__.ld(self).__sentence2wordcloud__, (ag__.ld(tensor),), None, fscope),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file3gk6jg83.py", line 10, in tf____sentence2wordcloud__
        frequencies = ag__.converted_call(ag__.ld(self).__freqcount__, (ag__.ld(tensor),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file0l_znj3v.py", line 10, in tf____freqcount__
        words = ag__.converted_call(ag__.converted_call(ag__.converted_call(ag__.ld(tensor).numpy, (), None, fscope).decode, (), None, fscope).split, (), None, fscope)

    AttributeError: Exception encountered when calling layer "cloudy_cnn_5" "                 f"(type CloudyCNN).
    
    in user code:
    
        File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\442203258.py", line 13, in call  *
            x = self.sentence2wordcloud(x)
        File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 24, in tf__call
            ag__.for_stmt(ag__.ld(inputs), None, loop_body, get_state, set_state, (), {'iterate_names': 'tensor'})
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 20, in loop_body
            new_tensor = ag__.converted_call(ag__.ld(tf).convert_to_tensor, (ag__.converted_call(ag__.ld(self).__sentence2wordcloud__, (ag__.ld(tensor),), None, fscope),), None, fscope)
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file3gk6jg83.py", line 10, in tf____sentence2wordcloud__
            frequencies = ag__.converted_call(ag__.ld(self).__freqcount__, (ag__.ld(tensor),), None, fscope)
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file0l_znj3v.py", line 10, in tf____freqcount__
            words = ag__.converted_call(ag__.converted_call(ag__.converted_call(ag__.ld(tensor).numpy, (), None, fscope).decode, (), None, fscope).split, (), None, fscope)
    
        AttributeError: Exception encountered when calling layer "sentence2_word_cloud_16878" "                 f"(type Sentence2WordCloud).
        
        in user code:
        
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\977086180.py", line 9, in call  *
                new_tensor = tf.convert_to_tensor(self.__sentence2wordcloud__(tensor))
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\2642161880.py", line 14, in __sentence2wordcloud__  *
                frequencies = self.__freqcount__(tensor)
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\977086180.py", line 20, in __freqcount__  *
                words = tensor.numpy().decode().split()
        
            AttributeError: 'Tensor' object has no attribute 'numpy'
        
        
        Call arguments received by layer "sentence2_word_cloud_16878" "                 f"(type Sentence2WordCloud):
          • inputs=tf.Tensor(shape=(None,), dtype=string)
    
    
    Call arguments received by layer "cloudy_cnn_5" "                 f"(type CloudyCNN):
      • x=tf.Tensor(shape=(None,), dtype=string)


In [None]:
model.save('./model_saves/cnn/')