In [2]:
import math
import time
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv2D, Layer
from tensorflow.keras import Model
from wordcloud import WordCloud

print(tf.__version__)

2.10.0


In [3]:
tf.config.list_physical_devices('GPU')

[]

In [4]:
tf.config.run_functions_eagerly(True)

In [5]:
data_news_headlines = pd.read_json("../shared_data/x1.json")

# Adjust news headline data
data_news_headlines = data_news_headlines.drop(columns='article_link', axis=1)
data_news_headlines = data_news_headlines.rename(columns ={'headline':'text', 'is_sarcastic':'label'})
data_news_headlines = data_news_headlines.reindex(columns=['text','label'])
data_news_headlines.head()

Unnamed: 0,text,label
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [6]:
data_tweets = pd.read_csv("../shared_data/dataset_csv.csv")

# Adjust tweets data
data_tweets = data_tweets.rename(columns={'tweets':'text'})
data_tweets.head()

Unnamed: 0,text,label
0,I love working midnights tweet,1
1,I hate when I buy a bag of air and there's chi...,1
2,my grandad always sounds so ill when i speak t...,0
3,"I realize I'm annoying to everyone, so I won't...",0
4,I love when I find these dudes on vine!! #Foll...,1


In [7]:
data_sitcoms = pd.read_csv("../shared_data/mustard++_text.csv")

# Adjust sitcom data
data_sitcoms = data_sitcoms.drop(columns=['SCENE','KEY','END_TIME','SPEAKER','SHOW','Sarcasm_Type','Implicit_Emotion','Explicit_Emotion','Valence','Arousal'], axis=1)
data_sitcoms = data_sitcoms.rename(columns={'SENTENCE':'text','Sarcasm':'label'})

# remove empty label rows
for index, row in data_sitcoms.iterrows():
    if math.isnan(row['label']):
        data_sitcoms = data_sitcoms.drop(index, axis='index')

data_sitcoms.head()

Unnamed: 0,text,label
5,"And of those few months, how long have you bee...",0.0
14,"Let the dead man talk. So, why do you think that?",0.0
18,"What else? Sell it on eBay as ""slightly used.""",0.0
24,"Good idea, sit with her. Hold her, comfort her...",1.0
31,"Well, now that I've given up string theory, I'...",0.0


In [8]:
data_reddit = pd.read_csv("../shared_data/train-balanced-sarcasm.csv")

# Adjust reddit data
data_reddit = data_reddit.drop(columns=['author','subreddit','score','ups','downs','date','created_utc','parent_comment'], axis=1)
data_reddit = data_reddit.rename(columns={'comment':'text'})
data_reddit = data_reddit.reindex(columns=['text','label'])

data_reddit.head()

Unnamed: 0,text,label
0,NC and NH.,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools.,0


In [9]:
# Combine all 4 datasets
data = pd.concat([data_news_headlines,data_tweets,data_sitcoms,data_reddit], ignore_index=True)

# remove non string (nan) rows
for index, row in data.iterrows():
    if not type(row['text']) == str:
        data = data.drop(index, axis='index')

# Shuffle the rows
data = data.sample(frac=1).reset_index(drop=True)

data.head()

Unnamed: 0,text,label
0,He was trying to help those farmers find the g...,1.0
1,you normal get to see in the grave before its ...,0.0
2,But how am I supposed to feel like the Raptors...,1.0
3,A,0.0
4,I would be too Stanley,0.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


Set the variables for the model and training/testing processes

In [None]:
subset_size = 1400
training_size = int(subset_size * 0.2)
shuffle_size = subset_size - training_size

data_batch_size = 32
image_size = (64, 64)
word_cloud_font_path = '../shared_data/font/DroidSansMono.ttf'

EPOCHS = 400
epoch_steps = int((subset_size - training_size) / data_batch_size)

Randomly shuffle the data and select the top subset size

In [24]:
data = data.sample(frac=1).reset_index(drop=True)
data = data.head(subset_size)

Create tensorflow dataset tensor objects and split the data between training and testing

In [25]:
train_ds = tf.data.Dataset.from_tensor_slices(
    (
        data['text'][training_size:], 
        data['label'][training_size:]
    )
).shuffle(shuffle_size).batch(data_batch_size)

test_ds = tf.data.Dataset.from_tensor_slices(
    (
        data['text'][:training_size],
        data['label'][:training_size]
    )
).batch(data_batch_size)

Build a custom layer that takes a sentence text tensor and returns a wordcloud of that sentence as an image tensor

In [26]:
class Sentence2WordCloud(Layer):
    def __init__(self) -> None:
        super(Sentence2WordCloud, self).__init__()
        self.trainable = False
    
    def call(self, inputs):
        output = []
        for tensor in inputs:
            new_tensor = tf.convert_to_tensor(self.__sentence2wordcloud__(tensor), dtype_hint=tf.float32)
            output.append(new_tensor)
        return tf.convert_to_tensor(output)
    
    def __sentence2wordcloud__(self, tensor):
        frequencies = self.__freqcount__(tensor)
        cloud = WordCloud(width=image_size[0], height=image_size[1], stopwords=[''], min_word_length=1, repeat=True, normalize_plurals=False, include_numbers=True, font_path=word_cloud_font_path, min_font_size=1)
        image = cloud.generate_from_frequencies(frequencies)
        return image.to_array()
    
    def __freqcount__(self, tensor):
        words = tf.get_static_value(tensor).decode().split()
        freq_count = [words.count(k) for k in words]
        return dict(zip(words, freq_count))

In [27]:
""""
test model for troubleshooting layer

class MyModel(Model):
    def __init__(self) -> None:
        super(MyModel, self).__init__()
        self.l1 = Sentence2WordCloud()
    
    def call(self, x):
        return self.l1(x)

my_model = MyModel()
for sentences, labels in test_ds:
    print(my_model(sentences))
"""

'"\ntest model for troubleshooting layer\n\nclass MyModel(Model):\n    def __init__(self) -> None:\n        super(MyModel, self).__init__()\n        self.l1 = Sentence2WordCloud()\n    \n    def call(self, x):\n        return self.l1(x)\n\nmy_model = MyModel()\nfor sentences, labels in test_ds:\n    print(my_model(sentences))\n'

In [28]:
model = tf.keras.Sequential()

# Custom Sentence to WordCloud Layer
model.add(Sentence2WordCloud())

# Convolutional layer and maxpool layer 1
model.add(tf.keras.layers.Conv2D(32,(3,3),activation='relu',input_shape=(image_size[0],image_size[1],3)))
model.add(tf.keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 2
model.add(tf.keras.layers.Conv2D(64,(3,3),activation='relu'))
model.add(tf.keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 3
model.add(tf.keras.layers.Conv2D(128,(3,3),activation='relu'))
model.add(tf.keras.layers.MaxPool2D(2,2))

# Convolutional layer and maxpool layer 4
model.add(tf.keras.layers.Conv2D(128,(3,3),activation='relu'))
model.add(tf.keras.layers.MaxPool2D(2,2))

# This layer flattens the resulting image array to 1D array
model.add(tf.keras.layers.Flatten())

# Hidden layer with 512 neurons and Rectified Linear Unit activation function 
model.add(tf.keras.layers.Dense(512,activation='relu'))

# Output layer with single neuron which gives 0 for Cat or 1 for Dog 
#Here we use sigmoid activation function which makes our model output to lie between 0 and 1
model.add(tf.keras.layers.Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [29]:
history = model.fit(
        train_ds, 
        epochs=EPOCHS, 
        steps_per_epoch=epoch_steps,
        validation_data=test_ds
    )

Epoch 1/10
Epoch 2/10

In [None]:
def plot_metrics(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric])
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])
  plt.show()
  
plot_metrics(history, "accuracy")
plot_metrics(history, "loss")

In [None]:
new_sentences = ["Now we know why some animals eat their own children.", "game of thrones season finale showing this sunday night","Please, keep talking. I always yawn when I am interested."]
new_dataset = tf.data.Dataset.from_tensor_slices(new_sentences)
for sentence in new_dataset:
    print(round(model.predict(sentence)))

AttributeError: in user code:

    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_fileeyr78_pg.py", line 10, in tf__call
        x = ag__.converted_call(ag__.ld(self).sentence2wordcloud, (ag__.ld(x),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 24, in tf__call
        ag__.for_stmt(ag__.ld(inputs), None, loop_body, get_state, set_state, (), {'iterate_names': 'tensor'})
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 20, in loop_body
        new_tensor = ag__.converted_call(ag__.ld(tf).convert_to_tensor, (ag__.converted_call(ag__.ld(self).__sentence2wordcloud__, (ag__.ld(tensor),), None, fscope),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file3gk6jg83.py", line 10, in tf____sentence2wordcloud__
        frequencies = ag__.converted_call(ag__.ld(self).__freqcount__, (ag__.ld(tensor),), None, fscope)
    File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file0l_znj3v.py", line 10, in tf____freqcount__
        words = ag__.converted_call(ag__.converted_call(ag__.converted_call(ag__.ld(tensor).numpy, (), None, fscope).decode, (), None, fscope).split, (), None, fscope)

    AttributeError: Exception encountered when calling layer "cloudy_cnn_5" "                 f"(type CloudyCNN).
    
    in user code:
    
        File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\442203258.py", line 13, in call  *
            x = self.sentence2wordcloud(x)
        File "C:\Users\ftn0813\AppData\Roaming\Python\Python39\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 24, in tf__call
            ag__.for_stmt(ag__.ld(inputs), None, loop_body, get_state, set_state, (), {'iterate_names': 'tensor'})
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file6v53uftz.py", line 20, in loop_body
            new_tensor = ag__.converted_call(ag__.ld(tf).convert_to_tensor, (ag__.converted_call(ag__.ld(self).__sentence2wordcloud__, (ag__.ld(tensor),), None, fscope),), None, fscope)
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file3gk6jg83.py", line 10, in tf____sentence2wordcloud__
            frequencies = ag__.converted_call(ag__.ld(self).__freqcount__, (ag__.ld(tensor),), None, fscope)
        File "C:\Users\ftn0813\AppData\Local\Temp\__autograph_generated_file0l_znj3v.py", line 10, in tf____freqcount__
            words = ag__.converted_call(ag__.converted_call(ag__.converted_call(ag__.ld(tensor).numpy, (), None, fscope).decode, (), None, fscope).split, (), None, fscope)
    
        AttributeError: Exception encountered when calling layer "sentence2_word_cloud_16878" "                 f"(type Sentence2WordCloud).
        
        in user code:
        
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\977086180.py", line 9, in call  *
                new_tensor = tf.convert_to_tensor(self.__sentence2wordcloud__(tensor))
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\2642161880.py", line 14, in __sentence2wordcloud__  *
                frequencies = self.__freqcount__(tensor)
            File "C:\Users\ftn0813\AppData\Local\Temp\ipykernel_2844\977086180.py", line 20, in __freqcount__  *
                words = tensor.numpy().decode().split()
        
            AttributeError: 'Tensor' object has no attribute 'numpy'
        
        
        Call arguments received by layer "sentence2_word_cloud_16878" "                 f"(type Sentence2WordCloud):
          • inputs=tf.Tensor(shape=(None,), dtype=string)
    
    
    Call arguments received by layer "cloudy_cnn_5" "                 f"(type CloudyCNN):
      • x=tf.Tensor(shape=(None,), dtype=string)


In [None]:
model.save('./model_saves/cnn/')