In [7]:
import math
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from wordcloud import WordCloud

# Print the installed TensorFlow version so it is recorded in the notebook output
print(tf.__version__)

2.10.0


In [8]:
# Load the news headline data and bring it into the common (text, label) layout:
# discard the article link, rename the headline/sarcasm columns, and fix the column order.
data_news_headlines = (
    pd.read_json("../shared_data/x1.json")
    .drop(columns='article_link')
    .rename(columns={'headline': 'text', 'is_sarcastic': 'label'})
    .reindex(columns=['text', 'label'])
)
data_news_headlines.head()

Unnamed: 0,text,label
0,thirtysomething scientists unveil doomsday clo...,1
1,dem rep. totally nails why congress is falling...,0
2,eat your veggies: 9 deliciously different recipes,0
3,inclement weather prevents liar from getting t...,1
4,mother comes pretty close to using word 'strea...,1


In [9]:
# Load the tweets data; its text column is named 'tweets', so rename it to 'text'
data_tweets = pd.read_csv("../shared_data/dataset_csv.csv").rename(
    columns={'tweets': 'text'}
)
data_tweets.head()

Unnamed: 0,text,label
0,I love working midnights tweet,1
1,I hate when I buy a bag of air and there's chi...,1
2,my grandad always sounds so ill when i speak t...,0
3,"I realize I'm annoying to everyone, so I won't...",0
4,I love when I find these dudes on vine!! #Foll...,1


In [10]:
data_sitcoms = pd.read_csv("../shared_data/mustard++_text.csv")

# Adjust sitcom data: drop the metadata columns, rename the sentence/sarcasm
# columns to text/label, and remove rows that have no label.
data_sitcoms = (
    data_sitcoms
    .drop(columns=['SCENE','KEY','END_TIME','SPEAKER','SHOW','Sarcasm_Type','Implicit_Emotion','Explicit_Emotion','Valence','Arousal'])
    .rename(columns={'SENTENCE':'text','Sarcasm':'label'})
    # One vectorised pass instead of dropping row by row inside iterrows(),
    # which copied the whole frame on every drop. Original index labels are kept.
    .dropna(subset=['label'])
)

data_sitcoms.head()

Unnamed: 0,text,label
5,"And of those few months, how long have you bee...",0.0
14,"Let the dead man talk. So, why do you think that?",0.0
18,"What else? Sell it on eBay as ""slightly used.""",0.0
24,"Good idea, sit with her. Hold her, comfort her...",1.0
31,"Well, now that I've given up string theory, I'...",0.0


In [11]:
# Load the reddit comments and bring them into the common (text, label) layout:
# discard the metadata columns, rename 'comment' to 'text', and fix the column order.
data_reddit = (
    pd.read_csv("../shared_data/train-balanced-sarcasm.csv")
    .drop(columns=['author','subreddit','score','ups','downs','date','created_utc','parent_comment'])
    .rename(columns={'comment':'text'})
    .reindex(columns=['text','label'])
)

data_reddit.head()

Unnamed: 0,text,label
0,NC and NH.,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools.,0


In [12]:
# Combine all 4 datasets
data = pd.concat([data_news_headlines,data_tweets,data_sitcoms,data_reddit], ignore_index=True)

# remove non string (nan) rows
# A single boolean mask replaces the iterrows()/drop loop, which copied the
# whole (million-row) frame once per removed row.
is_text = data['text'].map(lambda value: isinstance(value, str))
data = data[is_text]

# Shuffle the rows (fixed seed so this shuffle is repeatable across runs)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

data.head()

Unnamed: 0,text,label
0,Up vote because Boone.,0.0
1,Something somewthing 4GB,0.0
2,Gorgeous conures.,0.0
3,You only sort of missed the point.,1.0
4,Thanks Obama,1.0


In [13]:
# Summarise the combined frame: entry count, column dtypes and non-null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


In [14]:
# NOTE(review): this repeats the data.info() call of the preceding cell and
# `data` is not modified in between, so the summary is identical.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042588 entries, 0 to 1042587
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   text    1042588 non-null  object 
 1   label   1042588 non-null  float64
dtypes: float64(1), object(1)
memory usage: 15.9+ MB


Set the variables for the model and training/testing processes

In [15]:
# --- Dataset sizing ---
# Number of rows kept from the combined dataset.
subset_size = 14_000
# 20% of the subset.
# NOTE(review): despite the name, the word-cloud export cell writes the first
# `training_size` rows to the test folder and the remaining rows to the train folder.
training_size = int(0.2 * subset_size)
# Rows remaining after the `training_size` rows are taken out.
shuffle_size = subset_size - training_size

# --- Word-cloud image settings ---
# (width, height) passed to WordCloud.
image_size = (400, 400)
word_cloud_font_path = './word_cloud_data/font/DroidSansMono.ttf'

# --- Model / training settings ---
# presumably consumed by the model cells further down — not visible from here
data_batch_size = 32
EPOCHS = 5

Randomly shuffle the data and keep only the first `subset_size` rows

In [16]:
# Shuffle with a fixed seed so the same input order always yields the same
# subset, then keep the first `subset_size` rows.
data = (
    data.sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .head(subset_size)
)

Save word cloud images to disk

In [17]:
# Render one word-cloud image per row and save it as
# ./word_cloud_data/<split>/<class>/<class>.<n>.jpg
row_count = 0
sarcastic_count = 0
non_sarcastic_count = 0

output_root = './word_cloud_data/'

# Create the split/class folders up front: saving an image into a folder that
# does not exist fails, which would break a run on a fresh checkout.
for split_dir in ('test/', 'train/'):
    for class_dir in ('sarcastic/', 'non_sarcastic/'):
        os.makedirs(output_root + split_dir + class_dir, exist_ok=True)

# The renderer settings are the same for every row, so build the object once
# instead of once per iteration.
cloud = WordCloud(width=image_size[0], height=image_size[1], stopwords=[''], min_word_length=1, repeat=True, normalize_plurals=False, include_numbers=True, font_path=word_cloud_font_path)

for text, raw_label in zip(data['text'], data['label']):
    words = text.split()
    if not words:
        # Empty / whitespace-only text gives an empty frequency table, which
        # generate_from_frequencies() rejects with a ValueError; skip the row.
        continue

    # Labels arrive as floats (0.0 / 1.0) in the combined frame.
    label = int(raw_label)

    # The first `training_size` rendered rows go to the test folder and all
    # later rows to the train folder (the variable name is misleading).
    if row_count < training_size:
        file_path = output_root + 'test/'
    else:
        file_path = output_root + 'train/'

    # The per-class counters are shared by both splits, so file numbers in
    # train/ continue where test/ stopped.
    if label == 1:
        file_path = file_path + 'sarcastic/sarcastic.' + str(sarcastic_count) + '.jpg'
        sarcastic_count += 1
    else:
        file_path = file_path + 'non_sarcastic/non_sarcastic.' + str(non_sarcastic_count) + '.jpg'
        non_sarcastic_count += 1

    # Single-pass word count (the previous words.count() per word was quadratic).
    frequencies = {}
    for word in words:
        frequencies[word] = frequencies.get(word, 0) + 1

    image = cloud.generate_from_frequencies(frequencies)
    image.to_file(file_path)

    row_count += 1