## Reddit Sarcasm Detection

### Import Libraries

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

### Import CSV

In [2]:
# Load the sarcasm training set; the path is relative to the notebook's working directory.
training_csv_1 = pd.read_csv("train-balanced-sarcasm.csv")

In [3]:
# Force every comment to str so the later string operations (lower-casing, tokenizing) accept every row.
# NOTE(review): missing comments presumably become the literal text "nan" here — confirm that is intended.
training_csv_1["comment"] = training_csv_1["comment"].astype(str)

In [4]:
# Preview the first rows to check which columns were loaded.
training_csv_1.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


### Exploratory Data Analysis

In [5]:
# nunique() counts distinct authors, not rows, so the message names what is actually counted.
print(f"The total training data has {training_csv_1.author.nunique()} unique authors.")
# Select `label` before aggregating: averaging the whole frame first is wasted work and
# raises a TypeError on the text columns in pandas >= 2.0 (numeric_only defaults to False).
# The result is the distribution of per-author mean labels, as before.
training_csv_1.groupby("author")["label"].mean().value_counts()

The total training data has 256561 rows.


0.500000    251903
1.000000      2302
0.333333       770
0.400000       405
0.428571       268
             ...  
0.533333         1
0.494253         1
0.555556         1
0.499408         1
0.495050         1
Name: label, Length: 64, dtype: int64

##### Most authors have a mean label of exactly 0.5 (an even split of both labels), so `author` carries little signal; consider dropping it

In [6]:
# nunique() counts distinct subreddits, not rows, so the message names what is actually counted.
print(f"The total training data has {training_csv_1.subreddit.nunique()} unique subreddits.")
# Select `label` before aggregating: averaging the whole frame first is wasted work and
# raises a TypeError on the text columns in pandas >= 2.0 (numeric_only defaults to False).
# The result is the distribution of per-subreddit mean labels, as before.
training_csv_1.groupby("subreddit")["label"].mean().value_counts()

The total training data has 14878 rows.


0.000000    5883
1.000000    2042
0.500000    1242
0.333333     585
0.250000     362
            ... 
0.465517       1
0.506567       1
0.257874       1
0.550802       1
0.373333       1
Name: label, Length: 1430, dtype: int64

##### Subreddit seems to provide more information than expected, so it should probably be kept

In [7]:
# Show the two vote columns side by side to look for a relationship between them.
training_csv_1[["ups", "downs"]]

Unnamed: 0,ups,downs
0,-1,-1
1,-1,-1
2,3,0
3,-1,-1
4,-1,-1
...,...,...
1010821,2,0
1010822,1,0
1010823,1,0
1010824,1,0


##### Notice how ups and downs seem to be correlated? Let's test this theory out

In [8]:
# Hypothesis: `downs` is -1 whenever `ups` is -1 or lower, and 0 otherwise.
# Build the value that rule predicts for every row, then keep only the rows that disagree with it.
downs_implied_by_ups = np.where(training_csv_1["ups"] <= -1, -1, 0)
training_csv_1[training_csv_1["downs"] != downs_implied_by_ups]

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
140,0,My comment very similar to this went down a fu...,Schumarker,Android,-6,-6,0,2016-09,2016-09-24 21:50:56,Badumm-tzz
204,0,it really does,Horus_Krishna_2,radiohead,-1,-1,0,2016-09,2016-09-14 20:07:04,"As far as I know, someone's reddit history doe..."
414,0,"Meh, my upper body blows his away.",GiveMeSomeIhedigbo,bodybuilding,-6,-6,0,2016-09,2016-09-19 06:27:32,Do you Agree that this version is The BEST Ver...
431,0,Such a shitty meme.,Geralt-of_Rivia,AdviceAnimals,-4,-4,0,2016-09,2016-09-02 02:39:44,Front page post with 2000 comments and is 10 h...
454,0,"This sub is for open ended questions, not yes ...",hunterz5,AskReddit,-3,-3,0,2016-09,2016-09-10 01:54:47,Do you think IB/AP classes are truly worth it?...
...,...,...,...,...,...,...,...,...,...,...
1010772,1,"NSFW, thanks.",Underdogg13,pics,-1,-1,0,2009-09,2009-09-06 17:16:11,"(PIC) Penelope Cruz, firm and without shirt. F..."
1010790,1,".. erm .. good for them, i guess .. they're ma...",mijj,Economics,-2,-2,0,2009-10,2009-10-30 01:12:47,There was a rash of muggings in my neighborhoo...
1010791,1,"zombie, frankenstein, jesus, now thats a real ...",Rip_Van_Winkle,pics,-2,-2,0,2009-10,2009-10-31 23:20:10,"jesus christ, that's a funny diagram."
1010801,1,"Yes, and there's no such thing as mental illne...",Davin900,worldnews,-1,-1,0,2009-08,2009-08-14 18:34:29,And my parents had a rough upbringing/backgrou...


##### Only 6.1% of rows break the rule (`downs` is -1 when `ups` <= -1, otherwise 0). Is `downs` worth keeping? Debatable, I guess

### Build model using Comment Column only (Unigram Model)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

In [10]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [11]:
## Better abstraction

class sklearnClassifier:
    """Small wrapper that pairs an estimator with a printed metric report.

    The wrapped object only needs ``fit(data, label)`` and ``predict(X)`` methods.
    """

    def __init__(self, model, data, label, fitBool = True):
        """Store ``model`` and, unless ``fitBool`` is falsy, fit it on ``data``/``label`` right away."""
        self.model = model
        if not fitBool:
            return
        self.fit(data, label)

    def fit(self, data, label):
        """Fit the wrapped model in place; returns nothing."""
        self.model.fit(data, label)

    def score(self, X, y_true):
        """Predict on ``X`` once and print accuracy, recall, precision and F1 against ``y_true``.

        The metrics are only printed; nothing is returned.
        """
        predictions = self.model.predict(X)
        report = (
            ("Accuracy", accuracy_score),
            ("Recall", recall_score),
            ("Precision", precision_score),
            ("F1", f1_score),
        )
        for title, metric in report:
            print(f"{title} score: {metric(y_true, predictions)}")

In [12]:
# Lower-case every comment with the vectorised string accessor (the column was cast to str earlier).
training_csv_1["comment"] = training_csv_1["comment"].str.lower()

In [13]:
# Hold out 20% of the comments for validation.
# A fixed random_state makes the split, and therefore every score computed from it,
# reproducible across kernel restarts; without it each run evaluates on different rows.
X_train, X_val, y_train, y_val = train_test_split(
    training_csv_1["comment"],
    training_csv_1["label"],
    test_size = 0.2,
    random_state = 42
)

In [14]:
def create_ngram_vectorizer(text_train, ngram_range = (1,1), **kwargs):
    """Return a CountVectorizer already fitted on ``text_train``.

    text_train:  iterable of raw text documents used to learn the vocabulary.
    ngram_range: (min_n, max_n) n-gram sizes to extract; defaults to unigrams only.
    **kwargs:    forwarded unchanged to CountVectorizer (e.g. ``stop_words``).
    """
    # fit() hands back the vectorizer itself, so construction and fitting chain into one expression.
    return CountVectorizer(ngram_range = ngram_range, **kwargs).fit(text_train)

In [15]:
# Learn the unigram vocabulary from the training split only (default ngram_range is (1,1)).
unigram_vectorizer = create_ngram_vectorizer(X_train)

In [16]:
# Encode both splits with the vectorizer that was fitted on X_train.
X_train_transformed = unigram_vectorizer.transform(X_train)
X_val_transformed = unigram_vectorizer.transform(X_val)

In [17]:
# Baseline: a linear model trained with SGD on the unigram counts (fitted immediately by the wrapper).
# SGD shuffles the training data, so the seed is pinned to keep the reported metrics reproducible.
base_classifier = sklearnClassifier(SGDClassifier(random_state = 42), X_train_transformed, y_train)

In [18]:
# Report the same metrics on the data the model was fitted on and on the held-out split,
# so the gap between the two can be read directly from the output.
print("Training: ")
base_classifier.score(X_train_transformed, y_train)
print("Validation: ")
base_classifier.score(X_val_transformed, y_val)

Training: 
Accuracy score: 0.6890992506121237
Recall score: 0.5690355668866672
Precision score: 0.7489519866687496
F1 score: 0.6467137222279211
Validation: 
Accuracy score: 0.681791201290029
Recall score: 0.5605424668382498
Precision score: 0.7396000679179238
F1 score: 0.6377412252298924


### Now what? Bigrams and Trigrams, LETZ GO!!!

In [19]:
# for i in range(1, 3): # Trigram is a bit slow so we'll bring that back later
#     igram_vectorizer = create_ngram_vectorizer(X_train, ngram_range = (1,i))
#     X_train_transformed = igram_vectorizer.transform(X_train)
#     X_val_transformed = igram_vectorizer.transform(X_val)
    
#     base_classifier = sklearnClassifier(SGDClassifier(), X_train_transformed, y_train)
    
#     print("Training: ")
#     base_classifier.score(X_train_transformed, y_train)
#     print("Validation: ")
#     base_classifier.score(X_val_transformed, y_val)
#     print()

### Using TFIDF instead of just counting

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
def create_tfidf_ngram_vectorizer(text_train, ngram_range = (1,1), **kwargs):
    """Return a TfidfVectorizer already fitted on ``text_train``.

    text_train:  iterable of raw text documents used to learn the vocabulary and IDF weights.
    ngram_range: (min_n, max_n) n-gram sizes to extract; defaults to unigrams only.
    **kwargs:    forwarded unchanged to TfidfVectorizer (e.g. ``stop_words``).
    """
    # fit() hands back the vectorizer itself, so construction and fitting chain into one expression.
    return TfidfVectorizer(ngram_range = ngram_range, **kwargs).fit(text_train)

In [22]:
# for i in range(1,3):
#     tfidf_igram_vectorizer = create_tfidf_ngram_vectorizer(X_train, ngram_range = (1,i))
#     X_train_transformed = tfidf_igram_vectorizer.transform(X_train)
#     X_val_transformed = tfidf_igram_vectorizer.transform(X_val)
    
#     base_classifier = sklearnClassifier(SGDClassifier(), X_train_transformed, y_train)
    
#     print("Training: ")
#     base_classifier.score(X_train_transformed, y_train)
#     print("Validation: ")
#     base_classifier.score(X_val_transformed, y_val)
#     print()

### Vector Representation Test

In [23]:
### Abstraction for easier work
class EmbeddingTechniques:
    """Adapter that exposes a plain callable through a ``transform`` method.

    This lets a function be used wherever an object with ``transform(X)`` is expected.
    """

    def __init__(self, method):
        """Keep ``method``, the callable that ``transform`` will delegate to."""
        self.transformMethod = method

    def transform(self, X):
        """Return whatever the stored callable returns for ``X``."""
        embed = self.transformMethod
        return embed(X)

In [25]:
class EmbeddingTester:
    """Evaluate one model against several text-embedding techniques and print the results.

    Techniques are registered by name with ``addEmbeddingTechniques`` and evaluated in
    registration order by ``test``. The wrapped model is refitted in place for every
    technique, so after ``test`` returns it holds the fit for the last technique.
    """

    def __init__(self, sklearnmodel):
        """
        sklearnmodel: object exposing ``fit(X, y)`` and ``score(X, y)``; the value returned
                      by ``score`` is ignored, so it is expected to report its own metrics.
        """
        self.list_of_techniques = {}   # name -> object with a transform(X) method
        self.tokenized = {}            # name -> True if the technique consumes token lists
        self.model = sklearnmodel

    def addEmbeddingTechniques(self, key, method, tokenized = False):
        """Register ``method`` under ``key``; re-using a key replaces the earlier entry.

        key:       label printed before the technique's results.
        method:    object with a ``transform(X)`` method.
        tokenized: True if ``transform`` expects pre-tokenized input rather than raw text.
        """
        self.list_of_techniques[key] = method
        self.tokenized[key] = tokenized

    def testModel(self, X_train_transformed, y_train_true, X_test_transformed, y_test_true, text = None):
        """Refit the model on the transformed training data and score it on both splits.

        ``text``, when given, is printed first as a heading for the block of results.
        """
        if text is not None: print(text)
        self.model.fit(X_train_transformed, y_train_true)
        print("Training: ")
        self.model.score(X_train_transformed, y_train_true)
        print()
        print("Validation: ")
        self.model.score(X_test_transformed, y_test_true)
        print("-" * 80)

    def test(self, X_train_untransformed, y_train_true, X_test_untransformed, y_test_true,
            X_train_tokenized = None, X_test_tokenized = None):
        """Run every registered technique through ``testModel``.

        X_train_untransformed / X_test_untransformed: raw inputs for non-tokenized techniques.
        X_train_tokenized / X_test_tokenized: inputs for techniques registered with
            ``tokenized=True``. They may be omitted when no such technique is registered.

        Raises:
            ValueError: if a tokenized technique is registered but either tokenized input
                is missing. This is checked before any model is fitted, so a long run
                cannot fail halfway through.
        """
        needs_tokens = [key for key in self.list_of_techniques if self.tokenized[key]]
        if needs_tokens and (X_train_tokenized is None or X_test_tokenized is None):
            raise ValueError(
                "X_train_tokenized and X_test_tokenized are required by: "
                + ", ".join(str(key) for key in needs_tokens)
            )

        for key, technique in self.list_of_techniques.items():
            if self.tokenized[key]:
                train_input, test_input = X_train_tokenized, X_test_tokenized
            else:
                train_input, test_input = X_train_untransformed, X_test_untransformed
            X_train_transformed = technique.transform(train_input)
            X_test_transformed = technique.transform(test_input)
            self.testModel(X_train_transformed, y_train_true, X_test_transformed, y_test_true, text = key)

In [32]:
# Register the bag-of-words embeddings (unigrams + bigrams), each with and without
# English stop-word removal. Every vectorizer is fitted on X_train as it is created.
# The tester refits base_classifier in place when it runs.
tester = EmbeddingTester(base_classifier)

stopword_variants = (
    ("No stopwords removal", {}),
    ("With stopwords removal", {"stop_words": "english"}),
)
vectorizer_factories = (
    ("Count Vectorizer", create_ngram_vectorizer),
    ("TFIDF Vectorizer", create_tfidf_ngram_vectorizer),
)

# Outer loop on the stop-word setting keeps the registration order of the results:
# Count, TFIDF without removal, then Count, TFIDF with removal.
for variant_label, extra_options in stopword_variants:
    for family_label, build_vectorizer in vectorizer_factories:
        tester.addEmbeddingTechniques(
            f"{family_label}({variant_label})",
            build_vectorizer(X_train, ngram_range = (1,2), **extra_options)
        )

In [27]:
## Thanks Rama, like srsly
from gensim.models import Word2Vec
from nltk.tokenize import TreebankWordTokenizer

In [29]:
# Dimensionality of each Word2Vec word vector (also read by the mean-embedding transform).
vector_size = 128
word_tokenizer = TreebankWordTokenizer()

# Tokenize both splits once so the token lists can be reused by any token-based embedding.
X_train_tokenized = [word_tokenizer.tokenize(text) for text in X_train]
X_val_tokenized = [word_tokenizer.tokenize(text) for text in X_val]

# Train the embeddings on the training tokens only; the validation tokens are not passed in.
# NOTE(review): per the gensim docs sg = 1 should select skip-gram and min_count = 1 should keep every token — confirm.
# NOTE(review): no seed is set and workers = 3, so the learned vectors presumably differ between runs.
model = Word2Vec(X_train_tokenized, min_count = 1, vector_size= vector_size, workers = 3, window = 3, sg = 1)

In [30]:
def transform(X_tokenized):
    """Embed each tokenized document as the mean of its tokens' Word2Vec vectors.

    X_tokenized: iterable of token lists, one list per document.

    Returns an ``np.matrix`` of dtype float64 with one row per document and
    ``vector_size`` columns. Tokens missing from ``model.wv`` contribute a zero
    vector. A document with no tokens becomes an all-zero row; previously the
    mean of an empty list produced NaN and a ragged input that could not be
    turned into a matrix.

    Relies on the notebook-level ``model`` (trained Word2Vec) and ``vector_size``.
    """
    # Built once and shared: it is only read, never modified.
    zero_vector = np.zeros(vector_size, dtype=np.float64)

    def _mean_embedding(tokens):
        # Average the vectors of one document, substituting zeros for unknown tokens.
        if len(tokens) == 0:
            return zero_vector
        return np.mean(
            [model.wv[token] if token in model.wv else zero_vector for token in tokens],
            axis = 0
        )

    # NOTE(review): np.matrix is kept so the return type is unchanged for callers; NumPy
    # discourages it, so switching to np.asarray is worth considering — confirm downstream use.
    return np.matrix(
        [_mean_embedding(tokens) for tokens in X_tokenized],
        dtype=np.float64
    )

In [33]:
# Register the Word2Vec mean embedding; the trailing True marks it as tokenized,
# so the tester feeds it the pre-tokenized splits instead of the raw text.
tester.addEmbeddingTechniques(
    "word2Vec Mean Embedding", 
    EmbeddingTechniques(transform),
    True
)

# Evaluate every registered technique in turn; metrics for each one are printed.
tester.test(X_train, y_train, X_val, y_val, X_train_tokenized, X_val_tokenized)

word2Vec Mean Embedding
Training: 
Accuracy score: 0.6668909059431652
Recall score: 0.6332701110058779
Precision score: 0.6789974520030014
F1 score: 0.6553370729010993

Validation: 
Accuracy score: 0.6652948567019182
Recall score: 0.6293110275193031
Precision score: 0.6778083418628454
F1 score: 0.6526600004106523
--------------------------------------------------------------------------------
