## Reddit Sarcasm Detection

### Import Libraries

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import os
print(os.getcwd())

### Import CSV

In [None]:
# Load the balanced sarcasm training set; the path is relative to the working directory.
training_csv_1 = pd.read_csv("train-balanced-sarcasm.csv")

In [None]:
# Force every comment to str so the string operations below work on all rows.
# NOTE(review): astype(str) presumably turns missing comments into the literal text "nan" — confirm this is intended.
training_csv_1["comment"] = training_csv_1["comment"].astype(str)

In [None]:
# Preview the first rows of the loaded data.
training_csv_1.head()

### Exploratory Data Analysis

In [None]:
# Report how many distinct authors there are (this is not the row count).
print(f"The total training data has {training_csv_1.author.nunique()} unique authors.")
# Distribution of the per-author mean label. "label" is selected before
# aggregating: averaging the whole frame would also try to average the text
# columns, which raises TypeError on pandas >= 2.0.
training_csv_1.groupby("author")["label"].mean().value_counts()

##### Most authors have a mean label of exactly 0.5, so `author` carries little signal; consider dropping it

In [None]:
# Report how many distinct subreddits there are (this is not the row count).
print(f"The total training data has {training_csv_1.subreddit.nunique()} unique subreddits.")
# Distribution of the per-subreddit mean label. "label" is selected before
# aggregating: averaging the whole frame would also try to average the text
# columns, which raises TypeError on pandas >= 2.0.
training_csv_1.groupby("subreddit")["label"].mean().value_counts()

##### Subreddit seems to provide more info than expected, should probably keep

In [None]:
# Display the two vote columns side by side.
training_csv_1[["ups", "downs"]]

##### Notice how ups and downs seem to be correlated? Let's test this theory out

In [None]:
# Show the rows where "downs" differs from the rule: -1 when ups <= -1, otherwise 0.
training_csv_1[training_csv_1["ups"].apply(lambda x: -1 if x <= -1 else 0) != training_csv_1["downs"]]

##### Only 6.1% of rows do not follow this rule. Is `downs` worth keeping? Debatable, I guess

### Build model using Comment Column only (Unigram Model)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

In [None]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [None]:
## Better abstraction

class sklearnClassifier:
    """Wrap an estimator exposing ``fit``/``predict`` and print its metrics.

    Parameters:
        model: estimator object; only ``fit`` and ``predict`` are called on it.
        data: training features handed to ``model.fit``.
        label: training targets handed to ``model.fit``.
        fitBool: when true (the default) the model is fitted on construction.
    """

    def __init__(self, model, data, label, fitBool = True):
        self.model = model
        if not fitBool:
            return
        self.fit(data, label)

    def fit(self, data, label):
        """Fit the wrapped model on ``data`` / ``label``."""
        self.model.fit(data, label)

    def score(self, X, y_true):
        """Predict on ``X`` and print accuracy, recall, precision and F1.

        Nothing is returned; the four metrics are only printed, in that order.
        """
        predictions = self.model.predict(X)
        accuracy = accuracy_score(y_true, predictions)
        print(f"Accuracy score: {accuracy}")
        recall = recall_score(y_true, predictions)
        print(f"Recall score: {recall}")
        precision = precision_score(y_true, predictions)
        print(f"Precision score: {precision}")
        f1 = f1_score(y_true, predictions)
        print(f"F1 score: {f1}")

In [None]:
# Lower-case every comment so the vectorizers treat "Word" and "word" alike.
training_csv_1["comment"] = training_csv_1["comment"].map(str.lower)

In [None]:
# Hold out 20% of the comments for validation.
# A fixed seed keeps the split, and therefore every score printed below,
# reproducible across notebook runs.
X_train, X_val, y_train, y_val = train_test_split(
    training_csv_1["comment"],
    training_csv_1["label"],
    test_size = 0.2,
    random_state = 42
)

In [None]:
def create_ngram_vectorizer(text_train, ngram_range = (1,1), **kwargs):
    """Return a CountVectorizer fitted on ``text_train``.

    ``ngram_range`` and any extra keyword arguments are forwarded to the
    CountVectorizer constructor unchanged.
    """
    count_vectorizer = CountVectorizer(ngram_range = ngram_range, **kwargs)
    # fit() returns the vectorizer itself, so it can be returned directly.
    return count_vectorizer.fit(text_train)

In [None]:
# Fit a unigram vocabulary (default ngram_range of (1,1)) on the training split only.
unigram_vectorizer = create_ngram_vectorizer(X_train)

In [None]:
# Encode both splits with the vocabulary that was fitted on the training split.
X_train_transformed = unigram_vectorizer.transform(X_train)
X_val_transformed = unigram_vectorizer.transform(X_val)

In [None]:
# Wrap a default SGDClassifier; the wrapper fits it immediately because fitBool defaults to True.
base_classifier = sklearnClassifier(SGDClassifier(), X_train_transformed, y_train)

In [None]:
# Print the four metrics for the training split, then for the validation split.
print("Training: ")
base_classifier.score(X_train_transformed, y_train)
print("Validation: ")
base_classifier.score(X_val_transformed, y_val)

### Now what? Bigrams and Trigrams, LETZ GO!!!

In [None]:
# for i in range(1, 3): # Trigram is a bit slow so we'll bring that back later
#     igram_vectorizer = create_ngram_vectorizer(X_train, ngram_range = (1,i))
#     X_train_transformed = igram_vectorizer.transform(X_train)
#     X_val_transformed = igram_vectorizer.transform(X_val)
    
#     base_classifier = sklearnClassifier(SGDClassifier(), X_train_transformed, y_train)
    
#     print("Training: ")
#     base_classifier.score(X_train_transformed, y_train)
#     print("Validation: ")
#     base_classifier.score(X_val_transformed, y_val)
#     print()

### Using TFIDF instead of just counting

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def create_tfidf_ngram_vectorizer(text_train, ngram_range = (1,1), **kwargs):
    """Return a TfidfVectorizer fitted on ``text_train``.

    ``ngram_range`` and any extra keyword arguments are forwarded to the
    TfidfVectorizer constructor unchanged.
    """
    tfidf_vectorizer = TfidfVectorizer(ngram_range = ngram_range, **kwargs)
    # fit() returns the vectorizer itself, so it can be returned directly.
    return tfidf_vectorizer.fit(text_train)

In [None]:
# for i in range(1,3):
#     tfidf_igram_vectorizer = create_tfidf_ngram_vectorizer(X_train, ngram_range = (1,i))
#     X_train_transformed = tfidf_igram_vectorizer.transform(X_train)
#     X_val_transformed = tfidf_igram_vectorizer.transform(X_val)
    
#     base_classifier = sklearnClassifier(SGDClassifier(), X_train_transformed, y_train)
    
#     print("Training: ")
#     base_classifier.score(X_train_transformed, y_train)
#     print("Validation: ")
#     base_classifier.score(X_val_transformed, y_val)
#     print()

### Vector Representation Test

In [None]:
### Abstraction for easier work
class EmbeddingTechniques:
    """Adapter that gives a plain callable a ``transform`` method.

    This lets a function be registered next to vectorizer objects that
    already expose ``transform``.
    """

    def __init__(self, method):
        # Callable invoked with the raw input whenever transform() is called.
        self.transformMethod = method

    def transform(self, X):
        """Return ``self.transformMethod(X)``."""
        embed = self.transformMethod
        return embed(X)

In [None]:
class EmbeddingTester:
    """Run one model against several registered embedding techniques.

    ``sklearnmodel`` must expose ``fit(X, y)`` and ``score(X, y)``; every
    registered technique must expose ``transform(X)``.
    """

    def __init__(self, sklearnmodel):
        self.model = sklearnmodel
        # Both dicts are keyed by the technique's display name.
        self.list_of_techniques = {}
        self.tokenized = {}

    def addEmbeddingTechniques(self, key, method, tokenized = False):
        """Register ``method`` under ``key``.

        ``tokenized`` records whether the technique expects the pre-tokenized
        input rather than the raw input when ``test`` runs.
        """
        self.tokenized[key] = tokenized
        self.list_of_techniques[key] = method

    def testModel(self, X_train_transformed, y_train_true, X_test_transformed, y_test_true, text = None):
        """Re-fit the model on the training features and score both splits.

        ``text`` is printed first as a heading when it is not None.
        """
        if text is not None:
            print(text)

        self.model.fit(X_train_transformed, y_train_true)

        print("Training: ")
        self.model.score(X_train_transformed, y_train_true)
        print()

        print("Validation: ")
        self.model.score(X_test_transformed, y_test_true)

        print("-" * 80)

    def test(self, X_train_untransformed, y_train_true, X_test_untransformed, y_test_true,
            X_train_tokenized, X_test_tokenized):
        """Evaluate every registered technique in registration order."""
        for key, technique in self.list_of_techniques.items():
            # Choose the input representation this technique was registered for.
            if self.tokenized[key]:
                train_input, test_input = X_train_tokenized, X_test_tokenized
            else:
                train_input, test_input = X_train_untransformed, X_test_untransformed

            train_features = technique.transform(train_input)
            test_features = technique.transform(test_input)
            self.testModel(train_features, y_train_true, test_features, y_test_true, text = key)

In [None]:
# Reuse the wrapped classifier; the tester re-fits it for every registered technique.
tester = EmbeddingTester(base_classifier)
# All four vectorizers use unigrams + bigrams and are fitted on the training split.
tester.addEmbeddingTechniques(
    "Count Vectorizer(No stopwords removal)", 
    create_ngram_vectorizer(X_train, ngram_range = (1,2))
)

tester.addEmbeddingTechniques(
    "TFIDF Vectorizer(No stopwords removal)", 
    create_tfidf_ngram_vectorizer(X_train, ngram_range = (1,2))
)

# Same two vectorizers, this time with the built-in English stop-word list.
tester.addEmbeddingTechniques(
    "Count Vectorizer(With stopwords removal)", 
    create_ngram_vectorizer(X_train, ngram_range = (1,2), stop_words='english')
)

tester.addEmbeddingTechniques(
    "TFIDF Vectorizer(With stopwords removal)", 
    create_tfidf_ngram_vectorizer(X_train, ngram_range = (1,2), stop_words='english')
)

In [None]:
## Thanks Rama, like srsly
from gensim.models import Word2Vec
from nltk.tokenize import TreebankWordTokenizer

In [None]:
# Dimensionality of the word vectors; transform() below reads this same global.
vector_size = 128
word_tokenizer = TreebankWordTokenizer()

# Tokenize each split once; the token lists feed both Word2Vec training and the embedding lookup.
X_train_tokenized = [word_tokenizer.tokenize(text) for text in X_train]
X_val_tokenized = [word_tokenizer.tokenize(text) for text in X_val]

# Train word vectors on the training split only.
# NOTE(review): sg = 1 presumably selects skip-gram, and no seed is fixed — confirm against the gensim docs.
model = Word2Vec(X_train_tokenized, min_count = 1, vector_size= vector_size, workers = 3, window = 3, sg = 1)

In [None]:
def transform(X_tokenized):
    """Embed each tokenized document as the mean of its word vectors.

    Relies on the notebook globals ``model`` (the trained Word2Vec) and
    ``vector_size``.

    Parameters:
        X_tokenized: sequence of token lists, one per document.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(len(X_tokenized), vector_size)``.
        Tokens missing from the vocabulary count as zero vectors, so the sum of
        known vectors is divided by the total number of tokens. Documents with
        no tokens become all-zero rows.
    """
    # A plain 2-D ndarray is returned instead of np.matrix because current
    # scikit-learn releases reject np.matrix input.
    embeddings = np.zeros((len(X_tokenized), vector_size), dtype=np.float64)
    for row, tokens in enumerate(X_tokenized):
        if not tokens:
            # The mean of an empty list is NaN and would corrupt the whole matrix.
            continue
        known_vectors = [model.wv[token] for token in tokens if token in model.wv]
        if known_vectors:
            embeddings[row] = np.sum(known_vectors, axis=0, dtype=np.float64) / len(tokens)
    return embeddings

In [None]:
# Register the mean-embedding function; the True flag makes the tester pass it the tokenized splits.
tester.addEmbeddingTechniques(
    "word2Vec Mean Embedding", 
    EmbeddingTechniques(transform),
    True
)

# Fit and score the classifier once per registered technique.
tester.test(X_train, y_train, X_val, y_val, X_train_tokenized, X_val_tokenized)

## Feature Engineering

### Imports

In [94]:
from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import nltk

# Fetch the stop-word corpus that stopwords.words('english') reads further down.
# NOTE(review): the captured output below shows this download failing with an SSL
# certificate error — confirm the corpus is already available locally.
nltk.download('stopwords')
# Shared NLP helpers for the feature-engineering cells.
porter_stemmer = PorterStemmer()
word_tokenizer = TreebankWordTokenizer()
word_tokenizer2 = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1123)>


In [95]:
## tokenize cols
## Define function to remove stopwords and tokenize comments and or parent_comments
def create_stopwords_dict():
    """Return a dict mapping every English NLTK stop word to True.

    The dict is used purely for fast membership tests.
    """
    english_stopwords = set(stopwords.words('english'))
    return {word: True for word in english_stopwords}

# Build the lookup once so tokenization does not rebuild it for every comment.
stop_words_dict = create_stopwords_dict()


def remove_stopwords_and_tokenize(text):
    """Tokenize ``text`` and return the tokens that are not stop words.

    Uses the module-level ``word_tokenizer`` and ``stop_words_dict``. Tokens
    are compared exactly as produced by the tokenizer (no case folding).
    """
    return [
        token
        for token in word_tokenizer.tokenize(text)
        if token not in stop_words_dict
    ]

def remove_stopwords_and_tokenize_cols_in_dataset(dataset, cols):
    """Replace each text column in ``cols`` with its stop-word-free token list.

    Parameters:
        dataset: DataFrame to modify in place.
        cols: names of the text columns to tokenize.

    Returns:
        The same ``dataset`` object. Rows with a missing value in any of
        ``cols`` are dropped in place, and each listed column then holds
        lists of tokens.
    """
    for col in cols:
        dataset.dropna(subset=[col], inplace=True)
        # Tokenize the frame that was passed in. The previous version read the
        # unrelated global ``training_csv_1`` here, and did so twice.
        dataset[col] = dataset[col].apply(remove_stopwords_and_tokenize)
    return dataset

In [96]:
## After tokenization
## Define function to add length of comments and or parent comments
def add_length_feature_to_dataset(dataset, cols):
    """Add a ``num_<col>_words`` column holding ``len()`` of each value in ``col``.

    Intended for columns that already hold token lists. ``dataset`` is
    modified in place and also returned.
    """
    for source_col in cols:
        length_col = "num_" + source_col + "_words"
        dataset[length_col] = dataset[source_col].apply(len)
    return dataset

In [None]:
## Define a function that splits training set into just sarcasm and just non-sarcasm
def split_training_dataset_into_separate_labels(training_dataset):
    """Return ``(sarcasm_rows, non_sarcasm_rows)`` selected by the ``label`` column.

    Rows labelled 1 go to the first frame and rows labelled 0 to the second;
    rows with any other label appear in neither.
    """
    labels = training_dataset['label']
    is_sarcasm = labels == 1
    is_non_sarcasm = labels == 0
    return training_dataset[is_sarcasm], training_dataset[is_non_sarcasm]

## Define function to engineer features for model such as subreddit history and author history
def feature_history(training_dataset, col):
    """Summarise how often each value of ``col`` appears with each label.

    Parameters:
        training_dataset: DataFrame with a ``label`` column and column ``col``.
        col: name of the grouping column (for example author or subreddit).

    Returns:
        ``(proportion, total)``: two dicts keyed by the values of ``col``, in
        order of first appearance. ``proportion[v]`` is the fraction of rows
        with that value whose label is 1; ``total[v]`` is the number of rows
        with that value. Rows whose label is neither 0 nor 1 are ignored.
    """
    # value -> [sarcastic count, non-sarcastic count]
    counts = {}

    # Walk the two needed columns directly: iterrows() would build a full
    # Series for every row, which is very slow on a dataset of this size.
    for value, raw_label in zip(training_dataset[col], training_dataset['label']):
        label = int(raw_label)
        if label == 1:
            counts.setdefault(value, [0, 0])[0] += 1
        elif label == 0:
            counts.setdefault(value, [0, 0])[1] += 1

    proportion_sarcasm_by_feature_history = {}
    total_comments_by_feature_history = {}
    for value, (num_sarcasm, num_non_sarcasm) in counts.items():
        # A value is only recorded once it has been counted, so the total is never zero.
        total_comments = num_sarcasm + num_non_sarcasm
        proportion_sarcasm_by_feature_history[value] = num_sarcasm / total_comments
        total_comments_by_feature_history[value] = total_comments

    return proportion_sarcasm_by_feature_history, total_comments_by_feature_history



## Define function to prepare training dataset

def add_feature_history_to_train(train_dataset, col):
    """Attach per-value sarcasm statistics for ``col`` to the training frame.

    Adds ``sarcasm_proportion_by_<col>`` and ``total_num_comments_by_<col>``.
    Both are computed by ``feature_history`` from this same frame, so each
    row's own label contributes to its features. The frame is modified in
    place and also returned.
    """
    proportion_history, total_comments_history = feature_history(train_dataset, col)

    new_columns = (
        ("sarcasm_proportion_by_" + col, proportion_history),
        ("total_num_comments_by_" + col, total_comments_history),
    )
    for new_col, history in new_columns:
        train_dataset[new_col] = train_dataset[col].apply(lambda value: history[value])

    return train_dataset

## Define function to prepare testing dataset

def calculate_mean(table):
    """Return the arithmetic mean of the values of the mapping ``table``."""
    return sum(table.values()) / len(table)

def add_feature_history_to_test(test_dataset, col, proportion_history, total_comments_history):
    """Attach training-derived sarcasm statistics for ``col`` to a test frame.

    Values of ``col`` present in the history dicts get their recorded
    statistics. Unseen values fall back to the unweighted mean of the
    corresponding history dict (each known value counts once). The frame is
    modified in place and also returned.
    """
    fallback_proportion = calculate_mean(proportion_history)
    fallback_total = calculate_mean(total_comments_history)

    test_dataset["sarcasm_proportion_by_" + col] = test_dataset[col].apply(
        lambda value: proportion_history.get(value, fallback_proportion)
    )
    test_dataset["total_num_comments_by_" + col] = test_dataset[col].apply(
        lambda value: total_comments_history.get(value, fallback_total)
    )

    return test_dataset

In [98]:
## Before tokenizing
## Counting number of exclamation marks
def count_num_exclamation_marks(text):
    """Return how many times "!" occurs in ``text``."""
    occurrences = text.count("!")
    return occurrences


def add_num_exclamation_mark_in_feature(dataset, cols):
    """Add ``<col>_num_exclamation_marks`` for every column in ``cols``.

    Must run on raw text (before tokenization). ``dataset`` is modified in
    place and also returned.
    """
    for source_col in cols:
        target_col = source_col + "_num_exclamation_marks"
        dataset[target_col] = dataset[source_col].apply(count_num_exclamation_marks)
    return dataset

In [99]:
## Before tokenizing
## Counting number of repeated exclamation marks
def count_num_repeated_explanation_marks(text):
    """Return the number of non-overlapping "!!" pairs in ``text``.

    For example "!!!" counts as one pair and "!!!!" as two.
    """
    pairs = text.count("!!")
    return pairs


def add_num_repeated_exclamation_mark_in_feature(dataset, cols):
    """Add ``<col>_num_repeated_exclamation_marks`` for every column in ``cols``.

    Must run on raw text (before tokenization). ``dataset`` is modified in
    place and also returned.
    """
    for source_col in cols:
        target_col = source_col + "_num_repeated_exclamation_marks"
        dataset[target_col] = dataset[source_col].apply(count_num_repeated_explanation_marks)
    return dataset

In [100]:
## Before tokenizing
## Count number of emoticons
def count_num_common_emoticons(text):
    """Return the total number of occurrences of a fixed set of emoticons.

    Each emoticon is counted independently, so a span such as "(:)" is
    counted once for "(:" and once for ":)".
    """
    common_emoticons = (":(", ":)", "<3", ":'(", ":')", "):", "(:")
    return sum(text.count(emoticon) for emoticon in common_emoticons)


def add_num_emoticons_in_feature(dataset, cols):
    """Add ``<col>_num_emoticons`` for every column in ``cols``.

    Must run on raw text (before tokenization). ``dataset`` is modified in
    place and also returned.
    """
    for source_col in cols:
        target_col = source_col + "_num_emoticons"
        dataset[target_col] = dataset[source_col].apply(count_num_common_emoticons)
    return dataset

In [101]:
## Before tokenizing
## Count number of common "slang" style abbreviations
import re

# Whole-word, case-insensitive match for the slang list. Plain substring
# counting counted "roflmao" twice (it contains "rofl"), matched inside
# ordinary words ("lollipop", "music "), and missed upper-case forms such as
# "LOL" in the non-lower-cased feature-engineering data.
_SLANG_PATTERN = re.compile(
    r"\b(?:roflmao|rofl|smdh|smfh|smh|kms|lol|sic)\b",
    re.IGNORECASE,
)


def count_num_common_slang(text):
    """Return how many common slang abbreviations occur in ``text``.

    Abbreviations are matched as whole words and case-insensitively, so each
    occurrence is counted exactly once.
    """
    return len(_SLANG_PATTERN.findall(text))


def add_num_slang_in_feature(dataset, cols):
    """Add ``<col>_num_slang`` for every column in ``cols``.

    Must run on raw text (before tokenization). ``dataset`` is modified in
    place and also returned.
    """
    for col in cols:
        dataset[col + "_num_slang"] = dataset[col].apply(count_num_common_slang)
    return dataset

In [102]:
## After tokenizing
## Count number of misspelled words
from spellchecker import SpellChecker

# Shared English spell checker used by the misspelling features below.
spellchecker = SpellChecker(language="en")

def count_number_of_misspelled_words(text):
    """Return the size of the collection ``spellchecker.unknown(text)``.

    ``text`` is expected to be a list of tokens (this feature runs after
    tokenization).
    """
    return len(spellchecker.unknown(text))

def add_num_misspelled_words_feature(dataset, cols):
    """Add ``<col>_num_misspelled_words`` for every column in ``cols``.

    Must run on tokenized columns. ``dataset`` is modified in place and also
    returned.
    """
    for col in cols:
        # Pass the counter itself so it is called once per row. The previous
        # lambda returned the function object and filled the column with it.
        dataset[col + "_num_misspelled_words"] = dataset[col].apply(count_number_of_misspelled_words)
    return dataset

In [103]:
## After tokenizing
## Measure misspelling in a different way - by summing up edit distances
from nltk.metrics import edit_distance

def measure_sum_of_edit_distances(text):
    """Sum the edit distances between each unknown word and its correction.

    ``text`` is expected to be a list of tokens. Each word reported by
    ``spellchecker.unknown`` is compared with ``spellchecker.correction`` of
    that word. Words with no suggested correction add nothing to the sum.
    """
    total_distance = 0
    for misspelled_word in spellchecker.unknown(text):
        corrected_word = spellchecker.correction(misspelled_word)
        if corrected_word is None:
            # Some pyspellchecker versions return None when there is no
            # candidate; edit_distance cannot handle None, so skip the word.
            continue
        total_distance += edit_distance(corrected_word, misspelled_word)
    return total_distance

def add_sum_of_edit_distances_feature(dataset, cols):
    """Add ``<col>_edit_distance_misspelled_words`` for every column in ``cols``.

    Must run on tokenized columns. ``dataset`` is modified in place and also
    returned.
    """
    for source_col in cols:
        target_col = source_col + "_edit_distance_misspelled_words"
        dataset[target_col] = dataset[source_col].apply(measure_sum_of_edit_distances)
    return dataset

In [104]:
## Load CSV
# Reload the raw CSV so feature engineering starts from the original, non-lower-cased text.
training_csv_feature_engineering = pd.read_csv("train-balanced-sarcasm.csv")
## training_csv_feature_engineering.dropna(subset=['comment', 'parent_comment'], inplace=True)
# Force both text columns to str so the string-based feature functions work on every row.
# NOTE(review): with the dropna above disabled, astype(str) presumably turns missing
# values into the literal text "nan" — confirm this is intended.
training_csv_feature_engineering["comment"] = training_csv_feature_engineering["comment"].astype(str)
training_csv_feature_engineering["parent_comment"] = training_csv_feature_engineering["parent_comment"].astype(str)

In [105]:
# Column-name constants. The list-valued ones are passed as `cols` to the feature
# helpers; AUTHOR and SUBREDDIT are single column names for the history features.
COMMENT_AND_PARENT_COMMENT = ["comment", "parent_comment"]
COMMENT = ["comment"]
PARENT_COMMENT = ["parent_comment"]
AUTHOR = "author"
SUBREDDIT = "subreddit"

In [106]:
## Add BEFORE tokenization features
# These four features read the raw strings, so they run before the columns are
# replaced by token lists.
training_csv_feature_engineering = add_num_exclamation_mark_in_feature(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

training_csv_feature_engineering = add_num_repeated_exclamation_mark_in_feature(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

training_csv_feature_engineering = add_num_emoticons_in_feature(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

training_csv_feature_engineering = add_num_slang_in_feature(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

In [108]:
## tokenize
# Replace both text columns with stop-word-free token lists.
training_csv_feature_engineering = remove_stopwords_and_tokenize_cols_in_dataset(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

In [None]:
## Add AFTER tokenization features
# Token counts need the tokenized columns produced by the previous cell.
training_csv_feature_engineering = add_length_feature_to_dataset(
    training_csv_feature_engineering, COMMENT_AND_PARENT_COMMENT)

# Per-author sarcasm proportion and comment count, computed from this same frame.
training_csv_feature_engineering = add_feature_history_to_train(
    training_csv_feature_engineering, AUTHOR)

# Same two statistics per subreddit.
training_csv_feature_engineering = add_feature_history_to_train(
    training_csv_feature_engineering, SUBREDDIT)

In [None]:
## Normalise added features
from sklearn.preprocessing import MinMaxScaler

# Rescale every engineered feature (columns from position 10 onward) to [0, 1].
scaler = MinMaxScaler()
engineered_cols = list(training_csv_feature_engineering.columns[10:])
scaled_values = scaler.fit_transform(training_csv_feature_engineering[engineered_cols])
# Assign by column label so each column is replaced wholesale with float data.
# Writing floats through .iloc into the existing integer columns is a
# deprecated incompatible-dtype assignment in recent pandas.
training_csv_feature_engineering[engineered_cols] = scaled_values

In [None]:
## Preview of features
## Engineered features from column 10 to end (0 based indexing)
# Preview the first rows, including the engineered feature columns.
training_csv_feature_engineering.head()