<a href="https://colab.research.google.com/github/Minusadd/haotianfu.github.io/blob/master/irony_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy
!pip install sklearn
!pip install spacytextblob
import csv
import pandas as pd
import random
from functools import reduce
import spacy
import numpy as np
import pickle
import os

from google.colab import drive
drive.mount("/content/drive", force_remount=True)

In [2]:
FILEPATH = "subreddit_irony_data.csv"

In [3]:
#load data for baseline and remove nan
def load_data():

  file = pd.read_csv(FILEPATH)
  file.replace("nan",np.nan,inplace=True)
  file = file.dropna(subset=['comment', 'label'], axis=0, how='any')
  raw_comment = file.loc[:, "comment"]
  labels = file.loc[:, "label"]

  return file, list(raw_comment), list(labels)


In [4]:
from spacy.tokenizer import Tokenizer

def preprocessing_part(raw_tweets):
  
  # parse all the comments with spacy

  cache_path = "raw_comments.pickle"
  nlp = spacy.load('en_core_web_sm')
  if os.path.exists(cache_path): 
    print("Loading parses from cache at %s"%cache_path)
    parsed_tweets = pickle.load(open(cache_path, 'rb'))
  else:
    parsed_tweets = []
    for i,r in enumerate(raw_tweets):
      if i % 1000 == 0:
        print("Processed %d out of %d"%(i, len(raw_tweets)))
      #print(r)
      parsed_tweets.append(nlp(r))
    if cache_path is not None:
       pickle.dump(parsed_tweets, open(cache_path, 'wb'))
  new = []
  cot = []
  #simple preprocessing
  for r in parsed_tweets:
    new1= []
    for token in r:
      token = nlp(token.lemma_)
      token = token[0].norm_.lower()
      
      new1.append(token)
      cot.append(token)

    new.append(new1)
  
  return new


In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#uni-gram and bi-gram features extraction
def featurize_part(preproc_X, dv=None, isTest = False):

  vectorizer = CountVectorizer(ngram_range=(1,2))
  return vectorizer.fit_transform(preproc_X)

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import GridSearchCV


def run_kfold_crossval1(X, y, k=5, feature_fn=featurize_part):
  acc = []
  recall_scores = []
  precision_scores = []
  X = feature_fn(X)
  for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42) #split data
    svm = SGDClassifier(loss="log", penalty="l2", alpha=.01, class_weight='balanced')
    parameters = {'alpha':[0.001, .01,  .1, 1., 0.0001, 0.05, 0.005, 0.5]}
    clf = GridSearchCV(svm, parameters, scoring='f1') #hyperparameter search
    clf = clf.fit(X_train, y_train)
    recall_scores.append(recall_score(y_test, clf.predict(X_test)))
    precision_scores.append(precision_score(y_test, clf.predict(X_test)))
  return recall_scores, precision_scores, clf

In [7]:
from scipy.stats import describe
from math import sqrt

def make_plots(perfs, names):
  means = []
  stds = []
  for i,perf in enumerate(perfs):
   n, minmax, mean, var, skew, kurt = describe(perf)
   means.append(mean)
   stds.append(sqrt(var))
   print("%s:\t%.03f"%(names[i], mean))
  plt.bar(np.arange(len(means)), means, yerr=stds)
  plt.xticks(np.arange(len(names)), names)
  plt.show()

In [8]:

import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
#load data for fully-featurized model
def load_data2():
  file = pd.read_csv(FILEPATH)
  raw_comment = file.loc[:, "comment"]
  labels = file.loc[:, "label"]
  threads = file.loc[:, "thread_title"]


  return file, list(raw_comment), list(labels), list(threads)


def preprocessing_part1(raw_tweets):
  # parse all the comments with spacy and calculate sentiments

  cache_path = "raw_comments_nnp.pickle"
  cache_path2 = "raw_comments_sentiment.pickle"
  nlp = spacy.load('en_core_web_sm')
  nlp.add_pipe('spacytextblob')
  if os.path.exists(cache_path) and os.path.exists(cache_path2): 
    print("Loading parses from cache at %s"%cache_path)
    parsed_tweets = pickle.load(open(cache_path, 'rb'))
    sentiments = pickle.load(open(cache_path2, 'rb'))
  else:
    parsed_tweets = []
    sentiments = []
    for i,r in enumerate(raw_tweets):
      if i % 1000 == 0:
        print("Processed %d out of %d"%(i, len(raw_tweets)))
      temp = nlp(r)
      parsed_tweets.append(temp)
      sentiments.append(temp._.blob.polarity>0) #sentiment calculation
    if cache_path is not None:
       pickle.dump(parsed_tweets, open(cache_path, 'wb'))
       pickle.dump(sentiments, open(cache_path2, 'wb'))
  new = []
  #simple preprocessing
  for r in parsed_tweets:
    new1= []
    for token in r:
      if token.is_stop or token.is_punct or token.is_space:
        continue
      if token.tag_ != 'NNP': #extract NNPs
        continue  
      else:
        if token.like_num:
          token = str("NUM")  
        else:
          token = nlp(token.lemma_)
          token = token[0].norm_.lower()
      
        new1.append(token)
      
      new1.append(token)

    new.append(new1)

  return new, sentiments

def preprocessing_part2(raw_tweets):
  
  # parse all the thread titles with spacy
  cache_path = "raw_threads_nnp.pickle"
  nlp = spacy.load('en_core_web_sm')
  if os.path.exists(cache_path): 
    print("Loading parses from cache at %s"%cache_path)
    parsed_tweets = pickle.load(open(cache_path, 'rb'))
  else:
    parsed_tweets = []
    for i,r in enumerate(raw_tweets):
      if i % 1000 == 0:
        print("Processed %d out of %d"%(i, len(raw_tweets)))
      parsed_tweets.append(nlp(r))
    if cache_path is not None:
       pickle.dump(parsed_tweets, open(cache_path, 'wb'))
  new = []
  # simple preprocessing
  for r in parsed_tweets:
    new1= []
    for token in r:
      if token.is_stop or token.is_punct or token.is_space:
        continue
      if token.tag_ != 'NNP':
        continue
      else:
        if token.like_num:
          token = str("NUM")  
        else:
          token = nlp(token.lemma_)
          token = token[0].norm_.lower()
      
        new1.append(token)

    new.append(new1)

  return new  

In [9]:
#iteratively find the parent comments of a given comment and concatenate all of them
def find_comments(file, comments, full_comments, n):
  comments = comments + full_comments[n]
  if not np.isnan(file.loc[n,'parent_id']):
    parent_index = file[file.comment_id==file.loc[n,'parent_id']].index.tolist()
    for i in range(len(parent_index)):
      comments = find_comments(file, comments, full_comments, parent_index[i])
  return comments
#feature extraction of the interaction term and sentiments
def featurize_part2(file, X, comments, threads, sentiments, dv=None, isTest = False):
  
  dicts = []
  words = []
  #bag of NNPs
  for i in X:
    words.extend(comments[i])

  for i in X:
    words.extend(threads[i])
  words = sorted(list(set(words)))
  words_ = ["".join([val, "{}".format(i)]) for val in words for i in range(4)]
  #add another feature for sentiments
  words_.append('sentiment1')
  step = 0
  #interaction term of NNP + subreddit + sentiment
  for j in X:
    if not np.isnan(file.loc[j,'label']): #remove nan
      full_comments_w = find_comments(file, [], comments, j)
      full_comments_w = full_comments_w + threads[j]
      bow = np.zeros(4*len(words)+1)
      for w in full_comments_w:
        for i, w1 in enumerate(words):
          if w==w1:
            if sentiments[j] == 0 and file.loc[j, 'subreddit'] == 'progressive':
              bow[i*4] += 1
            elif sentiments[j] == 0 and file.loc[j, 'subreddit'] == 'Conservative':
              bow[i*4+1] += 1
            elif sentiments[j] == 1 and file.loc[j, 'subreddit'] == 'progressive':
              bow[i*4+2] += 1
            elif sentiments[j] == 1 and file.loc[j, 'subreddit'] == 'Conservative':
              bow[i*4+3] += 1
      bow[-1] = sentiments[j]
      ow = dict(zip(words_, bow))
      dicts.append(ow)
      step += 1
  #Dictvectorizer
  if isTest is False:
    dv = DictVectorizer()
    X = dv.fit_transform(dicts)
    return X, dv
  else:
    return dv.transform(dicts), dv

In [10]:
# Input: List<List<String>> X, Vector<Int> y
# Output: List<Float> accuracies
def run_kfold_crossval2(comments, threads, y, file, sentiments, k=5, feature_fn=featurize_part2):
  acc = []
  recall_scores = []
  precision_scores = []
  X = list(range(len(y)))
  for i in range(k):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=42)
    X_train, dv = feature_fn(file, X_train, comments, threads, sentiments)
    X_test, dv = feature_fn(file, X_test, comments, threads, sentiments, dv=dv, isTest=True)
    y_train = np.array(y_train)[~np.isnan(y_train)].tolist() #remove nan
    y_test = np.array(y_test)[~np.isnan(y_test)].tolist() #remove nan
    svm = SGDClassifier(loss="log", penalty="l2", alpha=.01, class_weight='balanced')
    parameters = {'alpha':[0.001, 0.005, 0.0001, .01, 0.05, 0.5, .1]} #hyperparameter search
    clf = GridSearchCV(svm, parameters, scoring='f1')
    clf = clf.fit(X_train, y_train)
    recall_scores.append(recall_score(y_test, clf.predict(X_test)))
    precision_scores.append(precision_score(y_test, clf.predict(X_test)))
    print("recall:", recall_score(y_test, clf.predict(X_test)))
    print("precision:", precision_score(y_test, clf.predict(X_test)))

  return recall_scores, precision_scores, clf


In [11]:
def main():
  file, comments, labels = load_data()
  X_preproc = preprocessing_part(comments)
  recall_score1, precision_score1, clf = run_kfold_crossval1(comments, labels, k=100, feature_fn=featurize_part) 
  bow_mean_precision, bow_median_precision, bow_25th_perc_precision, bow_75th_perc_precision = np.mean(precision_score1), np.quantile(precision_score1, 0.5), np.quantile(precision_score1, 0.25), np.quantile(precision_score1, 0.75)
  bow_mean_recall, bow_median_recall, bow_25th_perc_recall, bow_75th_perc_recall = np.mean(recall_score1), np.quantile(recall_score1, 0.5), np.quantile(recall_score1, 0.25), np.quantile(recall_score1, 0.75)
  print("mean_precision:", bow_mean_precision)
  print("mean_recall:", bow_mean_recall)
  file, raw_comment, labels, threads = load_data2()
  comments, sentiments = preprocessing_part1(raw_comment)
  threads = preprocessing_part2(threads)
  recall_score2, precision_score2, clf_ = run_kfold_crossval2(comments, threads, labels, file, sentiments, k=50, feature_fn=featurize_part2)
  np_mean_precision, np_median_precision, np_25th_perc_precision, np_75th_perc_precision = np.mean(precision_score2), np.quantile(precision_score2, 0.5), np.quantile(precision_score2, 0.25), np.quantile(precision_score2, 0.75)
  np_mean_recall, np_median_recall, np_25th_perc_recall, np_75th_perc_recall = np.mean(recall_score2), np.quantile(recall_score2, 0.5), np.quantile(recall_score2, 0.25), np.quantile(recall_score2, 0.75)

  
  #do not edit below this line

  def fformat(f):
    return "%.2f" % f

  print("Bag of Words Baseline")
  print("Precision")
  print(fformat(bow_mean_precision), fformat(bow_median_precision), fformat(bow_25th_perc_precision), fformat(bow_75th_perc_precision))
  print("Recall")
  print(fformat(bow_mean_recall), fformat(bow_median_recall), fformat(bow_25th_perc_recall), fformat(bow_75th_perc_recall))

  print("NP Sentiment Context Model")
  print("Precision")
  print(fformat(np_mean_precision), fformat(np_median_precision), fformat(np_25th_perc_precision), fformat(np_75th_perc_precision))
  print("Recall")
  print(fformat(np_mean_recall), fformat(np_median_recall), fformat(np_25th_perc_recall), fformat(np_75th_perc_recall))



In [None]:
main()

Loading parses from cache at raw_comments.pickle
mean_precision: 0.08763273856175895
mean_recall: 0.13825
Loading parses from cache at raw_comments_nnp.pickle
Loading parses from cache at raw_threads_nnp.pickle
recall: 0.34285714285714286
precision: 0.1875
recall: 0.37142857142857144
precision: 0.1625
recall: 0.37142857142857144
precision: 0.19696969696969696
recall: 0.37142857142857144
precision: 0.19117647058823528
recall: 0.42857142857142855
precision: 0.17857142857142858
recall: 0.42857142857142855
precision: 0.24193548387096775
recall: 0.37142857142857144
precision: 0.22413793103448276
recall: 0.37142857142857144
precision: 0.18309859154929578
recall: 0.4
precision: 0.175
recall: 0.34285714285714286
precision: 0.21052631578947367
recall: 0.34285714285714286
precision: 0.21818181818181817
recall: 0.37142857142857144
precision: 0.20634920634920634
recall: 0.34285714285714286
precision: 0.18461538461538463
recall: 0.37142857142857144
precision: 0.20634920634920634
recall: 0.4
precisi