In [1]:
#@unpublished{SARC,
#  authors={Mikhail Khodak and Nikunj Saunshi and Kiran Vodrahalli},
#  title={A Large Self-Annotated Corpus for Sarcasm},
#  url={https://arxiv.org/abs/1704.05579},
#  year=2017
#}
#https://www.kaggle.com/danofer/sarcasm#train-balanced-sarcasm.csv
import os
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/asel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/asel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# df - data frame
df = pd.read_csv('../train-balanced-sarcasm.csv')
# dropping empty comment entries
df.dropna(subset=['comment'], inplace=True)

print('Type:')
print('Labels array ',type(df.label))
print('Comments array ',type(df.comment))
print('one comment line ', type(df.comment[0]))
print('Shape:')
print('Labels array ',df.label.shape)
print('Comments array ',df.comment.shape)
print('Two first entries:')
print (df.label[0], df.comment[0])
print (df.label[1], df.comment[1])

Type:
Labels array  <class 'pandas.core.series.Series'>
Comments array  <class 'pandas.core.series.Series'>
one comment line  <class 'str'>
Shape:
Labels array  (1010773,)
Comments array  (1010773,)
Two first entries:
0 NC and NH.
0 You do know west teams play against west teams more than east teams right?


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transform
tf_idf_vectorizer = TfidfVectorizer()
# tokenize and build vocab
T = tf_idf_vectorizer.fit(df.comment)
print(len(T.vocabulary_))

167435


In [8]:
# preprocess with nltk
def my_tokenizer(corpus):
    corpus_tokenized = []
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    sbs = nltk.stem.SnowballStemmer('english', ignore_stopwords=False)
    for comment in corpus:
        words = tokenizer.tokenize(comment)
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if not word in stop_words]
        
        cmnt_t = []
        for token in words:
            cmnt_t.append(sbs.stem(token))
            # make a string to be compatible with TfidfVectorizer
            c = ' '.join(cmnt_t)
        # Lemmitize
        #words = [lemmatizer.lemmatize(word) for word in words]
        
        corpus_tokenized.append(c)
    return corpus_tokenized

In [9]:
df2 = my_tokenizer(df.comment)

In [10]:
for i in range(10):
    print(df.comment[i])
    print(df2[i], '\n')

NC and NH.
nc nh 

You do know west teams play against west teams more than east teams right?
you know west team play west team east team right 

They were underdogs earlier today, but since Gronk's announcement this afternoon, the Vegas line has moved to patriots -1
they underdog earlier today sinc gronk announc afternoon vega line move patriot 1 

This meme isn't funny none of the "new york nigga" ones are.
this meme funni none new york nigga one 

I could use one of those tools.
i could use one tool 

I don't pay attention to her, but as long as she's legal I wouldn't kick her out of bed (before she took a load)
i pay attent long legal i kick bed took load 

Trick or treating in general is just weird...
trick treat general weird 

Blade Mastery+Masamune or GTFO!
blade masteri masamun gtfo 

You don't have to, you have a good build, buy games or save it
you good build buy game save 

I would love to see him at lolla.
i would love see lolla 



In [11]:
print(df2[0:5], '\n')
# convert to pandas Series type type to be compatible with TfidfVectorizer
df3 = pd.Series((v for v in df2))
print(type(df3), '\n')
print(type(df3[0]))
print(df3[0:5])

['nc nh', 'you know west team play west team east team right', 'they underdog earlier today sinc gronk announc afternoon vega line move patriot 1', 'this meme funni none new york nigga one', 'i could use one tool'] 

<class 'pandas.core.series.Series'> 

<class 'str'>
0                                                nc nh
1    you know west team play west team east team right
2    they underdog earlier today sinc gronk announc...
3              this meme funni none new york nigga one
4                                 i could use one tool
dtype: object


In [12]:
tf_idf_vectorizer2 = TfidfVectorizer()
T2 = tf_idf_vectorizer2.fit(df3)
print(len(T2.vocabulary_))

131021


167435 features from before now reduced to 131022 features, that is about 22%

In [13]:
from sklearn.model_selection import train_test_split
# divide into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.comment, df.label, train_size=0.33)



In [14]:
# Vetorize the training data set
X_train = tf_idf_vectorizer2.transform(X_train)
print(X_train.shape)
print(X_train[0:3])

(333555, 131021)
  (0, 129066)	0.2747577803641861
  (0, 83885)	0.3454441632114769
  (0, 54387)	0.8973162721317763
  (1, 118236)	0.6184954767488434
  (1, 107004)	0.535507342683356
  (1, 69724)	0.5750610673427923
  (2, 10482)	1.0


In [15]:
# Vetorize the test data set
X_test = tf_idf_vectorizer.transform(X_test)
print(X_test.shape)
print(X_test[0:3])

(677218, 167435)
  (0, 165906)	0.19481101885681346
  (0, 157974)	0.7667828790068465
  (0, 114177)	0.4769141948466149
  (0, 93933)	0.3829432518634413
  (1, 165550)	0.30540612723431504
  (1, 160254)	0.1450546477521508
  (1, 156169)	0.2306806244076342
  (1, 148853)	0.1825650594216048
  (1, 147462)	0.0808942046788216
  (1, 143649)	0.3860938594733284
  (1, 123764)	0.24001960733465044
  (1, 90264)	0.4996971830466415
  (1, 88063)	0.2399326048038073
  (1, 85575)	0.22231469861583344
  (1, 60957)	0.4558715196361233
  (1, 19173)	0.14633299493665675
  (2, 165906)	0.17309321079465828
  (2, 161688)	0.17793153930935893
  (2, 147462)	0.13683337543760463
  (2, 122485)	0.37994267871695603
  (2, 120010)	0.3085458982007701
  (2, 106800)	0.2305002592986953
  (2, 105958)	0.09355212000252965
  (2, 99457)	0.1598801578929154
  (2, 90699)	0.18262713575224507
  (2, 82551)	0.20405534647894139
  (2, 77613)	0.09116120329020458
  (2, 73867)	0.12244897712687072
  (2, 70599)	0.14817513282961015
  (2, 68759)	0.21553380

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# bagging with Logistic Regressoin
# oob Out-Of-Bag
bag_log_reg = BaggingClassifier(
      LogisticRegression(), n_estimators=50,
    max_samples=50000, bootstrap=True, n_jobs=-1, oob_score=True)

bag_log_reg.fit(X_train,y_train)
print('oob_score is ', bag_log_reg.oob_score_)

KeyboardInterrupt: 

In [17]:
from sklearn.ensemble import RandomForestClassifier
rnd_frst = RandomForestClassifier(
      n_estimators=100, max_leaf_nodes=1000, n_jobs=-1, oob_score=True)
rnd_frst.fit(X_train,y_train)
print('oob_score is ', bag_log_reg.oob_score_)

# 5-folds cross validation with grid search
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# I tried with n_jobs=-1 but kernel died after a few hours
params = [{'n_estimators': [100, 200], 'min_samples_leaf': [1, 10, 50, 100],
                     'random_state': [47], 'n_jobs': [2]}]
rnd_frst = GridSearchCV(RandomForestClassifier(), params, cv=5, scoring='accuracy')

#clf = RandomForestClassifier(n_estimators=200, min_samples_leaf=50, random_state=31, n_jobs = -1)
rnd_frst.fit(X_train, y_train)
rint('oob_score is ', rnd_frst.oob_score_)

KeyboardInterrupt: 