In [1]:
#@unpublished{SARC,
#  authors={Mikhail Khodak and Nikunj Saunshi and Kiran Vodrahalli},
#  title={A Large Self-Annotated Corpus for Sarcasm},
#  url={https://arxiv.org/abs/1704.05579},
#  year=2017
#}
#https://www.kaggle.com/danofer/sarcasm#train-balanced-sarcasm.csv
import os
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/asel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/asel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# df - data frame
df = pd.read_csv('../train-balanced-sarcasm.csv')
# dropping empty comment entries
df.dropna(subset=['comment'], inplace=True)

print('Type:')
print('Labels array ',type(df.label))
print('Comments array ',type(df.comment))
print('one comment line ', type(df.comment[0]))
print('Shape:')
print('Labels array ',df.label.shape)
print('Comments array ',df.comment.shape)
print('Two first entries:')
print (df.label[0], df.comment[0])
print (df.label[1], df.comment[1])

Type:
Labels array  <class 'pandas.core.series.Series'>
Comments array  <class 'pandas.core.series.Series'>
one comment line  <class 'str'>
Shape:
Labels array  (1010773,)
Comments array  (1010773,)
Two first entries:
0 NC and NH.
0 You do know west teams play against west teams more than east teams right?


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
# create the transform
tf_idf_vectorizer = TfidfVectorizer()
# tokenize and build vocab
T = tf_idf_vectorizer.fit(df.comment)
print(len(T.vocabulary_))

167435


In [4]:
# preprocess with nltk
def my_tokenizer(corpus):
    corpus_tokenized = []
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    sbs = nltk.stem.SnowballStemmer('english', ignore_stopwords=False)
    for comment in corpus:
        words = tokenizer.tokenize(comment)
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if not word in stop_words]
        
        cmnt_t = []
        for token in words:
            cmnt_t.append(sbs.stem(token))
            # make a string to be compatible with TfidfVectorizer
            c = ' '.join(cmnt_t)
        # Lemmitize
        #words = [lemmatizer.lemmatize(word) for word in words]
        
        corpus_tokenized.append(c)
    return corpus_tokenized

In [5]:
df2 = my_tokenizer(df.comment)

In [6]:
for i in range(10):
    print(df.comment[i])
    print(df2[i], '\n')

NC and NH.
nc nh 

You do know west teams play against west teams more than east teams right?
you know west team play west team east team right 

They were underdogs earlier today, but since Gronk's announcement this afternoon, the Vegas line has moved to patriots -1
they underdog earlier today sinc gronk announc afternoon vega line move patriot 1 

This meme isn't funny none of the "new york nigga" ones are.
this meme funni none new york nigga one 

I could use one of those tools.
i could use one tool 

I don't pay attention to her, but as long as she's legal I wouldn't kick her out of bed (before she took a load)
i pay attent long legal i kick bed took load 

Trick or treating in general is just weird...
trick treat general weird 

Blade Mastery+Masamune or GTFO!
blade masteri masamun gtfo 

You don't have to, you have a good build, buy games or save it
you good build buy game save 

I would love to see him at lolla.
i would love see lolla 



In [7]:
print(df2[0:5], '\n')
# convert to pandas Series type type to be compatible with TfidfVectorizer
df3 = pd.Series((v for v in df2))
print(type(df3), '\n')
print(type(df3[0]))
print(df3[0:5])

['nc nh', 'you know west team play west team east team right', 'they underdog earlier today sinc gronk announc afternoon vega line move patriot 1', 'this meme funni none new york nigga one', 'i could use one tool'] 

<class 'pandas.core.series.Series'> 

<class 'str'>
0                                                nc nh
1    you know west team play west team east team right
2    they underdog earlier today sinc gronk announc...
3              this meme funni none new york nigga one
4                                 i could use one tool
dtype: object


In [8]:
tf_idf_vectorizer2 = TfidfVectorizer()
T2 = tf_idf_vectorizer2.fit(df3)
print(len(T2.vocabulary_))

131021


167435 features from before now reduced to 131022 features, that is about 22%

In [18]:
from sklearn.model_selection import train_test_split
# divide into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.comment, df.label, train_size=0.33)



In [19]:
# Vetorize the training data set
X_train = tf_idf_vectorizer2.transform(X_train)
print(X_train.shape)
print(X_train[0:3])

(333555, 131021)
  (0, 114778)	0.2334939475344019
  (0, 102935)	0.5677332522023989
  (0, 84472)	0.22282278344663098
  (0, 83478)	0.27529710484178005
  (0, 60384)	0.30468182380446995
  (0, 30995)	0.5934644037853228
  (0, 21924)	0.22954308905787657
  (1, 115136)	0.2284741761173503
  (1, 114778)	0.19298844528743025
  (1, 95230)	0.33542329444018865
  (1, 61783)	0.3039260641598386
  (1, 56525)	0.24137101127713298
  (1, 44898)	0.289972082570119
  (1, 42787)	0.24238170379244572
  (1, 32650)	0.4204493472434724
  (1, 25744)	0.3534429208489977
  (1, 12075)	0.278175774258411
  (1, 7590)	0.3542614776346883
  (2, 101473)	0.7029704241989476
  (2, 75344)	0.6817627300723437
  (2, 68870)	0.20256397158886882


In [20]:
# Vetorize the test data set
X_test = tf_idf_vectorizer2.transform(X_test)
print(X_test.shape)
print(X_test[0:3])

(677218, 131021)
  (0, 129066)	0.22166608712200553
  (0, 123953)	0.3295551274578615
  (0, 82633)	0.3632277947927076
  (0, 72174)	0.2764799124558602
  (0, 70172)	0.37626630689607005
  (0, 60384)	0.31897725569178426
  (0, 49674)	0.24421637791368989
  (0, 44898)	0.3672938685479092
  (0, 17256)	0.27717811021565053
  (0, 9708)	0.34523186068774
  (1, 129631)	0.37260552473816205
  (1, 95634)	0.741786536583155
  (1, 82407)	0.2496975558846948
  (1, 81766)	0.21767408314268974
  (1, 75520)	0.4485387176061454
  (2, 127590)	0.5287348428366397
  (2, 100229)	0.4211753755497816
  (2, 84817)	0.4372026276829753
  (2, 82407)	0.42255716974413754
  (2, 17160)	0.4163532990693601


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# bagging with Logistic Regressoin
# oob Out-Of-Bag
bag_log_reg = BaggingClassifier(
      LogisticRegression(), n_estimators=50,
    max_samples=50000, bootstrap=True, n_jobs=-1, oob_score=True)

bag_log_reg.fit(X_train,y_train)
print('oob_score is ', bag_log_reg.oob_score_)

oob_score is  0.6562306066465801


In [23]:
from sklearn.ensemble import RandomForestClassifier

rnd_frst = RandomForestClassifier(n_estimators=200, min_samples_leaf=50, random_state=31, n_jobs = -1, oob_score=True)
rnd_frst.fit(X_train, y_train)
print('oob_score is ', rnd_frst.oob_score_)

oob_score is  0.6509691055448127


In [24]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, bag_log_reg.predict(X_test))

0.6575209164552626

In [25]:
accuracy_score(y_test, rnd_frst.predict(X_test))

0.6526775720668972

In [28]:
from xgboost import XGBClassifier

eval_set = [(X_train, y_train), (X_test, y_test)]
params = {
    'booster': 'gbtree',
    'eta': np.random.uniform(.05,.2),
    'min_child_weight': np.random.randint(3,7),
    'max_depth': np.random.randint(3,10),
    'subsample': np.random.uniform(.5,1),
    'colsample_bytree': np.random.uniform(.5,1)
}

xgb = XGBClassifier(**params)
xgb.fit(X_train, y_train, eval_metric=["error", "logloss"], eval_set=eval_set)

[0]	validation_0-error:0.413851	validation_0-logloss:0.68777	validation_1-error:0.416305	validation_1-logloss:0.69115
[1]	validation_0-error:0.405477	validation_0-logloss:0.683568	validation_1-error:0.408481	validation_1-logloss:0.687557
[2]	validation_0-error:0.39422	validation_0-logloss:0.681793	validation_1-error:0.397187	validation_1-logloss:0.679756
[3]	validation_0-error:0.392319	validation_0-logloss:0.67551	validation_1-error:0.395148	validation_1-logloss:0.674936
[4]	validation_0-error:0.390014	validation_0-logloss:0.674582	validation_1-error:0.3929	validation_1-logloss:0.673854
[5]	validation_0-error:0.391249	validation_0-logloss:0.67102	validation_1-error:0.3947	validation_1-logloss:0.674602
[6]	validation_0-error:0.38987	validation_0-logloss:0.66867	validation_1-error:0.393392	validation_1-logloss:0.672301
[7]	validation_0-error:0.384221	validation_0-logloss:0.666242	validation_1-error:0.388182	validation_1-logloss:0.665669
[8]	validation_0-error:0.384203	validation_0-loglos

[69]	validation_0-error:0.343799	validation_0-logloss:0.621251	validation_1-error:0.354344	validation_1-logloss:0.62888
[70]	validation_0-error:0.34356	validation_0-logloss:0.621064	validation_1-error:0.35409	validation_1-logloss:0.628605
[71]	validation_0-error:0.343374	validation_0-logloss:0.620349	validation_1-error:0.353975	validation_1-logloss:0.628176
[72]	validation_0-error:0.343197	validation_0-logloss:0.620032	validation_1-error:0.353786	validation_1-logloss:0.62902
[73]	validation_0-error:0.343068	validation_0-logloss:0.619661	validation_1-error:0.353583	validation_1-logloss:0.628831
[74]	validation_0-error:0.342894	validation_0-logloss:0.619553	validation_1-error:0.353502	validation_1-logloss:0.628628
[75]	validation_0-error:0.342504	validation_0-logloss:0.619305	validation_1-error:0.353164	validation_1-logloss:0.6267
[76]	validation_0-error:0.341848	validation_0-logloss:0.619039	validation_1-error:0.352801	validation_1-logloss:0.62651
[77]	validation_0-error:0.341221	valida

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.7025961627866093, eta=0.17370050434368975,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=6, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.5947331682504324)

In [29]:
accuracy_score(y_test, xgb.predict(X_test))

0.6514416332702321

In [30]:
eval_set = [(X_train, y_train), (X_test, y_test)]
params = {
    'booster': 'gblinear',
    'lambda': 0.2,
}

xgb_lin = XGBClassifier(**params)
xgb_lin.fit(X_train, y_train, eval_metric=["error", "logloss"], eval_set=eval_set)

[0]	validation_0-error:0.498506	validation_0-logloss:0.691342	validation_1-error:0.500763	validation_1-logloss:0.689393
[1]	validation_0-error:0.498506	validation_0-logloss:0.691335	validation_1-error:0.500763	validation_1-logloss:0.68939
[2]	validation_0-error:0.498506	validation_0-logloss:0.691293	validation_1-error:0.500763	validation_1-logloss:0.689369
[3]	validation_0-error:0.498506	validation_0-logloss:0.691256	validation_1-error:0.500763	validation_1-logloss:0.68935
[4]	validation_0-error:0.498506	validation_0-logloss:0.691264	validation_1-error:0.500763	validation_1-logloss:0.689355
[5]	validation_0-error:0.498506	validation_0-logloss:0.69124	validation_1-error:0.500763	validation_1-logloss:0.689349
[6]	validation_0-error:0.498506	validation_0-logloss:0.691063	validation_1-error:0.500763	validation_1-logloss:0.689267
[7]	validation_0-error:0.498506	validation_0-logloss:0.690808	validation_1-error:0.500763	validation_1-logloss:0.68915
[8]	validation_0-error:0.498506	validation_0

[68]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[69]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[70]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[71]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[72]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[73]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[74]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[75]	validation_0-error:0.498506	validation_0-logloss:0.693535	validation_1-error:0.500763	validation_1-logloss:0.690995
[76]	validation_0-error:0.498506

XGBClassifier(base_score=0.5, booster='gblinear', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, lambda=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1)

In [None]:
accuracy_score(y_test, xgb.predict(X_test))