In [1]:
import pandas as pd, numpy as np, re, time
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
# import sklearn.externals.joblib as extjoblib
import joblib

In [2]:
# Load the headlines corpus; lines=True makes pandas parse one JSON object per line.
dataset = pd.read_json(
    'Sarcasm_Headlines_Dataset.json',
    lines=True,
)

In [3]:
# Display the loaded frame (pandas truncates the middle rows in the rendering).
dataset

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
...,...,...,...
26704,https://www.huffingtonpost.com/entry/american-...,american politics in moral free-fall,0
26705,https://www.huffingtonpost.com/entry/americas-...,america's best 20 hikes,0
26706,https://www.huffingtonpost.com/entry/reparatio...,reparations and obama,0
26707,https://www.huffingtonpost.com/entry/israeli-b...,israeli ban targeting boycott supporters raise...,0


In [4]:
# Per-column missing-value check: a True entry would flag a column containing
# at least one null.
dataset.isna().any(axis=0)

article_link    False
headline        False
is_sarcastic    False
dtype: bool

In [5]:
# Preview the first five rows before any text cleaning.
dataset.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [6]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# other symbols) with a single space in the headline column.
# The vectorized .str accessor applies the regular expression column-wise and
# leaves a missing value as NaN instead of raising inside a per-row lambda.
dataset['headline'] = dataset['headline'].str.replace('[^a-zA-Z]', ' ', regex=True)

In [7]:
# Confirm that selecting a single column yields a pandas Series.
type(dataset['headline'])

pandas.core.series.Series

In [8]:
# Preview the headlines again, now with non-letter characters blanked out.
dataset.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the roseanne revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son s web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,boehner just wants wife to listen not come up...,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j k rowling wishes snape happy birthday in th...,0


In [36]:
# Model input is the cleaned headline text; the target is the is_sarcastic column.
features, labels = dataset['headline'], dataset['is_sarcastic']

In [37]:
# Inspect the headline Series that will be used as model input.
features

0        former versace store clerk sues over secret  b...
1        the  roseanne  revival catches up to our thorn...
2        mom starting to fear son s web series closest ...
3        boehner just wants wife to listen  not come up...
4        j k  rowling wishes snape happy birthday in th...
                               ...                        
26704                 american politics in moral free fall
26705                              america s best    hikes
26706                                reparations and obama
26707    israeli ban targeting boycott supporters raise...
26708                    gourmet gifts for the foodie     
Name: headline, Length: 26709, dtype: object

In [38]:
# Inspect the is_sarcastic target Series.
labels

0        0
1        0
2        1
3        1
4        0
        ..
26704    0
26705    0
26706    0
26707    0
26708    0
Name: is_sarcastic, Length: 26709, dtype: int64

In [39]:
# Reduce every whitespace-separated token to its Porter stem and rebuild each
# headline as one string with single spaces between the stems.
ps = PorterStemmer()
features = features.apply(
    lambda headline: ' '.join(ps.stem(token) for token in headline.split())
)

In [40]:
# Inspect the stemmed headlines.
features

0        former versac store clerk sue over secret blac...
1        the roseann reviv catch up to our thorni polit...
2        mom start to fear son s web seri closest thing...
3        boehner just want wife to listen not come up w...
4        j k rowl wish snape happi birthday in the most...
                               ...                        
26704                    american polit in moral free fall
26705                                  america s best hike
26706                                      repar and obama
26707    isra ban target boycott support rais alarm abroad
26708                           gourmet gift for the foodi
Name: headline, Length: 26709, dtype: object

In [41]:
# Vectorize the stemmed headlines with TF-IDF, capping the vocabulary at 5000 terms.
# NOTE(review): the vectorizer is fitted on every headline before the
# train/test split that follows, so its vocabulary and IDF weights also see the
# rows that are later held out; fit on the training portion only if an
# unbiased test score is required.
tv = TfidfVectorizer(max_features=5000)
# .toarray() turns the sparse TF-IDF matrix into a dense NumPy array, which is
# what every estimator below is trained on. Note that `features` changes from
# a Series of strings to a 2-D array here, so this cell cannot be re-run as is.
features = tv.fit_transform(features).toarray()

In [17]:
# Persist the fitted vectorizer so new text can later be transformed with the
# same vocabulary and IDF weights.
joblib.dump(tv, 'tfidfVectorizer.pkl')

['tfidfVectorizer.pkl']

In [42]:
# (rows, columns) of the dense TF-IDF matrix.
features.shape

(26709, 5000)

In [43]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is repeatable.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=0,
)

In [45]:
# Linear support-vector classifier: fit on the training split, then report
# accuracy on the training split followed by the held-out split.
lsvc = LinearSVC()
lsvc.fit(features_train, labels_train)
lsvc_train_acc = lsvc.score(features_train, labels_train)
lsvc_test_acc = lsvc.score(features_test, labels_test)
print(lsvc_train_acc, lsvc_test_acc, sep='\n')

0.9159919502035849
0.8227255709472108


In [105]:
# Persist the fitted LinearSVC so it can be reloaded without retraining.
# NOTE(review): the "84acc" tag in the file name does not match the ~0.82 test
# accuracy printed by the training cell; the name is left as is because a
# later cell loads this exact path.
joblib.dump(lsvc, 'linearsvc_84acc.pkl')

['linearsvc_84acc.pkl']

In [44]:
# Gaussian naive Bayes baseline: fit on the training split, then report
# accuracy on the training split followed by the held-out split.
gnb = GaussianNB()
gnb.fit(features_train, labels_train)
gnb_train_acc = gnb.score(features_train, labels_train)
gnb_test_acc = gnb.score(features_test, labels_test)
print(gnb_train_acc, gnb_test_acc, sep='\n')

0.7840127299106098
0.7059153874953201


In [106]:
# Persist the fitted GaussianNB model.
# NOTE(review): the "74acc" tag in the file name does not match the ~0.71 test
# accuracy printed by the training cell.
joblib.dump(gnb, 'gnb_74acc.pkl')

['gnb_74acc.pkl']

In [54]:
# Logistic regression with default settings: fit on the training split, then
# report accuracy on the training split followed by the held-out split.
lr = LogisticRegression()
lr.fit(features_train, labels_train)
lr_train_acc = lr.score(features_train, labels_train)
lr_test_acc = lr.score(features_test, labels_test)
print(lr_train_acc, lr_test_acc, sep='\n')

0.8831375485561848
0.8305877948333957


In [107]:
# Persist the fitted logistic-regression model (test accuracy ~0.83, as tagged
# in the file name).
joblib.dump(lr, 'lr_83acc.pkl')

['lr_83acc.pkl']

In [53]:
# Random forest of 10 trees; random_state=0 makes the fitted forest repeatable.
rfc = RandomForestClassifier(n_estimators=10, random_state=0)
rfc.fit(features_train, labels_train)
rfc_train_acc = rfc.score(features_train, labels_train)
rfc_test_acc = rfc.score(features_test, labels_test)
# Accuracy on the training split, then on the held-out split.
print(rfc_train_acc, rfc_test_acc, sep='\n')

0.9896101464875743
0.7822912766754024


In [108]:
# Persist the fitted random forest.
# NOTE(review): the "80acc" tag in the file name does not match the ~0.78 test
# accuracy printed by the training cell.
joblib.dump(rfc, 'rfc_80acc.pkl')

['rfc_80acc.pkl']

In [52]:
# k-nearest-neighbours classifier that votes over the 40 closest training rows.
neigh = KNeighborsClassifier(n_neighbors=40)
neigh.fit(features_train, labels_train)
# Accuracy on the training split, then on the held-out split.
print(neigh.score(features_train, labels_train))
print(neigh.score(features_test, labels_test))

0.8253381382505733
0.8006364657431674


In [51]:
# Example headline used to exercise the saved vectorizer and classifier.
query = "1 dead, 3 injured in shooting at t.i. concert in nyc"

In [52]:
# Wrap the raw headline in a one-element Series and blank out every character
# that is not an ASCII letter, mirroring the cleaning applied to the training
# headlines.
query = pd.Series(query).apply(lambda text: re.sub('[^a-zA-Z]', ' ', text))

In [53]:
# Check the container type after the cleaning step.
type(query)

pandas.core.series.Series

In [54]:
# Apply the same Porter stemming used on the training headlines: stem each
# whitespace-separated token and re-join the stems with single spaces.
ps = PorterStemmer()
query = query.apply(
    lambda text: ' '.join(ps.stem(token) for token in text.split())
)

In [55]:

# Convert the one-element Series into a plain Python list of strings for the
# vectorizer.
query = query.tolist()


In [56]:
# Reload the persisted TF-IDF vectorizer and project the query onto the same
# feature columns the classifiers were trained on.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
tv1 = joblib.load("tfidfVectorizer.pkl")
query_tfidf = tv1.transform(query)
# Densify to match the array format used for training.
query = query_tfidf.toarray()

In [57]:
# Inspect the dense TF-IDF row built for the query.
query

array([[0., 0., 0., ..., 0., 0., 0.]])

In [58]:
# (rows, columns) of the query matrix; the column count should equal that of
# the training feature matrix.
query.shape

(1, 5000)

In [59]:
# Reload the persisted LinearSVC from disk, rebinding the name lsvc.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
lsvc = joblib.load("linearsvc_84acc.pkl")

In [60]:
# Predict the class label for the vectorized query (one prediction per row).
result = lsvc.predict(query)

In [63]:
# The prediction is a NumPy integer scalar; wrap it in int() if a plain Python
# int is needed (e.g. for JSON serialization).
type(result[0])

numpy.int64