This notebook will be refactored to be used in production. Ideal case is the models will be saved beforehand to be used for any function call. Pickle load if needed.

In [94]:
import pandas as pd, numpy as np, re, time
from nltk.stem.porter import PorterStemmer
import pickle as pk

In [79]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

In [34]:
data = pd.read_json('./data/Sarcasm_Headlines_Dataset.json', lines = True)

In [35]:
data.iloc[0].headline, data.iloc[0].is_sarcastic

("former versace store clerk sues over secret 'black code' for minority shoppers",
 0)

In [36]:
# Relacing special symbols and digits in headline column
# re stands for Regular Expression
data['headline'] = data['headline'].apply(lambda s : re.sub('[^a-zA-Z]', ' ', s))

In [37]:
data

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the roseanne revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son s web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,boehner just wants wife to listen not come up...,1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j k rowling wishes snape happy birthday in th...,0
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world s women,0
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab grown meat,0
7,https://www.huffingtonpost.com/entry/boxed-col...,this ceo will send your kids to school if you...,0
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
9,https://www.huffingtonpost.com/entry/fridays-m...,friday s morning email inside trump s presser...,0


In [65]:
len(data[data.is_sarcastic==0]), len(data[data.is_sarcastic==1])

(14985, 11724)

In [38]:
# getting features and labels
features = data['headline']
labels = data['is_sarcastic']

In [39]:
features[:4]

0    former versace store clerk sues over secret  b...
1    the  roseanne  revival catches up to our thorn...
2    mom starting to fear son s web series closest ...
3    boehner just wants wife to listen  not come up...
Name: headline, dtype: object

In [40]:
# Stemming our data
ps = PorterStemmer()
features = features.apply(lambda x: x.split())
features = features.apply(lambda x : " ".join([ps.stem(word) for word in x]))

In [66]:
example = "Hey there. What are you up to? Too many people are considering to leave."
example = " ".join(ps.stem(word) for word in example.split())
example

'hey there. what are you up to? too mani peopl are consid to leave.'

In [42]:
[example]
tv.transform([example]).toarray()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([[0., 0., 0., ..., 0., 0., 0.]])

In [43]:
# vectorizing the data with maximum of 5000 features
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features = 5000)
features = list(features)
features = tv.fit_transform(features).toarray()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [44]:
# getting training and testing data
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = .05, random_state = 0)

In [45]:
# Using linear support vector classifier
lsvc = LinearSVC()

# CalibratedSVM classifier for the prediction probabilities
clf = CalibratedClassifierCV(svm) 
clf.fit(features_train, labels_train)
y_proba = clf.predict_proba(features_test)

# training the model
lsvc.fit(features_train, labels_train)
# getting the score of train and test data
print(lsvc.score(features_train, labels_train)) # 90.93
print(lsvc.score(features_test, labels_test))   # 83.75

0.9093524612777362
0.8375748502994012


In [85]:
# Using linear support vector classifier
lsvc = LinearSVC()
# CalibratedSVM classifier for the prediction probabilities
clf = CalibratedClassifierCV(lsvc) 
# Training the model
clf.fit(features_train, labels_train)
# Predictions
train_proba = clf.predict_proba(features_train)
test_proba = clf.predict_proba(features_test)
test_proba

array([[0.32386567, 0.67613433],
       [0.15802251, 0.84197749],
       [0.05776573, 0.94223427],
       ...,
       [0.93537021, 0.06462979],
       [0.98041633, 0.01958367],
       [0.87724721, 0.12275279]])

In [87]:
threshold = 0.5
train_probs = [0 if i[0]>threshold else 1 for i in train_proba]
test_probs = [0 if i[0]>threshold else 1 for i in test_proba]
test_probs[:10]

[1, 1, 1, 0, 0, 0, 1, 0, 0, 0]

In [95]:
# getting the score of train and test data
train_scores = precision_recall_fscore_support(labels_train, train_probs, average='weighted') # 90.52%
test_scores = precision_recall_fscore_support(labels_test, test_probs, average='weighted') # 83.20%
print(train_scores)
print(test_scores)

(0.9052545968963728, 0.9053324399952706, 0.9052165944284256, None)
(0.8320514448619291, 0.8323353293413174, 0.8320724625839245, None)


In [46]:
tv.transform(["Hey there. What are you up to?"]).toarray()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([[0., 0., 0., ..., 0., 0., 0.]])

In [56]:
lsvc.score(tv.transform(["Hey there. What are you up to?"]), [0])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


1.0

In [53]:
lsvc.predict(tv.transform(["I had an enormoous lunch this evening for make this 'important' call"]))
# slvc.predict_proba()

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


array([0])

In [92]:
transformed_word = tv.transform(["I will take this point to the heart.", "I got shot", "As if you meant that"])
# print(lsvc.predict(transformed_word))
# print(lsvc.decision_function(transformed_word))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


__Now that we trained a prediction probability classifier__

In [93]:
clf.predict_proba(transformed_word)

array([[0.80901038, 0.19098962],
       [0.72472804, 0.27527196],
       [0.37193294, 0.62806706]])

In [100]:
# Save models, transformers, results
model_transform_results = [clf, tv, (train_scores, test_scores)]
pk.dump(model_transform_results, open("model_transform_results.pk", "wb"))

In [102]:
# Load the model to test
load_test = pk.load(open("model_transform_results.pk", "rb"))
load_test

[CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
      verbose=0),
             cv=3, method='sigmoid'),
 TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=5000, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 ((0.9052545968963728, 0.9053324399952706, 0.9052165944284256, None),
  (0.8320514448619291, 0.8323353293413174, 0.8320724625839245, None))]