In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
import urllib.request
from nltk.corpus import stopwords
import re
import numpy as np
import pandas as pd
import collections
from collections import Counter
from sklearn import preprocessing
import string

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# load the data
non_clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/non_clickbait_data.txt"
clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/clickbait_data.txt"

# read url .txt file into string "data"
def get_data(url):
  data = urllib.request.urlopen(url).read().decode('utf-8')
  return data

non_clickbait_data = get_data(non_clickbait_url)
clickbait_data = get_data(clickbait_url)

In [3]:
# combine clickbait and non-clickbait data in a single list
non_clickbait_headlines = non_clickbait_data.rstrip('\n').split('\n')
clickbait_headlines = clickbait_data.rstrip('\n').split('\n')
all_headlines = non_clickbait_headlines + clickbait_headlines

In [4]:
df = pd.DataFrame(all_headlines, columns = ['text'])

In [5]:
df.head()

Unnamed: 0,text
0,Bill Changing Credit Card Rules Is Sent to Oba...
1,"In Hollywood, the Easy-Money Generation Toughe..."
2,1700 runners still unaccounted for in UK's Lak...
3,Yankees Pitchers Trade Fielding Drills for Put...
4,Large earthquake rattles Indonesia; Seventh in...


In [6]:
# create a list of corresponding labels
non_cb_labels = [0] * len(non_clickbait_headlines)
cb_labels = [1] * len(clickbait_headlines)
all_labels = non_cb_labels + cb_labels

In [7]:
df['clickbait'] = all_labels

In [8]:
df.head()

Unnamed: 0,text,clickbait
0,Bill Changing Credit Card Rules Is Sent to Oba...,0
1,"In Hollywood, the Easy-Money Generation Toughe...",0
2,1700 runners still unaccounted for in UK's Lak...,0
3,Yankees Pitchers Trade Fielding Drills for Put...,0
4,Large earthquake rattles Indonesia; Seventh in...,0


In [9]:
clickbait_headlines[:10]

['Should I Get Bings',
 'Which TV Female Friend Group Do You Belong In',
 'The New "Star Wars: The Force Awakens" Trailer Is Here To Give You Chills',
 'This Vine Of New York On "Celebrity Big Brother" Is Fucking Perfect',
 'A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor',
 'How To Flirt With Queer Girls Without Making A Total Fool Of Yourself',
 '32 Cute Things To Distract From Your Awkward Thanksgiving',
 'If Disney Princesses Were From Florida',
 "What's A Quote Or Lyric That Best Describes Your Depression",
 'Natalie Dormer And Sam Claflin Play A Game To See How They\'d Actually Last In "The Hunger Games"']

In [10]:
non_clickbait_headlines[:10]

['Bill Changing Credit Card Rules Is Sent to Obama With Gun Measure Included',
 'In Hollywood, the Easy-Money Generation Toughens Up',
 "1700 runners still unaccounted for in UK's Lake District following flood",
 'Yankees Pitchers Trade Fielding Drills for Putting Practice',
 'Large earthquake rattles Indonesia; Seventh in two days',
 "Coldplay's new album hits stores worldwide this week",
 'U.N. Leader Presses Sri Lanka on Speeding Relief to War Refugees in Camps',
 '2 Somali-Americans Charged With Aiding Terror',
 "US Highway Administration releases interim report on Boston's Big Dig: press release claims tunnel safe, but report does not",
 'White House Announces International Meetings to Address Energy and Climate Issues']

In [11]:
# extract features: bag of stop words
def stop_words(texts):
  bow = []
  eng_stopwords = stopwords.words('english')
  for text in texts:
    counts = []
    tokens = nltk.word_tokenize(text.lower())
    for sw in eng_stopwords:
      sw_count = tokens.count(sw)
      counts.append(sw_count)
    bow.append(counts)
  bow_np = np.array(bow).astype(float)
  return bow_np

# pos tagging
def pos_tags(text):
  tokens = nltk.word_tokenize(text)
  pos = nltk.pos_tag(tokens)
  nns = 0
  nnps = 0
  dts = 0
  ins = 0
  jjs = 0
  nnss = 0
  ccs = 0
  prps = 0
  vbs = 0
  vbgs = 0
  for word,tag in pos:
    if tag == 'NN':
      nns += 1
    elif tag == 'NNP':
      nnps += 1
    elif tag == 'DT':
      dts += 1
    elif tag == 'IN':
      ins += 1
    elif tag == 'JJ':
      jjs += 1
    elif tag == 'NNS':
      nnss += 1
    elif tag == 'CC':
      ccs += 1
    elif tag == 'PRP':
      prps += 1
    elif tag == 'VB':
      vbs += 1
    elif tag == 'VBG':
      vbgs += 1
  return (nns,nnps,dts,ins,jjs,nnss,ccs,prps,vbs,vbgs)

# 30 most commmon unigrams
def common_unigrams(n=1):
  all_data = clickbait_data + non_clickbait_data
  all_data = all_data.replace('\n',' ')
  tokens = nltk.word_tokenize(all_data.lower())
  new = []
  for token in tokens:
    if (token not in string.punctuation) and (token not in stopwords.words('english')):
      new.append(token)
  coms = [new[i:i+n] for i in range(len(new)-n+1)]
  comr = [j for i in coms for j in i]
  obj = collections.Counter(comr)
  r = obj.most_common(3)
  commons = [word[0] for word in r]
  return commons

# counts of most common unigrams
def count_unigrams(text):
  tokens = nltk.word_tokenize(text.lower())
  c = 0
  for token in tokens:
    if token in most_common_unis:
      c += 1
  return c

# punctuations count
def punct(text):
  tokens = nltk.word_tokenize(text.lower())
  c = 0
  for token in tokens:
    if token in string.punctuation:
      c += 1
  return c

# complexity of the text
def complexity(text):
  tokens = nltk.word_tokenize(text.lower())
  types = len(set(tokens))
  norm_text = []
  for token in tokens:
    if token not in string.punctuation:
      norm_text.append(token)
  tot_chars = 0
  for i in tokens:
    tot_chars += len(i)
  avg_chars = tot_chars / len(tokens)
  ttr = types / len(tokens)
  un_words_ratio = len(set(norm_text)) / len(norm_text)
  longw = 0
  for token in tokens:
    if len(token) >= 6:
      longw += 1
  # return (no.of words, ttr, averge chars per word, no.of long words, unique words/total words)
  return (len(norm_text), ttr, avg_chars, longw, un_words_ratio)

# contractions count
def contractions(text):
  tokens = nltk.word_tokenize(text.lower())
  c = 0
  for token in tokens:
    if token in contract_keys:
      c += 1
  return c

# slangs count
def slangs(text):
  tokens = nltk.word_tokenize(text.lower())
  c = 0
  for token in tokens:
    if token in slang_keys:
      c += 1
  return c

# title case words - observed that almost all the clickbait texts have title case,
# meaning that the first letter of each word is capitalized, which is not found in non-clickbait
def titlecase(text):
  words = text.split()
  c = 0
  for word in words:
    if word[0].isupper():
      c += 1
  return c



most_common_unis = common_unigrams()
contract_url = "https://gist.githubusercontent.com/Lewy09-Tm25/2ca6392c5741b5522e5abccf267a2cf0/raw/c8e7f7ccd3aad74d7b2e0135cc1f3e9e5e0f72f9/contractions.json"
contract_dict = pd.read_json(contract_url, typ = 'series')
contract_keys = list(contract_dict.keys())
slangs_url = "https://gist.githubusercontent.com/Lewy09-Tm25/528393fd9d22d393e6660837d0fd289a/raw/74f9e132f2267088eaed1eb5f5cc43a170b2a558/slangs.json"
slang_dict = pd.read_json(slangs_url, typ = 'series')
slang_keys = list(slang_dict.keys())

In [12]:
df['punctuations'] = df['text'].apply(punct)
df['titlecase'] = df['text'].apply(titlecase)
df['NN'],df['NNP'],df['DT'],df['IN'],df['JJ'],df['NNS'],df['CC'],df['PRP'],df['VB'],df['VBG'] = zip(*df['text'].apply(pos_tags))
df['len'],df['ttr'],df['avg_char_ratio'],df['long_words'],df['unique_words_ratio'] = zip(*df['text'].apply(complexity))
df['contractions'] = df['text'].apply(contractions)
df['slangs'] = df['text'].apply(slangs)
df['common_unigrams'] = df['text'].apply(count_unigrams)

In [13]:
df.shape

(31998, 22)

In [14]:
df.head()

Unnamed: 0,text,clickbait,punctuations,titlecase,NN,NNP,DT,IN,JJ,NNS,...,VB,VBG,len,ttr,avg_char_ratio,long_words,unique_words_ratio,contractions,slangs,common_unigrams
0,Bill Changing Credit Card Rules Is Sent to Oba...,0,0,12,0,8,0,1,0,0,...,0,0,13,1.0,4.769231,4,1.0,0,0,0
1,"In Hollywood, the Easy-Money Generation Toughe...",0,1,6,0,3,1,1,0,0,...,0,0,7,1.0,5.625,4,1.0,0,0,0
2,1700 runners still unaccounted for in UK's Lak...,0,0,3,1,3,0,2,0,1,...,0,1,12,1.0,5.166667,4,1.0,1,0,1
3,Yankees Pitchers Trade Fielding Drills for Put...,0,0,7,0,5,0,1,0,2,...,0,0,8,1.0,6.5,6,1.0,0,0,0
4,Large earthquake rattles Indonesia; Seventh in...,0,1,3,1,2,0,1,1,2,...,0,0,8,1.0,5.333333,4,1.0,0,0,0


In [15]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
clickbait,31998.0,0.5,0.500008,0.0,0.0,0.5,1.0,1.0
punctuations,31998.0,0.23464,0.53459,0.0,0.0,0.0,0.0,6.0
titlecase,31998.0,6.729671,3.576783,0.0,4.0,7.0,9.0,25.0
NN,31998.0,0.705575,1.019411,0.0,0.0,0.0,1.0,10.0
NNP,31998.0,3.683074,2.233299,0.0,2.0,3.0,5.0,15.0
DT,31998.0,0.4024,0.64556,0.0,0.0,0.0,1.0,4.0
IN,31998.0,0.909838,0.816411,0.0,0.0,1.0,1.0,6.0
JJ,31998.0,0.395681,0.6338,0.0,0.0,0.0,1.0,6.0
NNS,31998.0,0.478842,0.66934,0.0,0.0,0.0,1.0,5.0
CC,31998.0,0.104257,0.318615,0.0,0.0,0.0,0.0,3.0


In [16]:
df.corrwith(df['clickbait']).sort_values(ascending = False)

  df.corrwith(df['clickbait']).sort_values(ascending = False)


clickbait             1.000000
titlecase             0.715416
PRP                   0.469075
len                   0.387403
DT                    0.304701
common_unigrams       0.277022
NNP                   0.265197
contractions          0.186380
VB                    0.173482
CC                    0.044532
VBG                  -0.001377
slangs               -0.017700
ttr                  -0.082923
IN                   -0.096045
unique_words_ratio   -0.108448
NNS                  -0.127094
JJ                   -0.182298
punctuations         -0.240624
long_words           -0.261102
avg_char_ratio       -0.402069
NN                   -0.435181
dtype: float64

In [17]:
df = df.drop(['text'],axis=1)

# model performance on individual features
fea_syntactic = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']
fea_complex = ['len','ttr','avg_char_ratio','long_words','unique_words_ratio']
fea_punctuations = ['punctuations']
fea_contractions = ['contractions']
fea_slangs = ['slangs']
fea_title_case = ['titlecase']
fea_lexical = ['common_unigrams']
fea_to_take = {1:fea_syntactic, 2:fea_complex, 3:fea_punctuations, 4:fea_contractions, 5:fea_slangs, 6:fea_title_case, 7:fea_lexical}

for i in range(1,8):
  data = df[fea_to_take[i]]
  X = data.to_numpy()
  y = np.array(all_labels)
  print(fea_to_take[i])
  print(X.shape)
  scores = cross_val_score(MultinomialNB(), X, y, scoring='accuracy', cv=10)
  print(f"Average accuracy is {round(scores.mean(),4)}\n")

['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS', 'CC', 'PRP', 'VB', 'VBG']
(31998, 10)
Average accuracy is 0.7578

['len', 'ttr', 'avg_char_ratio', 'long_words', 'unique_words_ratio']
(31998, 5)
Average accuracy is 0.7321

['punctuations']
(31998, 1)
Average accuracy is 0.5

['contractions']
(31998, 1)
Average accuracy is 0.5

['slangs']
(31998, 1)
Average accuracy is 0.5

['titlecase']
(31998, 1)
Average accuracy is 0.5

['common_unigrams']
(31998, 1)
Average accuracy is 0.5



In [18]:
# only stopwords
X = stop_words(all_headlines)
y = np.array(all_labels)
print(X.shape)
scores = cross_val_score(MultinomialNB(), X, y, scoring='accuracy', cv=10)
print(round(scores.mean(),4))

(31998, 179)
0.8736


In [19]:
# all features
X = df.to_numpy()
y = np.array(all_labels)
print(X.shape)
stop_words_features = stop_words(all_headlines)
X = np.hstack((X,stop_words_features))
scores = cross_val_score(MultinomialNB(), X, y, scoring='accuracy', cv=10)
print(round(scores.mean(),4))

(31998, 21)
0.9676
