In [59]:
# Imports

import pandas as pd
import spacy

In [60]:
# Raise the per-column display limit to 500 characters so that long headlines
# are printed in full instead of being truncated

pd.options.display.max_colwidth = 500

In [61]:
# Load, label and concatenate data
#
# Each input file is read as a single tab-separated 'text' column and tagged
# with its class label (1 = clickbait, 0 = non-clickbait). The two frames are
# then stacked, clickbait rows first, under a fresh 0..n-1 index.

clickbait = pd.read_csv('inputs/clickbait_data.txt', sep='\t', names=['text']).assign(clickbait=1)
non_clickbait = pd.read_csv('inputs/non_clickbait_data.txt', sep='\t', names=['text']).assign(clickbait=0)

data = pd.concat([clickbait, non_clickbait], ignore_index=True)

In [62]:
# Inspect clickbait titles: the clickbait frame was concatenated first, so the
# first 10 rows all carry the label clickbait == 1

data.head(10)

Unnamed: 0,text,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills",1
3,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect",1
4,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor,1
5,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself,1
6,32 Cute Things To Distract From Your Awkward Thanksgiving,1
7,If Disney Princesses Were From Florida,1
8,What's A Quote Or Lyric That Best Describes Your Depression,1
9,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games""",1


In [63]:
# Inspect non-clickbait titles: the non-clickbait frame was concatenated last,
# so the final 10 rows all carry the label clickbait == 0

data.tail(10)

Unnamed: 0,text,clickbait
31990,Bainimarama sworn in as Fiji caretaker PM,0
31991,Iran's Supreme Leader wants religious army,0
31992,Albanian girl murdered in tangle of crime,0
31993,Pentagon unable to explain 'mystery missile' video,0
31994,Blair: G8 leaders announce $50 billion aid increase; talks on trade and climate change,0
31995,"To Make Female Hearts Flutter in Iraq, Throw a Shoe",0
31996,"British Liberal Democrat Patsy Calton, 56, dies of cancer",0
31997,Drone smartphone app to help heart attack victims in remote areas announced,0
31998,"Netanyahu Urges Pope Benedict, in Israel, to Denounce Iran",0
31999,Computer Makers Prepare to Stake Bigger Claim in Phones,0


In [64]:
# Save original text: keep an untouched copy of every headline in
# 'original_text' before the cleaning steps below overwrite 'text'

data['original_text'] = data['text']

data.head(10)

Unnamed: 0,text,clickbait,original_text
0,Should I Get Bings,1,Should I Get Bings
1,Which TV Female Friend Group Do You Belong In,1,Which TV Female Friend Group Do You Belong In
2,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills",1,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills"
3,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect",1,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect"
4,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor,1,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor
5,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself,1,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself
6,32 Cute Things To Distract From Your Awkward Thanksgiving,1,32 Cute Things To Distract From Your Awkward Thanksgiving
7,If Disney Princesses Were From Florida,1,If Disney Princesses Were From Florida
8,What's A Quote Or Lyric That Best Describes Your Depression,1,What's A Quote Or Lyric That Best Describes Your Depression
9,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games""",1,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games"""


In [65]:
# Lowercase text so that the lowercase contraction patterns used in the next
# step match regardless of the headline's casing; the original casing remains
# available in 'original_text'

data['text'] = data['text'].str.lower()

data.head(10)

Unnamed: 0,text,clickbait,original_text
0,should i get bings,1,Should I Get Bings
1,which tv female friend group do you belong in,1,Which TV Female Friend Group Do You Belong In
2,"the new ""star wars: the force awakens"" trailer is here to give you chills",1,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills"
3,"this vine of new york on ""celebrity big brother"" is fucking perfect",1,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect"
4,a couple did a stunning photo shoot with their baby after learning she had an inoperable brain tumor,1,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor
5,how to flirt with queer girls without making a total fool of yourself,1,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself
6,32 cute things to distract from your awkward thanksgiving,1,32 Cute Things To Distract From Your Awkward Thanksgiving
7,if disney princesses were from florida,1,If Disney Princesses Were From Florida
8,what's a quote or lyric that best describes your depression,1,What's A Quote Or Lyric That Best Describes Your Depression
9,"natalie dormer and sam claflin play a game to see how they'd actually last in ""the hunger games""",1,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games"""


In [66]:
# Replace words containing apostrophes
#
# Every contraction below is expanded to its long form in a single pass over
# the (already lowercased) 'text' column. Matches are anchored on word
# boundaries so that possessives which merely end in a contraction, e.g.
# "reddit's" or "rabbit's", are NOT rewritten to "reddit is" / "rabbit is".

contractions = {
    "i'm": 'i am',
    "you're": 'you are',
    "he's": 'he is',
    "she's": 'she is',
    "it's": 'it is',
    "we're": 'we are',
    "they're": 'they are',
    "there's": 'there is',
    "doesn't": 'does not',
    "wasn't": 'was not',
    "weren't": 'were not',
    "haven't": 'have not',
    "can't": 'cannot',
    "won't": 'will not',
    "don't": 'do not',
    "i've": 'i have',
    "i'd": 'i would',
    "i'll": 'i will',
    "you'll": 'you will',
    "he'll": 'he will',
    "she'll": 'she will',
    "it'll": 'it will',
    "that'll": 'that will',
    "we'll": 'we will',
    "they'll": 'they will',
    "you'd": 'you would',
    "he'd": 'he would',
    "she'd": 'she would',
    "it'd": 'it would',
    "that'd": 'that would',
    "we'd": 'we would',
    "they'd": 'they would',
    "you've": 'you have',
    "couldn't": 'could not',
    "wouldn't": 'would not',
    "shouldn't": 'should not',
    "what's": 'what is',
    "who's": 'who is',
    "how's": 'how is',
    "where's": 'where is',
    "aren't": 'are not',
    "isn't": 'is not',
    "that's": 'that is',
    "here's": 'here is',
}

# The keys contain only letters and apostrophes, so they can be joined into a
# regex alternation without escaping.
contraction_pattern = r"\b(?:" + '|'.join(contractions) + r")\b"

# regex=True is passed explicitly because the default of Series.str.replace
# differs between pandas versions. The callable receives the regex match
# object and looks up the expansion for the matched contraction.
data['text'] = data['text'].str.replace(
    contraction_pattern,
    lambda match: contractions[match.group(0)],
    regex=True,
)

data.head(10)

Unnamed: 0,text,clickbait,original_text
0,should i get bings,1,Should I Get Bings
1,which tv female friend group do you belong in,1,Which TV Female Friend Group Do You Belong In
2,"the new ""star wars: the force awakens"" trailer is here to give you chills",1,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills"
3,"this vine of new york on ""celebrity big brother"" is fucking perfect",1,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect"
4,a couple did a stunning photo shoot with their baby after learning she had an inoperable brain tumor,1,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor
5,how to flirt with queer girls without making a total fool of yourself,1,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself
6,32 cute things to distract from your awkward thanksgiving,1,32 Cute Things To Distract From Your Awkward Thanksgiving
7,if disney princesses were from florida,1,If Disney Princesses Were From Florida
8,what is a quote or lyric that best describes your depression,1,What's A Quote Or Lyric That Best Describes Your Depression
9,"natalie dormer and sam claflin play a game to see how they would actually last in ""the hunger games""",1,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games"""


In [67]:
# Remove punctuation
#
# Every character listed below is deleted from the 'text' column.

punctuation = ['!', '(', ')', '[', ']', '{', '}', ';', ':', '<', '=', ',', '|', "'", '#',
               '÷', '+', '>', '.', '/', '?', '$', '%', '^', '&', '*', '_', '~', '"']

# Build one translation table that maps each listed character to "delete" and
# apply it in a single pass. Translation is always literal, so characters that
# are regex metacharacters ('(', '[', '.', '|', '?', '*', ...) are removed
# as-is, independent of the pandas version's str.replace regex default. The
# `punctuation` list is also no longer overwritten by a loop variable of the
# same name.
punctuation_table = str.maketrans('', '', ''.join(punctuation))
data['text'] = data['text'].str.translate(punctuation_table)

data.head(10)

Unnamed: 0,text,clickbait,original_text
0,should i get bings,1,Should I Get Bings
1,which tv female friend group do you belong in,1,Which TV Female Friend Group Do You Belong In
2,the new star wars the force awakens trailer is here to give you chills,1,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills"
3,this vine of new york on celebrity big brother is fucking perfect,1,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect"
4,a couple did a stunning photo shoot with their baby after learning she had an inoperable brain tumor,1,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor
5,how to flirt with queer girls without making a total fool of yourself,1,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself
6,32 cute things to distract from your awkward thanksgiving,1,32 Cute Things To Distract From Your Awkward Thanksgiving
7,if disney princesses were from florida,1,If Disney Princesses Were From Florida
8,what is a quote or lyric that best describes your depression,1,What's A Quote Or Lyric That Best Describes Your Depression
9,natalie dormer and sam claflin play a game to see how they would actually last in the hunger games,1,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games"""


In [68]:
# # Perform lemmatization - commented out here as it decreases predictive accuracy
#
# nlp = spacy.load('en_core_web_sm')  # python -m spacy download en_core_web_sm
#
# for index, entry in enumerate(data['text']):
#     tokens = [token.lemma_ for token in nlp(entry)]
#     clean_text = ' '.join(tokens)
#     data.loc[index, 'text'] = clean_text
#
# data.head(10)

In [69]:
# Inspect clean text - clickbaits: show the first 50 cleaned headlines side by
# side with their original versions

data[['text', 'original_text']].head(50)


Unnamed: 0,text,original_text
0,should i get bings,Should I Get Bings
1,which tv female friend group do you belong in,Which TV Female Friend Group Do You Belong In
2,the new star wars the force awakens trailer is here to give you chills,"The New ""Star Wars: The Force Awakens"" Trailer Is Here To Give You Chills"
3,this vine of new york on celebrity big brother is fucking perfect,"This Vine Of New York On ""Celebrity Big Brother"" Is Fucking Perfect"
4,a couple did a stunning photo shoot with their baby after learning she had an inoperable brain tumor,A Couple Did A Stunning Photo Shoot With Their Baby After Learning She Had An Inoperable Brain Tumor
5,how to flirt with queer girls without making a total fool of yourself,How To Flirt With Queer Girls Without Making A Total Fool Of Yourself
6,32 cute things to distract from your awkward thanksgiving,32 Cute Things To Distract From Your Awkward Thanksgiving
7,if disney princesses were from florida,If Disney Princesses Were From Florida
8,what is a quote or lyric that best describes your depression,What's A Quote Or Lyric That Best Describes Your Depression
9,natalie dormer and sam claflin play a game to see how they would actually last in the hunger games,"Natalie Dormer And Sam Claflin Play A Game To See How They'd Actually Last In ""The Hunger Games"""


In [70]:
# Inspect clean text - non-clickbaits: show the last 50 cleaned headlines side
# by side with their original versions

data[['text', 'original_text']].tail(50)

Unnamed: 0,text,original_text
31950,haitian earthquake in pictures,Haitian earthquake: in pictures
31951,alan turing building opens at university of manchester,Alan Turing Building opens at University of Manchester
31952,a reality check on mortgage modification,A Reality Check on Mortgage Modification
31953,to rescue captain us snipers held steady despite many moving parts,"To Rescue Captain, U.S. Snipers Held Steady Despite Many Moving Parts"
31954,tom menino wins historic fifth term as mayor of boston plus results from area cities,Tom Menino wins historic fifth term as Mayor of Boston; plus results from area cities
31955,tamil tigers promise to fight back against sri lankan forces,Tamil Tigers promise to fight back against Sri Lankan forces
31956,obama announces choice for secretary of housing,Obama announces choice for Secretary of Housing
31957,iran says it is ready for standoff over nuclear activities,Iran says it's ready for standoff over nuclear activities
31958,un council may rebuke north korea,U.N. Council May Rebuke North Korea
31959,a new plan to help modify second mortgages,A New Plan to Help Modify Second Mortgages


In [71]:
# Save to csv: write the processed frame (cleaned 'text', 'clickbait' label and
# 'original_text') to disk; index=False leaves the row index out of the file

data.to_csv('intermediary_outputs/data_processed.csv', index=False)