In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
np.random.seed(500)

In [3]:
df_train = pd.read_csv('train.tsv',delimiter='\t')
df_test = pd.read_csv('test.tsv',delimiter='\t')

In [4]:
# Data Cleaning on Train Data
# Splitting the text from title, body and urls
boilerplate_title = df_train["boilerplate"].str.split('"title":"', n = 1, expand = True)
boilerplate_title = boilerplate_title[1].str.split('"body":"', n = 1, expand = True)
boilerplate_body = boilerplate_title[1].str.split('"url":"', n = 1, expand = True)

In [5]:
df_train['boilerplate_title'] = boilerplate_title[0]
df_train['boilerplate_body'] = boilerplate_body[0]
df_train['boilerplate_urls'] = boilerplate_body[1]

In [6]:
df_train['boilerplate_title'] = df_train['boilerplate_title'].astype(str)
df_train['boilerplate_body'] = df_train['boilerplate_body'].astype(str)
df_train['boilerplate_urls'] = df_train['boilerplate_urls'].astype(str)

In [7]:
df_train['boilerplate_title'] = df_train['boilerplate_title'].str.replace('[^\w\s]','')
df_train['boilerplate_body'] = df_train['boilerplate_body'].str.replace('[^\w\s]','')
df_train['boilerplate_urls'] = df_train['boilerplate_urls'].str.replace('[^\w\s]','')

In [8]:
data = pd.DataFrame(list(zip(df_train['boilerplate_title'], df_train['boilerplate_body'],df_train['boilerplate_urls'],df_train['label'])), 
               columns =['Title', 'Body','Url','label'])

In [9]:
data

Unnamed: 0,Title,Body,Url,label
0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,Kno Raises 46 Million More To Build Most Power...,Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,Why I Miss College,Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,Sweet Potatoes Eat This Not That im eating th...,They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,Naturally Ella,,naturallyella,1


In [12]:
# Remove blank rows if any.
data = data.dropna()
data

Unnamed: 0,Title,Body,Url,label
0,ibm sees holographic calls air breathing batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,the fully electronic futuristic starting gun t...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,fruits that fight the flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,10 foolproof tips for better sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,the 50 coolest jerseys you didn t know existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,kno raises 46 million more to build most power...,Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,why i miss college,Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,sweet potatoes eat this not that im eating th...,They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,naturally ella,,naturallyella,1


In [11]:
# We are extracting Title only. We will later extract Body and url to check if it helps in the accuracy of model.
# We are not taking it hear to decrease the overfitting of the model.
# Data Cleaning of train data set
# Change all the text to lower case.
data['Title'] = [entry.lower() for entry in data['Title']]

In [13]:
# Tokenization : In this each entry will be broken into set of words
data['Title']= [word_tokenize(entry) for entry in data['Title']]

In [14]:
data

Unnamed: 0,Title,Body,Url,label
0,"[ibm, sees, holographic, calls, air, breathing...",A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,"[the, fully, electronic, futuristic, starting,...",And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,"[fruits, that, fight, the, flu, fruits, that, ...",Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,"[10, foolproof, tips, for, better, sleep]",There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,"[the, 50, coolest, jerseys, you, didn, t, know...",Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,"[kno, raises, 46, million, more, to, build, mo...",Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,"[why, i, miss, college]",Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,"[sweet, potatoes, eat, this, not, that, im, ea...",They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,"[naturally, ella]",,naturallyella,1


In [15]:
# Data cleaning on Test Data
# Splitting the text from title, body and urls
boilerplatetest_title = df_test["boilerplate"].str.split('"title":"', n = 1, expand = True)
boilerplatetest_title = boilerplatetest_title[1].str.split('"body":"', n = 1, expand = True)
boilerplatetest_body = boilerplatetest_title[1].str.split('"url":"', n = 1, expand = True)

In [16]:
df_test['boilerplate_title'] = boilerplatetest_title[0]
df_test['boilerplate_body'] = boilerplatetest_body[0]
df_test['boilerplate_urls'] = boilerplatetest_body[1]

In [17]:
df_test['boilerplate_title'] = df_test['boilerplate_title'].astype(str)
df_test['boilerplate_body'] = df_test['boilerplate_body'].astype(str)
df_test['boilerplate_urls'] = df_test['boilerplate_urls'].astype(str)

In [18]:
df_test['boilerplate_title'] = df_test['boilerplate_title'].str.replace('[^\w\s]','')
df_test['boilerplate_body'] = df_test['boilerplate_body'].str.replace('[^\w\s]','')
df_test['boilerplate_urls'] = df_test['boilerplate_urls'].str.replace('[^\w\s]','')

In [19]:
dataTest = pd.DataFrame(list(zip(df_test['boilerplate_title'], df_test['boilerplate_body'],df_test['boilerplate_urls'])), 
               columns =['Title', 'Body','Url'])

In [20]:
dataTest

Unnamed: 0,Title,Body,Url
0,Homemade Enchilada Sauce Lynn s Kitchen Advent...,I usually buy my enchilada sauce Yes I knew I ...,lynnskitchenadventures 2009 04 homemade enchil...
1,lolpics Stun grenade ar,funny pictures at lolpicsse the best funny im...,lolpics se 18552 stun grenade ar
2,Treadmills,treadmills stair climbers treadmills,xcelerationfitness treadmills html
3,Father s Tactics Used by Assad to Crush Revolt...,Enlarge image Syrian President Bashar al Assad...,bloomberg news 2012 02 06 syria s assad deploy...
4,Stem Turns Lemons and Limes Into Juicy Atomize...,Quirky s Stem turns any citrus fruit into an a...,wired gadgetlab 2011 12 stem turns lemons and ...
...,...,...,...
3166,Peep Brownie S mores Busy Mommy An Iowa Mom Blog,Easter candy is hitting the shelves and I ve b...,busy mommy 2012 02 peep brownie smores html
3167,Marijuana infused cheesecake Medical Cannabis ...,Cannabis infused cheesecake is one of the tast...,
3168,Most Viewed Submissions All Time most viewed s...,www tastespotting com January 27 is National C...,tastespotting popular views all time 2
3169,How to Get a Complete Workout with Nothing But...,Everyone knows exercise plays an important rol...,lifehacker 5839197 how to get a full body work...


In [21]:
# Remove blank rows if any.
dataTest = dataTest.dropna()
dataTest

Unnamed: 0,Title,Body,Url
0,Homemade Enchilada Sauce Lynn s Kitchen Advent...,I usually buy my enchilada sauce Yes I knew I ...,lynnskitchenadventures 2009 04 homemade enchil...
1,lolpics Stun grenade ar,funny pictures at lolpicsse the best funny im...,lolpics se 18552 stun grenade ar
2,Treadmills,treadmills stair climbers treadmills,xcelerationfitness treadmills html
3,Father s Tactics Used by Assad to Crush Revolt...,Enlarge image Syrian President Bashar al Assad...,bloomberg news 2012 02 06 syria s assad deploy...
4,Stem Turns Lemons and Limes Into Juicy Atomize...,Quirky s Stem turns any citrus fruit into an a...,wired gadgetlab 2011 12 stem turns lemons and ...
...,...,...,...
3166,Peep Brownie S mores Busy Mommy An Iowa Mom Blog,Easter candy is hitting the shelves and I ve b...,busy mommy 2012 02 peep brownie smores html
3167,Marijuana infused cheesecake Medical Cannabis ...,Cannabis infused cheesecake is one of the tast...,
3168,Most Viewed Submissions All Time most viewed s...,www tastespotting com January 27 is National C...,tastespotting popular views all time 2
3169,How to Get a Complete Workout with Nothing But...,Everyone knows exercise plays an important rol...,lifehacker 5839197 how to get a full body work...


In [23]:
# Change all the text to lower case.
dataTest['Title'] = [entry.lower() for entry in dataTest['Title']]

In [24]:
#Tokenization : In this each entry will be broken into set of words
dataTest['Title']= [word_tokenize(entry) for entry in dataTest['Title']]

In [25]:
dataTest

Unnamed: 0,Title,Body,Url
0,"[homemade, enchilada, sauce, lynn, s, kitchen,...",I usually buy my enchilada sauce Yes I knew I ...,lynnskitchenadventures 2009 04 homemade enchil...
1,"[lolpics, stun, grenade, ar]",funny pictures at lolpicsse the best funny im...,lolpics se 18552 stun grenade ar
2,[treadmills],treadmills stair climbers treadmills,xcelerationfitness treadmills html
3,"[father, s, tactics, used, by, assad, to, crus...",Enlarge image Syrian President Bashar al Assad...,bloomberg news 2012 02 06 syria s assad deploy...
4,"[stem, turns, lemons, and, limes, into, juicy,...",Quirky s Stem turns any citrus fruit into an a...,wired gadgetlab 2011 12 stem turns lemons and ...
...,...,...,...
3166,"[peep, brownie, s, mores, busy, mommy, an, iow...",Easter candy is hitting the shelves and I ve b...,busy mommy 2012 02 peep brownie smores html
3167,"[marijuana, infused, cheesecake, medical, cann...",Cannabis infused cheesecake is one of the tast...,
3168,"[most, viewed, submissions, all, time, most, v...",www tastespotting com January 27 is National C...,tastespotting popular views all time 2
3169,"[how, to, get, a, complete, workout, with, not...",Everyone knows exercise plays an important rol...,lifehacker 5839197 how to get a full body work...


In [26]:
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [27]:
# Lemmatizing train Data
for index,entry in enumerate(data['Title']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    data.loc[index,'Title_final'] = str(Final_words)

In [28]:
data

Unnamed: 0,Title,Body,Url,label,Title_final
0,"[ibm, sees, holographic, calls, air, breathing...",A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0,"['ibm', 'see', 'holographic', 'call', 'air', '..."
1,"[the, fully, electronic, futuristic, starting,...",And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1,"['fully', 'electronic', 'futuristic', 'start',..."
2,"[fruits, that, fight, the, flu, fruits, that, ...",Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1,"['fruit', 'fight', 'flu', 'fruit', 'fight', 'f..."
3,"[10, foolproof, tips, for, better, sleep]",There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1,"['foolproof', 'tip', 'good', 'sleep']"
4,"[the, 50, coolest, jerseys, you, didn, t, know...",Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0,"['cool', 'jersey', 'know', 'exist', 'cool', 'j..."
...,...,...,...,...,...
7390,"[kno, raises, 46, million, more, to, build, mo...",Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0,"['kno', 'raise', 'million', 'build', 'powerful..."
7391,"[why, i, miss, college]",Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0,"['miss', 'college']"
7392,"[sweet, potatoes, eat, this, not, that, im, ea...",They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1,"['sweet', 'potato', 'eat', 'im', 'eat']"
7393,"[naturally, ella]",,naturallyella,1,"['naturally', 'ella']"


In [29]:
#Lemmatizing test data
for index,entry in enumerate(dataTest['Title']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    dataTest.loc[index,'Title_final'] = str(Final_words)

In [30]:
dataTest

Unnamed: 0,Title,Body,Url,Title_final
0,"[homemade, enchilada, sauce, lynn, s, kitchen,...",I usually buy my enchilada sauce Yes I knew I ...,lynnskitchenadventures 2009 04 homemade enchil...,"['homemade', 'enchilada', 'sauce', 'lynn', 'ki..."
1,"[lolpics, stun, grenade, ar]",funny pictures at lolpicsse the best funny im...,lolpics se 18552 stun grenade ar,"['lolpics', 'stun', 'grenade', 'ar']"
2,[treadmills],treadmills stair climbers treadmills,xcelerationfitness treadmills html,['treadmill']
3,"[father, s, tactics, used, by, assad, to, crus...",Enlarge image Syrian President Bashar al Assad...,bloomberg news 2012 02 06 syria s assad deploy...,"['father', 'tactic', 'use', 'assad', 'crush', ..."
4,"[stem, turns, lemons, and, limes, into, juicy,...",Quirky s Stem turns any citrus fruit into an a...,wired gadgetlab 2011 12 stem turns lemons and ...,"['stem', 'turn', 'lemon', 'lime', 'juicy', 'at..."
...,...,...,...,...
3166,"[peep, brownie, s, mores, busy, mommy, an, iow...",Easter candy is hitting the shelves and I ve b...,busy mommy 2012 02 peep brownie smores html,"['peep', 'brownie', 'mores', 'busy', 'mommy', ..."
3167,"[marijuana, infused, cheesecake, medical, cann...",Cannabis infused cheesecake is one of the tast...,,"['marijuana', 'infuse', 'cheesecake', 'medical..."
3168,"[most, viewed, submissions, all, time, most, v...",www tastespotting com January 27 is National C...,tastespotting popular views all time 2,"['viewed', 'submission', 'time', 'viewed', 'su..."
3169,"[how, to, get, a, complete, workout, with, not...",Everyone knows exercise plays an important rol...,lifehacker 5839197 how to get a full body work...,"['get', 'complete', 'workout', 'nothing', 'bod..."


In [31]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['Title_final'],data['label'],test_size=0.3)

In [32]:
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(data['Title_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [33]:
print(Tfidf_vect.vocabulary_)

{'ibm': 2141, 'see': 3887, 'holographic': 2067, 'call': 632, 'air': 70, 'breathe': 533, 'battery': 325, 'fully': 1780, 'electronic': 1441, 'futuristic': 1791, 'start': 4217, 'gun': 1948, 'eliminate': 1447, 'advantage': 46, 'race': 3590, 'fruit': 1770, 'fight': 1611, 'flu': 1680, 'cold': 858, 'mens': 2717, 'health': 1997, 'tip': 4574, 'good': 1887, 'sleep': 4031, 'cool': 964, 'jersey': 2311, 'know': 2384, 'exist': 1529, 'treatment': 4654, 'fashion': 1579, 'lane': 2409, 'american': 126, 'wild': 4909, 'child': 764, 'dean': 1142, 'johnson': 2324, 'insidershealthcom': 2239, 'day': 1134, 'cooky': 963, 'cream': 1029, 'brownie': 566, 'sweet': 4394, 'business': 608, 'news': 2975, 'break': 522, 'international': 2263, 'cap': 658, 'great': 1911, 'iron': 2283, 'man': 2611, 'food': 1701, 'trash': 4650, 'teeth': 4482, 'french': 1755, 'onion': 3094, 'steak': 4223, 'red': 3637, 'wine': 4918, 'sauce': 3829, 'izabel': 2293, 'goulart': 1891, 'swimsuit': 4400, 'sport': 4182, 'illustrate': 2165, 'photo': 33

In [34]:
print(Train_X_Tfidf)

  (0, 3330)	0.3323042528348102
  (0, 3024)	0.5181133165537822
  (0, 1973)	0.4338345277550159
  (0, 1515)	0.35554033381477673
  (0, 964)	0.36704516782889446
  (0, 121)	0.41447435757771184
  (1, 3376)	0.13123691623085867
  (1, 3261)	0.12709493162658744
  (1, 1842)	0.4556521775687847
  (1, 254)	0.871208840204157
  (2, 852)	0.8146487700601522
  (2, 301)	0.5799546373980309
  (3, 4175)	0.4411471786107009
  (3, 3762)	0.6002654203117588
  (3, 759)	0.328480310621643
  (3, 98)	0.5806645137389359
  (4, 3630)	0.2449110370849363
  (4, 2969)	0.44296039441508595
  (4, 1701)	0.31456919495765395
  (4, 1428)	0.47439043379081375
  (4, 1274)	0.5243553283095872
  (4, 954)	0.3805996590623993
  (5, 4223)	0.7498095329521639
  (5, 3817)	0.6616537344367202
  (6, 4062)	0.5365261687753738
  :	:
  (5169, 1450)	0.27888438516663033
  (5169, 1268)	0.2630542952986061
  (5169, 707)	0.2499312284850412
  (5169, 389)	0.41730126840277554
  (5169, 247)	0.5200137089520835
  (5171, 875)	1.0
  (5172, 4473)	0.3228051401443184
 

In [35]:
# # seperate the independent and target variable on training data
# Train_X = data['Title_final']
# Train_Y = data['label']

# # independent variable on testing data
# Test_X = dataTest['Title_final']

In [70]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print('Naive Bayes Accuracy Score: ',accuracy_score(predictions_NB, Test_Y)*100)
print('Precision score: ', precision_score(predictions_NB, Test_Y)*100)
print('Recall score: ', recall_score(predictions_NB, Test_Y)*100)
print('Cross validation score for Naive Bayes: ', Naive.score(Test_X_Tfidf, predictions_NB))

Naive Bayes Accuracy Score:  76.25056331680938
Precision score:  79.47136563876653
Recall score:  75.4180602006689
Cross validation score for Naive Bayes:  1.0


In [71]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print('Precision score: ', precision_score(predictions_SVM, Test_Y)*100)
print('Recall score: ', recall_score(predictions_SVM, Test_Y)*100)
print('Cross validation score for SVM: ', SVM.score(Test_X_Tfidf, predictions_NB))

SVM Accuracy Score ->  76.92654348805769
Precision score:  69.69162995594714
Recall score:  82.48175182481752
Cross validation score for SVM:  0.8589454709328527


In [108]:
data_results = pd.DataFrame(list(predictions_SVM),columns=["Results"])

In [109]:
data_results

Unnamed: 0,Results
0,0
1,0
2,1
3,0
4,0
...,...
2214,0
2215,1
2216,0
2217,1


In [110]:
data.to_csv("data_results.csv")