In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [2]:
np.random.seed(500)

In [3]:
df_train = pd.read_csv('train.tsv',delimiter='\t')

In [4]:
# Data Cleaning on Train Data
# Splitting the text from title, body and urls
boilerplate_title = df_train["boilerplate"].str.split('"title":"', n = 1, expand = True)
boilerplate_title = boilerplate_title[1].str.split('"body":"', n = 1, expand = True)
boilerplate_body = boilerplate_title[1].str.split('"url":"', n = 1, expand = True)

In [5]:
df_train['boilerplate_title'] = boilerplate_title[0]
df_train['boilerplate_body'] = boilerplate_body[0]
df_train['boilerplate_urls'] = boilerplate_body[1]

In [6]:
df_train['boilerplate_title'] = df_train['boilerplate_title'].astype(str)
df_train['boilerplate_body'] = df_train['boilerplate_body'].astype(str)
df_train['boilerplate_urls'] = df_train['boilerplate_urls'].astype(str)

In [7]:
df_train['boilerplate_title'] = df_train['boilerplate_title'].str.replace('[^\w\s]','')
df_train['boilerplate_body'] = df_train['boilerplate_body'].str.replace('[^\w\s]','')
df_train['boilerplate_urls'] = df_train['boilerplate_urls'].str.replace('[^\w\s]','')

In [8]:
data = pd.DataFrame(list(zip(df_train['boilerplate_title'], df_train['boilerplate_body'],df_train['boilerplate_urls'],df_train['label'])), 
               columns =['Title', 'Body','Url','label'])

In [9]:
data

Unnamed: 0,Title,Body,Url,label
0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,Kno Raises 46 Million More To Build Most Power...,Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,Why I Miss College,Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,Sweet Potatoes Eat This Not That im eating th...,They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,Naturally Ella,,naturallyella,1


In [10]:
# Remove blank rows if any.
data = data.dropna()
data

Unnamed: 0,Title,Body,Url,label
0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,Kno Raises 46 Million More To Build Most Power...,Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,Why I Miss College,Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,Sweet Potatoes Eat This Not That im eating th...,They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,Naturally Ella,,naturallyella,1


In [11]:
# We are extracting Title only. We will later extract Body and url to check if it helps in the accuracy of model.
# We are not taking it hear to decrease the overfitting of the model.
# Data Cleaning of train data set
# Change all the text to lower case.
data['Title'] = [entry.lower() for entry in data['Title']]

In [12]:
# Tokenization : In this each entry will be broken into set of words
data['Title']= [word_tokenize(entry) for entry in data['Title']]

In [13]:
data

Unnamed: 0,Title,Body,Url,label
0,"[ibm, sees, holographic, calls, air, breathing...",A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0
1,"[the, fully, electronic, futuristic, starting,...",And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1
2,"[fruits, that, fight, the, flu, fruits, that, ...",Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1
3,"[10, foolproof, tips, for, better, sleep]",There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1
4,"[the, 50, coolest, jerseys, you, didn, t, know...",Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0
...,...,...,...,...
7390,"[kno, raises, 46, million, more, to, build, mo...",Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0
7391,"[why, i, miss, college]",Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0
7392,"[sweet, potatoes, eat, this, not, that, im, ea...",They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1
7393,"[naturally, ella]",,naturallyella,1


In [14]:
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

In [15]:
# Lemmatizing train Data
for index,entry in enumerate(data['Title']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    data.loc[index,'Title_final'] = str(Final_words)

In [16]:
data

Unnamed: 0,Title,Body,Url,label,Title_final
0,"[ibm, sees, holographic, calls, air, breathing...",A sign stands outside the International Busine...,bloomberg news 2010 12 23 ibm predicts hologra...,0,"['ibm', 'see', 'holographic', 'call', 'air', '..."
1,"[the, fully, electronic, futuristic, starting,...",And that can be carried on a plane without the...,popsci technology article 2012 07 electronic f...,1,"['fully', 'electronic', 'futuristic', 'start',..."
2,"[fruits, that, fight, the, flu, fruits, that, ...",Apples The most popular source of antioxidants...,menshealth health flu fighting fruits cm mmc F...,1,"['fruit', 'fight', 'flu', 'fruit', 'fight', 'f..."
3,"[10, foolproof, tips, for, better, sleep]",There was a period in my life when I had a lot...,dumblittleman 2007 12 10 foolproof tips for be...,1,"['foolproof', 'tip', 'good', 'sleep']"
4,"[the, 50, coolest, jerseys, you, didn, t, know...",Jersey sales is a curious business Whether you...,bleacherreport articles 1205138 the 50 coolest...,0,"['cool', 'jersey', 'know', 'exist', 'cool', 'j..."
...,...,...,...,...,...
7390,"[kno, raises, 46, million, more, to, build, mo...",Marc Andreessen is normally enthusiastic about...,techcrunch 2010 09 08 kno raises 46 million mo...,0,"['kno', 'raise', 'million', 'build', 'powerful..."
7391,"[why, i, miss, college]",Mar 30 2009 I d like to congratulate Jane on h...,uncoached category why i miss college,0,"['miss', 'college']"
7392,"[sweet, potatoes, eat, this, not, that, im, ea...",They re loaded with vitamin C which smoothes o...,eatthis menshealth slide sweet potatoes slides...,1,"['sweet', 'potato', 'eat', 'im', 'eat']"
7393,"[naturally, ella]",,naturallyella,1,"['naturally', 'ella']"


In [17]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['Title_final'],data['label'],test_size=0.3)

In [18]:
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(data['Title_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [19]:
print(Tfidf_vect.vocabulary_)

{'ibm': 3920, 'see': 7109, 'holographic': 3797, 'call': 1168, 'air': 131, 'breathe': 995, 'battery': 628, 'airbreathing': 133, 'fully': 3206, 'electronic': 2548, 'futuristic': 3227, 'start': 7666, 'gun': 3544, 'eliminate': 2556, 'advantage': 94, 'race': 6537, 'fruit': 3185, 'fight': 2932, 'flu': 3029, 'cold': 1602, 'mens': 5088, 'health': 3655, 'foolproof': 3073, 'tip': 8247, 'good': 3412, 'sleep': 7385, 'cool': 1764, 'jersey': 4266, 'know': 4468, 'exist': 2734, 'havent': 3636, 'genital': 3311, 'herpes': 3728, 'treatment': 8382, 'fashion': 2841, 'lane': 4544, 'american': 233, 'wild': 8909, 'child': 1415, 'recovery': 6642, 'dean': 2049, 'johnson': 4299, 'insidershealthcom': 4081, 'valet': 8617, 'handbook': 3591, 'day': 2035, 'cooky': 1763, 'cream': 1858, 'brownie': 1047, 'sweet': 7937, 'business': 1113, 'financial': 2949, 'news': 5512, 'break': 983, 'international': 4125, 'cap': 1215, 'great': 3472, 'iron': 4171, 'man': 4901, 'food': 3059, 'trash': 8371, 'teeth': 8091, 'french': 3158, '

In [20]:
print(Train_X_Tfidf)

  (0, 6129)	0.3323042528348102
  (0, 5612)	0.5181133165537822
  (0, 3601)	0.4338345277550159
  (0, 2699)	0.35554033381477673
  (0, 1764)	0.36704516782889446
  (0, 221)	0.41447435757771184
  (1, 6202)	0.12426580310779613
  (1, 6016)	0.12034383467016178
  (1, 3332)	0.14381620001849052
  (1, 3331)	0.28763240003698104
  (1, 3330)	0.43144860005547153
  (1, 522)	0.8249315003115328
  (2, 1588)	0.8146487700601522
  (2, 594)	0.5799546373980309
  (3, 7591)	0.37360575898654413
  (3, 6859)	0.508362353478535
  (3, 5929)	0.5317586384963808
  (3, 1409)	0.27818864477026
  (3, 187)	0.49176242508269646
  (4, 6628)	0.2449110370849363
  (4, 5499)	0.44296039441508595
  (4, 3059)	0.31456919495765395
  (4, 2523)	0.47439043379081375
  (4, 2263)	0.5243553283095872
  (4, 1749)	0.3805996590623993
  :	:
  (5169, 1311)	0.2099819585040761
  (5169, 740)	0.3505993955080907
  (5169, 511)	0.4368941717151706
  (5171, 1633)	0.39935620166197816
  (5171, 1173)	0.9167958465187969
  (5172, 8367)	0.4946613852729461
  (5172, 8

In [21]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print('Naive Bayes Accuracy Score: ',accuracy_score(predictions_NB, Test_Y)*100)
print('Precision score: ', precision_score(predictions_NB, Test_Y)*100)
print('Recall score: ', recall_score(predictions_NB, Test_Y)*100)
print('Cross validation score for Naive Bayes: ', Naive.score(Test_X_Tfidf, predictions_NB))

Naive Bayes Accuracy Score:  75.84497521406038
Precision score:  80.35242290748899
Recall score:  74.44897959183675
Cross validation score for Naive Bayes:  1.0


In [22]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)
print('Precision score: ', precision_score(predictions_SVM, Test_Y)*100)
print('Recall score: ', recall_score(predictions_SVM, Test_Y)*100)
print('Cross validation score for SVM: ', SVM.score(Test_X_Tfidf, predictions_NB))

SVM Accuracy Score ->  76.56602073005858
Precision score:  68.98678414096916
Recall score:  82.33438485804416
Cross validation score for SVM:  0.8548895899053628


In [23]:
data_results = pd.DataFrame(list(predictions_SVM),columns=["Results"])

In [24]:
data_results

Unnamed: 0,Results
0,0
1,0
2,1
3,0
4,1
...,...
2214,0
2215,1
2216,0
2217,1


In [25]:
data.to_csv("data_results.csv")