In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn import metrics

In [2]:
news_data= pd.read_csv('train.csv')
news_data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
news_data.shape
news_data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [4]:
news_data = news_data.fillna('')

In [5]:
news_data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [6]:
news_data['content'] = news_data['author'] + ' ' + news_data['title']
news_data['content'].head()

0    Darrell Lucus House Dem Aide: We Didn’t Even S...
1    Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2    Consortiumnews.com Why the Truth Might Get You...
3    Jessica Purkiss 15 Civilians Killed In Single ...
4    Howard Portnoy Iranian woman jailed for fictio...
Name: content, dtype: object

In [7]:
X = news_data.drop(columns='label', axis=1)
Y = news_data['label']

In [8]:
X.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...


In [9]:
Y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [10]:
port_stem = PorterStemmer()

In [11]:
message = []
for i in range(0,len(X)):
    stemmed_content = re.sub('[^a-zA-Z]',' ', X['content'][i])
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    message.append(stemmed_content)

In [12]:
message[10]

'aaron klein obama organ action partner soro link indivis disrupt trump agenda'

In [13]:
## TFidf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_v=TfidfVectorizer(max_features=5000,ngram_range=(1,3))
X = tfidf_v.fit_transform(message).toarray()

In [14]:
X.shape

(20800, 5000)

In [15]:
Y.shape

(20800,)

In [16]:
## Divide the dataset into Train and Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=0)

In [17]:
tfidf_v.get_feature_names()[:20]

['aaron',
 'aaron carrol',
 'aaron kesel',
 'aaron klein',
 'aaron klein ali',
 'abandon',
 'abbi',
 'abbi goodnough',
 'abc',
 'abduct',
 'abe',
 'abedin',
 'abelson',
 'abort',
 'abram',
 'absolut',
 'abus',
 'accept',
 'access',
 'access pipelin']

In [18]:
count_df = pd.DataFrame(X_train, columns=tfidf_v.get_feature_names())
count_df.head()

Unnamed: 0,aaron,aaron carrol,aaron kesel,aaron klein,aaron klein ali,abandon,abbi,abbi goodnough,abc,abduct,...,zika,zika viru,zionist,zone,zraick,zraick sandra,zraick sandra stevenson,zu,zuckerberg,zuess
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()

In [21]:
classifier.fit(X_train, y_train)
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.946


In [23]:
test_data = pd.read_csv('Test.csv')

In [28]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [25]:
test_data.shape

(5200, 4)

In [31]:
test_data['content'] = test_data['author'] + ' ' + test_data['title']
test_data['content'].head()

0    David Streitfeld Specter of Trump Loosens Tong...
1                                                  NaN
2    Common Dreams #NoDAPL: Native American Leaders...
3    Daniel Victor Tim Tebow Will Attempt Another C...
4    Truth Broadcast Network Keiser Report: Meme Wa...
Name: content, dtype: object

In [32]:
test_content = test_data.drop(columns='title', axis=1)

In [33]:
test_content.head()

Unnamed: 0,id,author,text,content
0,20800,David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",David Streitfeld Specter of Trump Loosens Tong...
1,20801,,Russian warships ready to strike terrorists ne...,
2,20802,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,Common Dreams #NoDAPL: Native American Leaders...
3,20803,Daniel Victor,"If at first you don’t succeed, try a different...",Daniel Victor Tim Tebow Will Attempt Another C...
4,20804,Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Truth Broadcast Network Keiser Report: Meme Wa...


In [36]:
test_content.isnull().sum()

id           0
author     503
text         7
content    625
dtype: int64

In [38]:
test_content = test_content.fillna('')
test_content.isnull().sum()

id         0
author     0
text       0
content    0
dtype: int64

In [39]:
port_stem = PorterStemmer()
test_list = []
for i in range(0,len(test_content)):
    content = re.sub('[^a-zA-Z]',' ', test_content['content'][i])
    content = content.lower()
    content = content.split()
    content = [port_stem.stem(word) for word in content if not word in stopwords.words('english')]
    content = ' '.join(content)
    test_list.append(content)

In [40]:
tfidf=TfidfVectorizer(max_features=5000,ngram_range=(1,3))
test_vec = tfidf.fit_transform(test_list).toarray()

In [42]:
pred_1 = classifier.predict(test_vec)
len(pred_1)

5200

In [43]:
pred_1

array([0, 0, 0, ..., 0, 0, 1])

In [44]:
result = pd.DataFrame(pred_1) #preditcions are nothing but the final predictions of your model on input features of your new unseen test data
result.index = test_data.index # its important for comparison. Here "test_data" is your new test dataset
result.columns = ["prediction"]
result.to_csv("prediction_results.csv", index = False)      # the csv file will be saved locally on the same location where this 