In [1]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
import itertools
from sklearn.linear_model import PassiveAggressiveClassifier



In [2]:
#Reading data as pandas dataframe
frame=pd.read_excel('Train_set.xlsx')

In [3]:
frame.shape
frame.head()

Unnamed: 0,title,text,label
0,This man claims to be Pakistan's answer to the...,This man claims to be Pakistan's answer to the...,Fake
1,Grade 20 officer steals Kuwaiti delegate’s wal...,ISLAMABAD: The government was exposed to extre...,Fake
2,Paid day-to-day expenses of PM House from my o...,Incarcerated former prime minister Nawaz Shari...,Fake
3,Rimal Khan and Her Alleged Relation With Imran...,"Rimal Ali, a female Transgender disowned by fa...",Fake
4,Major oil find near Pak-Iran border,KARACHI: Minister for Maritime Affairs and For...,Fake


In [4]:
y = frame.label
y.head()

0    Fake
1    Fake
2    Fake
3    Fake
4    Fake
Name: label, dtype: object

In [5]:
X_train, X_test, y_train, y_test = train_test_split(frame['text'], y, test_size=0.33, random_state=53)

In [6]:
X_train.head()

337    United States President Donald Trump and China...
174    A CLEAR message of unity has been sent out by ...
343    US Secretary of State Mike Pompeo telephoned P...
66     Murtaza Wahab, Adviser to Sindh Chief Minister...
202    Prime Minister Imran Khan has ordered razing w...
Name: text, dtype: object

In [7]:
y_train.head()

337    True 
174    True 
343     Fake
66     True 
202    True 
Name: label, dtype: object

In [8]:
# Initialize the `count_vectorizer` 
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the training data.
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [9]:

# Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7) 

# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 

# Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

In [10]:
print(tfidf_test)

  (0, 8083)	0.08482154912083911
  (0, 7973)	0.11686326976317357
  (0, 7847)	0.08209920440765768
  (0, 7764)	0.18696383930538085
  (0, 7516)	0.10749166724599923
  (0, 7465)	0.06092077904484056
  (0, 7049)	0.09147080692483202
  (0, 7046)	0.08342031915787501
  (0, 6719)	0.19624012945764982
  (0, 6561)	0.06712009082142824
  (0, 6326)	0.12351252756716646
  (0, 6216)	0.08962076255427016
  (0, 6096)	0.11686326976317357
  (0, 6009)	0.09568484440035645
  (0, 5737)	0.047912999543699625
  (0, 5712)	0.08631324188318212
  (0, 5678)	0.11170570472152366
  (0, 5415)	0.04880781641585067
  (0, 5319)	0.23899195223756767
  (0, 5280)	0.20168481888401266
  (0, 5154)	0.11686326976317357
  (0, 5011)	0.10749166724599923
  (0, 4952)	0.1405847631240298
  (0, 4940)	0.26576826016868155
  (0, 4871)	0.07272760189048337
  :	:
  (113, 1257)	0.04321297281075771
  (113, 1202)	0.03846648475561793
  (113, 1179)	0.04178063596927824
  (113, 1171)	0.04490706772545101
  (113, 1154)	0.13472120317635303
  (113, 1113)	0.04490706

In [11]:
# Get the feature names of `tfidf_vectorizer` 
print(tfidf_vectorizer.get_feature_names()[-10:])

['zone', 'zones', 'zoological', 'zor', 'zsl', 'zubair', 'zuckerberg', 'zulfi', 'zulfikar', 'zulfiqar']


In [12]:
# Get the feature names of `count_vectorizer` 
print(count_vectorizer.get_feature_names()[0:10])

['00', '000', '000ft', '01', '025', '049', '09', '0yk', '10', '100']


In [13]:

count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())
difference = set(count_df.columns) - set(tfidf_df.columns)
difference
set()
print(count_df.equals(tfidf_df))
count_df.head()

False


Unnamed: 0,00,000,000ft,01,025,049,09,0yk,10,100,...,zone,zones,zoological,zor,zsl,zubair,zuckerberg,zulfi,zulfikar,zulfiqar
0,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [21]:
clf = MultinomialNB() 
clf.fit(tfidf_train, y_train)
pred = clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.702


In [22]:
clf = MultinomialNB() 
clf.fit(count_train, y_train)
pred = clf.predict(count_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.702


In [24]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)
linear_clf.fit(tfidf_train, y_train)
pred = linear_clf.predict(tfidf_test)
score = accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
#cm = confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
#plot_confusion_matrix(cm, classes=['FAKE', 'REAL'])

accuracy:   0.667




In [25]:
def most_informative_feature_for_binary_classification(vectorizer, classifier, n=100):
    """
    See: https://stackoverflow.com/a/26980472
    
    Identify most important features if given a vectorizer and binary classifier. Set n to the number
    of weighted features you would like to show. (Note: current implementation merely prints and does not 
    return top classes.)
    """

    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    topn_class1 = sorted(zip(classifier.coef_[0], feature_names))[:n]
    topn_class2 = sorted(zip(classifier.coef_[0], feature_names))[-n:]

    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)

    print()

    for coef, feat in reversed(topn_class2):
        print(class_labels[1], coef, feat)


most_informative_feature_for_binary_classification(tfidf_vectorizer, linear_clf, n=30)

Fake -2.7035907991957076 rainfall
Fake -2.401348042042769 tlv
Fake -2.167448370705674 bisp
Fake -2.1088987521870526 flooded
Fake -2.0444607605555123 scharf
Fake -1.9088310058191869 asked
Fake -1.746638532693724 flew
Fake -1.7255318309363212 week
Fake -1.6409591937064505 stated
Fake -1.6051559136939335 release
Fake -1.5942175960741456 nawaz
Fake -1.5895347847308166 verify
Fake -1.5403799377438503 new
Fake -1.5269868296532525 tweet
Fake -1.3736276302231138 affecting
Fake -1.3083507950398559 furthermore
Fake -1.2973087134150798 urged
Fake -1.2811519386062502 claiming
Fake -1.2753521650002329 check
Fake -1.261784149332416 website
Fake -1.186786211200096 nab
Fake -1.1508625364995273 forbidden
Fake -1.1233846696117782 country
Fake -1.1032882375048874 results
Fake -1.1014203343810767 mr
Fake -1.0947441640749305 information
Fake -1.0604227854275938 sharif
Fake -1.0246095278033158 israel
Fake -1.014792637005093 travelling
Fake -0.9811139594596564 islamabad

True  1.9170665519270231 effecting
Tr

In [26]:
feature_names = tfidf_vectorizer.get_feature_names()
sorted(zip(clf.coef_[0], feature_names), reverse=True)[:20]

[(-4.4807466538476275, 'saiful'),
 (-4.630091942399948, 'pakistani'),
 (-5.090743466075403, 'minister'),
 (-5.106829603827028, 'government'),
 (-5.599797610193011, 'khan'),
 (-5.794585935752096, 'prince'),
 (-5.849848614427145, 'china'),
 (-5.957737576438331, 'policemen'),
 (-6.023120335701182, 'court'),
 (-6.064505551864037, 'peoples'),
 (-6.0930789243080925, 'islamabad'),
 (-6.107677723729245, 'country'),
 (-6.152798159009715, 'imran'),
 (-6.18405070251382, 'chief'),
 (-6.2496479849996325, 'media'),
 (-6.2496479849996325, 'foreign'),
 (-6.266742418358933, 'news'),
 (-6.266742418358933, 'billion'),
 (-6.284134161070802, 'economic'),
 (-6.3018337381702025, 'tlv')]

In [27]:

### Most fake
sorted(zip(clf.coef_[0], feature_names))[:20]

[(-10.327185428905352, '025'),
 (-10.327185428905352, '09'),
 (-10.327185428905352, '0yk'),
 (-10.327185428905352, '1000'),
 (-10.327185428905352, '10000'),
 (-10.327185428905352, '101'),
 (-10.327185428905352, '10m'),
 (-10.327185428905352, '116'),
 (-10.327185428905352, '118'),
 (-10.327185428905352, '119'),
 (-10.327185428905352, '120'),
 (-10.327185428905352, '122'),
 (-10.327185428905352, '12800'),
 (-10.327185428905352, '13th'),
 (-10.327185428905352, '140m'),
 (-10.327185428905352, '144'),
 (-10.327185428905352, '145'),
 (-10.327185428905352, '151'),
 (-10.327185428905352, '159'),
 (-10.327185428905352, '1600')]

In [28]:
tokens_with_weights = sorted(list(zip(feature_names, clf.coef_[0])))
for i in tokens_with_weights:
    print(i)
    break

('00', -9.228573140237243)
