### Imporing dataset

In [1]:
import numpy as np
import pandas as pd
data=pd.read_csv("dataset/train.csv")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### Extracting Dependent and Independent Variables

In [2]:
X=data.drop("label",axis=1)
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [3]:
y=data['label']
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

### Preprocessing

In [4]:
# checking null values
print(data.isnull().values.any())
print(data.shape)

True
(20800, 5)


In [5]:
# dropping null values
data=data.dropna()
print(data.shape)

(18285, 5)


In [6]:
# resetting index (beacuse certain rows has been deleted)
data=data.reset_index()

In [7]:
messages=X['title']
messages[0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [8]:
X['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

### Preprocessing, Bag of Words

In [9]:
# Necessary import statements
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [10]:
# Further preprocessing and stemming
stemmer= PorterStemmer()
corpus=[]
for i in range(0,len(messages)):
    msg=re.sub('[^a-zA-Z]', ' ',str(messages[i]))
    msg=msg.lower()
    msg=msg.split()
    msg=[stemmer.stem(word) for word in msg  if not word in stopwords.words('english')]
    msg=' '.join(msg)
    corpus.append(msg)

In [11]:
corpus[0]

'hous dem aid even see comey letter jason chaffetz tweet'

In [12]:
# Creating Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000, ngram_range=(1,3))
X=cv.fit_transform(corpus).toarray()

In [13]:
X.shape

(20800, 5000)

In [14]:
cv.get_feature_names_out()[:20]

array(['abandon', 'abc', 'abc news', 'abduct', 'abe', 'abedin', 'abl',
       'abort', 'abroad', 'absolut', 'absurd', 'abus', 'abus new',
       'abus new york', 'academi', 'accept', 'access', 'access pipelin',
       'access pipelin protest', 'accid'], dtype=object)

In [15]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [16]:
cv_df= pd.DataFrame(X,columns=cv.get_feature_names_out())
cv_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train Test Split

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.30,random_state=123)

### MultinomialNB Classifier

In [18]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train,y_train)

In [19]:
y_pred=model.predict(X_test)

### Confusion Matrix, Accuracy score

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score
matrix= confusion_matrix(y_test,y_pred)
matrix

array([[2826,  328],
       [ 196, 2890]], dtype=int64)

In [21]:
accuracy= accuracy_score(y_test,y_pred)
accuracy

0.916025641025641

### Multinomial Classifier with Hyperparameters

In [36]:
model_2= MultinomialNB(alpha=0.1)

In [37]:
previous_accuracy=0
for alpha in np.arange(0,1,0.1):
    sub_model=MultinomialNB(alpha=alpha)
    sub_model.fit(X_train,y_train)
    y_pred_2=sub_model.predict(X_test)
    accuracy_2=accuracy_score(y_test,y_pred_2)
    if accuracy_2>previous_accuracy:
        model_2=sub_model
    print("Alpha: {}, Score: {}".format(alpha,accuracy_2))



Alpha: 0.0, Score: 0.902724358974359
Alpha: 0.1, Score: 0.9136217948717948
Alpha: 0.2, Score: 0.9152243589743589
Alpha: 0.30000000000000004, Score: 0.9157051282051282
Alpha: 0.4, Score: 0.9157051282051282
Alpha: 0.5, Score: 0.9150641025641025
Alpha: 0.6000000000000001, Score: 0.9152243589743589
Alpha: 0.7000000000000001, Score: 0.9147435897435897
Alpha: 0.8, Score: 0.9153846153846154
Alpha: 0.9, Score: 0.9155448717948718


In [38]:
model.feature_log_prob_[0]

array([ -8.98729909,  -9.30575282, -10.28658207, ...,  -9.43928421,
       -11.38519436,  -9.59343489])

In [39]:
# Most Fake words
sorted(zip(model_2.feature_log_prob_[0], cv.get_feature_names_out()))[:20]

[(-11.484857439856956, 'access pipelin protest'),
 (-11.484857439856956, 'accus trump'),
 (-11.484857439856956, 'achiev'),
 (-11.484857439856956, 'acknowledg emf'),
 (-11.484857439856956, 'acknowledg emf damag'),
 (-11.484857439856956, 'acquit'),
 (-11.484857439856956, 'adhd'),
 (-11.484857439856956, 'airstrik kill'),
 (-11.484857439856956, 'al nusra'),
 (-11.484857439856956, 'alaska'),
 (-11.484857439856956, 'america finest'),
 (-11.484857439856956, 'america finest news'),
 (-11.484857439856956, 'america last'),
 (-11.484857439856956, 'american concern'),
 (-11.484857439856956, 'american concern elect'),
 (-11.484857439856956, 'american lookout'),
 (-11.484857439856956, 'american peopl defeat'),
 (-11.484857439856956, 'american polit'),
 (-11.484857439856956, 'arriv bosanski'),
 (-11.484857439856956, 'arriv bosanski prijevod')]

In [40]:
# Most real words
sorted(zip(model_2.feature_log_prob_[0], cv.get_feature_names_out()),reverse=True)[:20]

[(-2.9381270214049486, 'new'),
 (-2.9817857108249317, 'time'),
 (-2.9883432596491186, 'york'),
 (-2.9885701507877247, 'new york'),
 (-3.0029690831081552, 'york time'),
 (-3.0029690831081552, 'new york time'),
 (-3.968605509935381, 'breitbart'),
 (-4.0049306068704205, 'trump'),
 (-5.299792588241739, 'donald'),
 (-5.304380804595537, 'donald trump'),
 (-5.748643629565686, 'say'),
 (-6.032865697372542, 'clinton'),
 (-6.081679682692775, 'obama'),
 (-6.154288029471731, 'state'),
 (-6.15968178580605, 'presid'),
 (-6.176039838675052, 'report'),
 (-6.215282542174573, 'hous'),
 (-6.311221850656459, 'brief'),
 (-6.3695285190905775, 'attack'),
 (-6.382960554482375, 'hillari')]

### Passive Aggressive Classifier Algorithm

In [42]:
from sklearn.linear_model import PassiveAggressiveClassifier
model_3=PassiveAggressiveClassifier(max_iter=1000)
model_3.fit(X_train,y_train)

In [43]:
y_pred_3= model_3.predict(X_test)

### Confusion Matrix, Accuracy Score

In [44]:
matrix_3=confusion_matrix(y_test,y_pred_3)
matrix_3

array([[2918,  236],
       [ 228, 2858]], dtype=int64)

In [45]:
accuracy_3=accuracy_score(y_test,y_pred_3)
accuracy_3

0.9256410256410257