### Imporing dataset

In [2]:
import numpy as np
import pandas as pd
data=pd.read_csv("dataset/train.csv")
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### Extracting Dependent and Independent Variables

In [3]:
X=data.drop("label",axis=1)
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


In [4]:
y=data['label']
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

### Preprocessing

In [5]:
# checking null values
print(data.isnull().values.any())
print(data.shape)

True
(20800, 5)


In [6]:
data.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# dropping null values
data=data.dropna()
print(data.shape)

(18285, 5)


In [8]:
# resetting index (beacuse certain rows has been deleted)
data=data.reset_index()

In [9]:
messages=X['title']
messages[0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [10]:
X['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

### Preprocessing, Bag of Words

In [11]:
# Necessary import statements
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [12]:
# Further preprocessing and stemming
stemmer= PorterStemmer()
corpus=[]
for i in range(0,len(messages)):
    msg=re.sub('[^a-zA-Z]', ' ',str(messages[i]))
    msg=msg.lower()
    msg=msg.split()
    msg=[stemmer.stem(word) for word in msg  if not word in stopwords.words('english')]
    msg=' '.join(msg)
    corpus.append(msg)

In [33]:
print(messages[0])
corpus[0]

House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It


'hous dem aid even see comey letter jason chaffetz tweet'

In [14]:
# Creating Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000, ngram_range=(1,3))
X=cv.fit_transform(corpus).toarray()

In [15]:
X.shape

(20800, 5000)

In [16]:
cv.get_feature_names_out()[:20]

array(['abandon', 'abc', 'abc news', 'abduct', 'abe', 'abedin', 'abl',
       'abort', 'abroad', 'absolut', 'absurd', 'abus', 'abus new',
       'abus new york', 'academi', 'accept', 'access', 'access pipelin',
       'access pipelin protest', 'accid'], dtype=object)

In [17]:
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [34]:
cv_df= pd.DataFrame(X,columns=cv.get_feature_names_out())
cv_df.head()

Unnamed: 0,abandon,abc,abc news,abduct,abe,abedin,abl,abort,abroad,absolut,...,zero,zika,zika viru,zionist,zone,zone new,zone new york,zoo,zu,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.30,random_state=123)

### MultinomialNB Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(X_train,y_train)

In [27]:
model.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': 'warn'}

In [21]:
y_pred=model.predict(X_test)

### Confusion Matrix, Accuracy score

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score
matrix= confusion_matrix(y_test,y_pred)
accuracy= accuracy_score(y_test,y_pred)
print("Confusion Matrix:")
print(matrix)
print("\nAccuracy Score: {:.2f}%".format(accuracy*100))

Confusion Matrix:
[[2826  328]
 [ 196 2890]]

Accuracy Score: 91.60%


### Multinomial Classifier with Hyperparameters

In [24]:
model_2= MultinomialNB(alpha=1)

In [28]:
previous_accuracy=0
for alpha in np.arange(0,10,1):
    sub_model=MultinomialNB(alpha=alpha)
    sub_model.fit(X_train,y_train)
    y_pred_2=sub_model.predict(X_test)
    accuracy_2=accuracy_score(y_test,y_pred_2)
    if accuracy_2>previous_accuracy:
        model_2=sub_model
    print("Alpha: {}, Accuracy Score: {:.2f}%".format(alpha,accuracy_2*100))



Alpha: 0, Accuracy Score: 90.27%
Alpha: 1, Accuracy Score: 91.60%
Alpha: 2, Accuracy Score: 91.65%
Alpha: 3, Accuracy Score: 91.65%
Alpha: 4, Accuracy Score: 91.60%
Alpha: 5, Accuracy Score: 91.60%
Alpha: 6, Accuracy Score: 91.59%
Alpha: 7, Accuracy Score: 91.46%
Alpha: 8, Accuracy Score: 91.46%
Alpha: 9, Accuracy Score: 91.38%


In [24]:
model.feature_log_prob_[0]

array([ -8.98729909,  -9.30575282, -10.28658207, ...,  -9.43928421,
       -11.38519436,  -9.59343489])

In [32]:
# Most Fake words
sorted(zip(model_2.feature_log_prob_[0], cv.get_feature_names_out()))[:20]

[(-9.562631275593727, 'access pipelin protest'),
 (-9.562631275593727, 'accus trump'),
 (-9.562631275593727, 'achiev'),
 (-9.562631275593727, 'acknowledg emf'),
 (-9.562631275593727, 'acknowledg emf damag'),
 (-9.562631275593727, 'acquit'),
 (-9.562631275593727, 'adhd'),
 (-9.562631275593727, 'airstrik kill'),
 (-9.562631275593727, 'al nusra'),
 (-9.562631275593727, 'alaska'),
 (-9.562631275593727, 'america finest'),
 (-9.562631275593727, 'america finest news'),
 (-9.562631275593727, 'america last'),
 (-9.562631275593727, 'american concern'),
 (-9.562631275593727, 'american concern elect'),
 (-9.562631275593727, 'american lookout'),
 (-9.562631275593727, 'american peopl defeat'),
 (-9.562631275593727, 'american polit'),
 (-9.562631275593727, 'arriv bosanski'),
 (-9.562631275593727, 'arriv bosanski prijevod')]

In [31]:
# Most real words
sorted(zip(model_2.feature_log_prob_[0], cv.get_feature_names_out()),reverse=True)[:20]

[(-3.3167398649100246, 'new'),
 (-3.360320704981943, 'time'),
 (-3.366866264973016, 'york'),
 (-3.367092739891886, 'new york'),
 (-3.3814650643941686, 'york time'),
 (-3.3814650643941686, 'new york time'),
 (-4.344078877514553, 'breitbart'),
 (-4.3802237003203945, 'trump'),
 (-5.661781570763707, 'donald'),
 (-5.666286082884811, 'donald trump'),
 (-6.100373637170326, 'say'),
 (-6.375360790140858, 'clinton'),
 (-6.422317773228629, 'obama'),
 (-6.491997693866619, 'state'),
 (-6.4971656640250615, 'presid'),
 (-6.512831780769461, 'report'),
 (-6.550369700088526, 'hous'),
 (-6.641862040513192, 'brief'),
 (-6.69726081990298, 'attack'),
 (-6.70999984568041, 'hillari')]

### Passive Aggressive Classifier Algorithm

In [35]:
from sklearn.linear_model import PassiveAggressiveClassifier
model_3=PassiveAggressiveClassifier(max_iter=1000)
model_3.fit(X_train,y_train)

In [36]:
y_pred_3= model_3.predict(X_test)

### Confusion Matrix, Accuracy Score

In [39]:
# Calculate the confusion matrix
matrix_3= confusion_matrix(y_test,y_pred_3)

# Calculate the accuracy score
accuracy_3= accuracy_score(y_test,y_pred_3)

print("Confusion Matrix:")
print(matrix_3)
print("\nAccuracy Score: {:.2f}%".format(accuracy_3*100))

Confusion Matrix:
[[2921  233]
 [ 201 2885]]

Accuracy Score: 93.04%
