In [1]:
import pandas as pd
import numpy as np
import re
import en_core_web_sm
from ydata_profiling import ProfileReport
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('news_dataset_2.csv', error_bad_lines = False)
df.head(5)



  df = pd.read_csv('news_dataset_2.csv', error_bad_lines = False)


Unnamed: 0,title,text,subject,date,is_fake
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,News,"February 4, 2017",1
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",US_News,"October 29, 2016",1
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,left-news,"Apr 3, 2017",1
3,TRUMP CHIEF OF STAFF Goes At It With Liberal H...,,politics,"Jan 29, 2017",1
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,politics,"Nov 10, 2017",1


In [3]:
ProfileReport(df, title = 'Profiling Report')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
df.drop(['date', 'subject'], axis = 1, inplace = True)
df.head(5)

Unnamed: 0,title,text,is_fake
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,1
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",1
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,1
3,TRUMP CHIEF OF STAFF Goes At It With Liberal H...,,1
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,1


In [5]:
df.isnull().sum()

title      0
text       0
is_fake    0
dtype: int64

In [6]:
df.shape

(12000, 3)

In [7]:
df = df[df['text'] != ' ']
df.shape

(11851, 3)

In [8]:
df.size

35553

In [9]:
df = df.drop_duplicates()
df.size

34347

In [10]:
df['title with text'] = df['title'] + df['text']
df.head(5)

Unnamed: 0,title,text,is_fake,title with text
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,1,WATCH: Hypocrite Mike Pence Calls Democratic ...
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,1,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,1,PRICELESS! What Nancy Pelosi Just Said About T...
5,LOL! Democrat Congressman Says Best Way To Fig...,"MSNBC host asks Congressman Ted Leiu, a Democr...",1,LOL! Democrat Congressman Says Best Way To Fig...


In [11]:
def preprocess(text):
  text = text.lower()
  text = re.sub(r'[^a-z]', ' ', text)
  text = re.sub(r'  ', ' ', text)
  stop_words = [
                    'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and',
                    'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being',
                    'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn',
                    "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during',
                    'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't",
                    'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                    'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its',
                    'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
                    "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                    'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
                    're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn',
                    "shouldn't", 'so', 'some', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
                    'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to',
                    'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', "wasn't", 'we', 'were', 'weren',
                    "weren't", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
                    'won', "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll", "you're", "you've",
                    'your', 'yours', 'yourself', 'yourselves'
               ]

  words = text.split()
  words = [word for word in words if word not in stop_words]
  return ' '.join(words)

In [12]:
df['title with text'] = df['title with text'].apply(preprocess)
df.head(5)

Unnamed: 0,title,text,is_fake,title with text
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,1,watch hypocrite mike pence calls democratic ob...
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",1,ammon ryan bundy found guilty oregon federal c...
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,1,watch hilarious video proves cnn even bother v...
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,1,priceless nancy pelosi said trump backfired bi...
5,LOL! Democrat Congressman Says Best Way To Fig...,"MSNBC host asks Congressman Ted Leiu, a Democr...",1,lol democrat congressman says best way fight f...


In [13]:
def lem_text(text):
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    lemmatized_words = [token.lemma_ for token in doc]
    return lemmatized_words

In [14]:
df['title with text'] = df['title with text'].apply(lem_text)
df.head(5)

Unnamed: 0,title,text,is_fake,title with text
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,1,"[watch, hypocrite, mike, pence, call, democrat..."
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",1,"[ammon, ryan, bundy, find, guilty, oregon, fed..."
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,1,"[watch, hilarious, video, prove, cnn, even, bo..."
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,1,"[priceless, nancy, pelosi, say, trump, backfir..."
5,LOL! Democrat Congressman Says Best Way To Fig...,"MSNBC host asks Congressman Ted Leiu, a Democr...",1,"[lol, democrat, congressman, say, good, way, f..."


In [15]:
text_for_training = df['title with text'].to_list()

In [16]:
model = Word2Vec(text_for_training, vector_size = 100, window = 5, min_count = 2, workers = 3, sg = 0, epochs = 20)

In [17]:
df['vector'] = df['title with text'].apply(lambda x: model.wv.get_mean_vector(x, ignore_missing = True))
df.head(5)

Unnamed: 0,title,text,is_fake,title with text,vector
0,WATCH: Hypocrite Mike Pence Calls Democratic ...,This is unbelievably outrageous.Republicans ar...,1,"[watch, hypocrite, mike, pence, call, democrat...","[-0.005368908, 0.013794179, 0.0033937022, -0.0..."
1,Ammon and Ryan Bundy Found ‘Not Guilty’ in Ore...,"21st Century Wire Yesterday, Judge Anna Brown ...",1,"[ammon, ryan, bundy, find, guilty, oregon, fed...","[0.020560464, 0.036553193, -0.006683607, -0.01..."
2,WATCH: HILARIOUS Video Proves CNN Doesn’t Even...,Watch these hilarious examples of CNN having r...,1,"[watch, hilarious, video, prove, cnn, even, bo...","[0.011666809, 0.07191304, 0.014804526, -0.0128..."
4,PRICELESS! What Nancy Pelosi Just Said About T...,Nancy Pelosi is obviously geographically chall...,1,"[priceless, nancy, pelosi, say, trump, backfir...","[-0.0072137616, 0.016117265, 0.03414256, 0.006..."
5,LOL! Democrat Congressman Says Best Way To Fig...,"MSNBC host asks Congressman Ted Leiu, a Democr...",1,"[lol, democrat, congressman, say, good, way, f...","[0.03215446, 0.06676074, 0.025591323, 0.013753..."


In [18]:
y = df['is_fake'].to_list()
X = df['vector'].to_list()

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [20]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1777
           1       0.96      0.93      0.94      1658

    accuracy                           0.95      3435
   macro avg       0.95      0.95      0.95      3435
weighted avg       0.95      0.95      0.95      3435



In [21]:
precision_score(y_test, y_pred)

0.9567099567099567

In [22]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_train, y_train)
y_pred = clf_tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      1777
           1       0.90      0.90      0.90      1658

    accuracy                           0.90      3435
   macro avg       0.90      0.90      0.90      3435
weighted avg       0.90      0.90      0.90      3435



In [23]:
precision_score(y_test, y_pred)

0.8959952181709504

In [24]:
clf_svc = SVC()
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1777
           1       0.98      0.96      0.97      1658

    accuracy                           0.97      3435
   macro avg       0.97      0.97      0.97      3435
weighted avg       0.97      0.97      0.97      3435



In [25]:
precision_score(y_test, y_pred)

0.9766871165644172

In [26]:
log_reg = LogisticRegression()
params_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga'],
    'max_iter': [100, 200, 300]
}
grid_search = GridSearchCV(log_reg, params_grid, cv = 5, scoring = 'precision')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

{'C': 0.001,
 'fit_intercept': True,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'liblinear'}

In [27]:
log_reg = LogisticRegression(**best_params)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72      1777
           1       1.00      0.16      0.28      1658

    accuracy                           0.60      3435
   macro avg       0.78      0.58      0.50      3435
weighted avg       0.77      0.60      0.51      3435



In [28]:
precision_score(y_test, y_pred)

1.0

In [29]:
clf_tree = DecisionTreeClassifier()
params_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [42]
}
grid_search = GridSearchCV(clf_tree, params_grid, cv = 5, scoring = 'precision')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'random_state': 42,
 'splitter': 'best'}

In [30]:
clf_tree = DecisionTreeClassifier(**best_params)
clf_tree.fit(X_train, y_train)
y_pred = clf_tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.92      0.92      1777
           1       0.92      0.90      0.91      1658

    accuracy                           0.91      3435
   macro avg       0.91      0.91      0.91      3435
weighted avg       0.91      0.91      0.91      3435



In [31]:
precision_score(y_test, y_pred)

0.917841814837523

In [32]:
clf_svc = SVC()
params_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'gamma': ['scale', 'auto'] + [0.1, 1, 10],
    'shrinking': [True, False],
    'probability': [True, False],
    'random_state': [42]
}
grid_search = GridSearchCV(clf_svc, params_grid, cv = 5, scoring = 'precision')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
best_params

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

{'C': 1,
 'degree': 3,
 'gamma': 1,
 'kernel': 'poly',
 'probability': True,
 'random_state': 42,
 'shrinking': True}

In [33]:
clf_svc = SVC(**best_params)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      1.00      0.71      1777
           1       1.00      0.11      0.20      1658

    accuracy                           0.57      3435
   macro avg       0.77      0.56      0.45      3435
weighted avg       0.77      0.57      0.46      3435



In [34]:
precision_score(y_test, y_pred)

1.0