<a href="https://colab.research.google.com/github/JiakangChenBuff/fake-news-classifier/blob/main/fake_news_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Import Data and Helper Functions (may take a while) { display-mode: "form" }
import math
import os
import numpy as np
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from torchtext.vocab import GloVe

import pickle

import requests, io, zipfile
!wget -O data.zip 'https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%206%20-%2010%20(Projects)/Project%20-%20Fake%20News%20Detection/inspirit_fake_news_resources%20(1).zip'
!unzip data.zip

basepath = '.'

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Load the pickled (train, validation) splits from basepath.
# NOTE(review): pickle.load can execute arbitrary code while loading, so this
# is only appropriate when the downloaded archive is trusted.
with open(os.path.join(basepath, 'train_val_data.pkl'), 'rb') as f:
  train_data, val_data = pickle.load(f)
  
def get_description_from_html(html):
  """Return the page description declared in the HTML's <meta> tags.

  The lookups below are tried in order and the first matching tag wins.

  Args:
    html: Markup of the page as a string.

  Returns:
    The ``content`` attribute of the first matching <meta> tag, or '' when
    no tag matches or the matching tag has no (or an empty) ``content``.
  """
  soup = bs(html)
  lookups = (
      {'name': 'og:description'},
      {'property': 'description'},
      {'name': 'description'},
      # Open Graph metadata is normally written with the ``property``
      # attribute. It is tried last so that any page matched by one of the
      # earlier lookups keeps exactly the same result as before.
      {'property': 'og:description'},
  )
  for attrs in lookups:
    description_tag = soup.find('meta', attrs=attrs)
    if description_tag:
      return description_tag.get('content') or ''
  # If there is no description, return empty string.
  return ''

def get_descriptions_from_data(data):
  """Return a list with one description per datapoint, in input order.

  The HTML of each datapoint is read from index 1 and passed to
  get_description_from_html; iteration is wrapped in tqdm so a progress bar
  is shown.
  """
  return [get_description_from_html(entry[1]) for entry in tqdm(data)]

# Descriptions for both splits, extracted once up front and reused by the
# bag-of-words and GloVe featurizers below.
train_descriptions = get_descriptions_from_data(train_data)
val_descriptions = get_descriptions_from_data(val_data)

# Dimensionality requested for the GloVe vectors; also used as the row width
# in glove_transform_data_descriptions.
VEC_SIZE = 300
glove = GloVe(name='6B', dim=VEC_SIZE)

def train_model(X_train, y_train, X_val, y_val):
  """Fit and return a liblinear logistic-regression classifier.

  Only X_train and y_train are used for fitting; X_val and y_val are
  accepted but not read by this function.
  """
  classifier = LogisticRegression(solver='liblinear')
  classifier.fit(X_train, y_train)
  return classifier

def prepare_data(data, featurizer):
    """Turn raw datapoints into feature rows and labels.

    Args:
        data: Iterable of (url, html, label) triples.
        featurizer: Callable taking (url, lowercased html) and returning a
            dict that maps feature descriptions to numerical features.

    Returns:
        A tuple (X, y, feature_descriptions). X is a list holding one tuple
        of feature values per datapoint, y is the list of labels, and
        feature_descriptions is the tuple of dict keys produced for the last
        datapoint (an empty tuple when `data` is empty).
    """
    X = []
    y = []
    # Bound up front so that empty `data` returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    feature_descriptions = ()
    for url, html, label in data:
        # All text in the HTML is lowercased, so <p>Hello.</p> becomes
        # <p>hello.</p>; featurizers can then rely on lowercase HTML.
        html = html.lower()
        y.append(label)

        features = featurizer(url, html)

        # Keys are the descriptions, values the numerical features. Reading
        # them directly (rather than unpacking zip(*items)) also works when
        # the featurizer returns an empty dict.
        feature_descriptions = tuple(features)
        X.append(tuple(features.values()))

    return X, y, feature_descriptions
  
# Log-scaled occurrence count of a phrase/keyword in HTML: log(1 + count).
# Only the phrase is lowercased here; the HTML is searched exactly as given.
def get_normalized_count(html, phrase):
    needle = phrase.lower()
    occurrences = html.count(needle)
    return math.log(1 + occurrences)

# Builds the keyword-approach features for one (url, html) pair and returns
# them as a dictionary mapping plaintext feature descriptions to numerical
# features. Domain features come first, then the keyword counts.
def keyword_featurizer(url, html):
    # Suffixes tested against the end of the url, in feature order.
    domain_suffixes = ('.com', '.org', '.net', '.info', '.biz', '.ru',
                       '.co.uk', '.co', '.tv', '.news')
    keywords = ('trump', 'biden', 'clinton', 'sports', 'finance')

    features = {suffix + ' domain': url.endswith(suffix)
                for suffix in domain_suffixes}
    features.update(
        (keyword + ' keyword', get_normalized_count(html, keyword))
        for keyword in keywords
    )
    return features

# Keyword features and labels for both splits. feature_descriptions is
# assigned by both calls, so the value from the validation call is the one
# that remains bound afterwards.
keyword_X_train, y_train, feature_descriptions = prepare_data(train_data, keyword_featurizer)
keyword_X_val, y_val, feature_descriptions = prepare_data(val_data, keyword_featurizer)

# Bag-of-words vectorizer limited to 300 features.
vectorizer = CountVectorizer(max_features=300)

# The vocabulary is fitted on the training descriptions only.
vectorizer.fit(train_descriptions)

def vectorize_data_descriptions(data_descriptions, vectorizer):
  """Transform descriptions into a dense feature array.

  Args:
    data_descriptions: Sequence of description strings.
    vectorizer: Fitted object whose ``transform`` returns a sparse matrix.

  Returns:
    A 2-D numpy.ndarray with one row per description.
  """
  # toarray() yields a plain ndarray. todense() would yield np.matrix, which
  # is deprecated and would propagate through np.concatenate into the
  # combined feature matrix handed to the estimator.
  X = vectorizer.transform(data_descriptions).toarray()
  return X

# Bag-of-words features for both splits, using the vectorizer fitted above.
bow_X_train = vectorize_data_descriptions(train_descriptions, vectorizer)
bow_X_val = vectorize_data_descriptions(val_descriptions, vectorizer)

# Looks up the lowercased word in the GloVe vocabulary and returns its vector
# as a numpy array; returns None when the lookup raises KeyError.
def get_word_vector(word):
    try:
        index = glove.stoi[word.lower()]
        vector = glove.vectors[index].numpy()
    except KeyError:
        vector = None
    return vector

def glove_transform_data_descriptions(descriptions):
    """Return one averaged word vector per description.

    Row i of the (len(descriptions), VEC_SIZE) result is the mean of the
    vectors of the whitespace-separated words of descriptions[i] for which
    get_word_vector returns a vector. Rows with no such word stay all zeros.
    """
    X = np.zeros((len(descriptions), VEC_SIZE))
    # Each `row` is a view into X, so the in-place updates below write
    # straight into the result matrix.
    for row, text in zip(X, descriptions):
        vectors = [
            vec
            for vec in map(get_word_vector, text.strip().split())
            if vec is not None
        ]
        for vec in vectors:
            row += vec
        # Dividing the sum by the number of vectors added gives the average;
        # skipped when nothing was found to avoid dividing by zero.
        if vectors:
            row /= len(vectors)

    return X

# Averaged word-vector features for both splits.
glove_X_train = glove_transform_data_descriptions(train_descriptions)
glove_X_val = glove_transform_data_descriptions(val_descriptions)

def combine_features(X_list):
  """Join the feature matrices in X_list side by side (along axis 1)."""
  combined = np.concatenate(X_list, axis=1)
  return combined

# Keyword, bag-of-words and GloVe features concatenated column-wise, in that
# order, for each split.
combined_X_train = combine_features([keyword_X_train, bow_X_train, glove_X_train])
combined_X_val = combine_features([keyword_X_val, bow_X_val, glove_X_val])

--2022-12-13 07:30:00--  https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%206%20-%2010%20(Projects)/Project%20-%20Fake%20News%20Detection/inspirit_fake_news_resources%20(1).zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.2.128, 142.250.141.128, 2607:f8b0:4023:c0d::80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.2.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 109422100 (104M) [application/zip]
Saving to: ‘data.zip’


2022-12-13 07:30:00 (265 MB/s) - ‘data.zip’ saved [109422100/109422100]

Archive:  data.zip
replace train_val_data.pkl? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test_data.pkl? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


100%|██████████| 2002/2002 [02:35<00:00, 12.84it/s]
100%|██████████| 309/309 [00:23<00:00, 12.89it/s]
.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:40<00:00, 9829.19it/s] 


In [None]:
#@title Live Fake News Classification Demo { vertical-output: true }
import warnings
warnings.filterwarnings("ignore")

def get_data_pair(url):
  """Fetch a page and return ``(url_pretty, htmltext)``.

  Args:
    url: Address to fetch. 'http://' is prepended when it does not already
      start with 'http://' or 'https://'.

  Returns:
    A tuple of the address with its leading scheme prefix removed and the
    text of the response body. The text is '' when the request fails.
  """
  # Test for a complete scheme prefix: checking only for 'http' would treat
  # hosts such as 'httpbin.org' as already carrying a scheme.
  if not url.startswith(('http://', 'https://')):
    url = 'http://' + url

  url_pretty = url
  for scheme in ('http://', 'https://'):
    if url_pretty.startswith(scheme):
      url_pretty = url_pretty[len(scheme):]

  # Scrape website for HTML. This stays best effort (any request problem maps
  # to an empty page), but no longer swallows KeyboardInterrupt/SystemExit
  # the way a bare except does.
  try:
    htmltext = requests.get(url, timeout=10).text
  except (requests.RequestException, ValueError):
    htmltext = ''

  return url_pretty, htmltext

# Address to classify (exposed as a string form parameter).
curr_url = "www.nytimes.com" #@param {type:"string"}

# url is the address without its scheme prefix; html is '' when the fetch
# failed.
url, html = get_data_pair(curr_url)
  

# Converts a features dictionary (such as the output of *keyword_featurizer*)
# into a single-row float array, so it can be concatenated with the other
# feature matrices.
def dict_to_features(features_dict):
  values = np.array(list(features_dict.values())).astype('float')
  # Add a leading axis so the values form one row.
  return np.expand_dims(values, axis=0)
def featurize_data_pair(url, html):
  """Build the combined feature row for one live (url, html) pair.

  Args:
    url: Address passed to keyword_featurizer.
    html: Markup of the page as fetched.

  Returns:
    The result of combine_features applied to the keyword, bag-of-words and
    GloVe features of this single page.
  """
  # Approach 1 with keywords. prepare_data lowercases the HTML before calling
  # the featurizer and get_normalized_count lowercases only the keyword, so
  # the HTML is lowercased here as well; otherwise capitalised occurrences
  # would be counted during training but missed at prediction time.
  keyword_X = dict_to_features(keyword_featurizer(url, html.lower()))

  # The description is extracted from the unmodified HTML, as
  # get_descriptions_from_data does for the training data.
  description = get_description_from_html(html)

  # Approach 2 with bag of words
  bow_X = vectorize_data_descriptions([description], vectorizer)

  # Approach 3 with word vectors
  glove_X = glove_transform_data_descriptions([description])

  # Combining approaches
  return combine_features([keyword_X, bow_X, glove_X])

# Classify the fetched page. An empty html string is what get_data_pair
# returns when the request failed.
if html != '':
  curr_X = featurize_data_pair(url, html)

  # The classifier is fitted on the combined training features each time this
  # cell runs.
  model = train_model(combined_X_train, y_train, combined_X_val, y_val)

  # predict returns one label per row; there is a single row here.
  curr_y = model.predict(curr_X)[0]

  if curr_y < .5:
    print(curr_url, 'appears to be real.')
  else:
    print(curr_url, 'appears to be fake.')
else:
  print('That doesn\'t seem to be a valid url')

www.foxnews.com appears to be real.
