In [1]:
import os
import math
import numpy as np
import pickle
import requests
import urllib.request
import io
import zipfile
import warnings

# html
from bs4 import BeautifulSoup as bs

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

In [2]:
with zipfile.ZipFile('data.zip', 'r') as zipper:
      zipper.extractall()

with open(os.path.join('.', 'train_val_data.pkl'), 'rb') as f:
      train_data, val_data = pickle.load(f)

with open(os.path.join('.','test_data.pkl'),'rb') as f:
    test_data = pickle.load(f)

  # natural language and vocab
import nltk
nltk.download('words')
from nltk.corpus import words
vocab = words.words()

y_train = [label for url, html, label in train_data]
y_val = [label for url, html, label in val_data]

warnings.warn('Data loaded.')

[nltk_data] Downloading package words to /home/azureuser/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [15]:
def load():

  # prepare data
  def prepare_data(data, featurizer, is_train):
      X = []
      for index, datapoint in enumerate(data):
          url, html, label = datapoint
          html = html.lower()

          features = featurizer(url, html)

          # Gets the keys of the dictionary as descriptions, gets the values as the numerical features.
          feature_descriptions, feature_values = zip(*features.items())

          X.append(feature_values)

      return X, feature_descriptions

  # train model
  def train_model(X_train, y_train):
      model = LogisticRegression(solver='liblinear')
      model.fit(X_train, y_train)

      return model

  # wrapper function for everything above
  def instantiate_model(compiled_featurizer, train_data, val_data):
      X_train, feature_descriptions = prepare_data(train_data, compiled_featurizer, True)
      X_val, _ = prepare_data(val_data, compiled_featurizer, False)
      X_test, feature_descriptions = prepare_data(test_data, compiled_featurizer, True)
      Y_test =[label for url, html, label in test_data]
      model = train_model(X_train, y_train)
      evaluate_model(model,X_test,Y_test)
      return model, X_train, X_val, feature_descriptions

  # a wrapper function that takes in named a list of keyword argument functions
  # each of those functions are given the URL and HTML and expected to return a list or dictionary with the appropriate features
  def create_featurizer(**featurizers):
      def featurizer(url, html):
          features = {}

          for group_name, featurizer in featurizers.items():
              group_features = featurizer(url, html)

              if type(group_features) == type([]):
                  for feature_name, feature_value in zip(range(len(group_features)), group_features):
                      features[group_name + ' [' + str(feature_name) + ']'] = feature_value
              elif type(group_features) == type({}):
                  for feature_name, feature_value in group_features.items():
                      features[group_name + ' [' + feature_name + ']'] = feature_value
              else:
                  features[group_name] = feature_value

          return features

      return featurizer

  # evaluate model
  def evaluate_model(model, X_val, y_val):
      y_val_pred = model.predict(X_val)

      print(print_metrics(y_val, y_val_pred))
      confusion_matrix(y_val, y_val_pred)

      return y_val_pred

  # confusion matrices
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt

  def plot_confusion_matrix(y_val, y_val_pred):
      # Create the Confusion Matrix
      cnf_matrix = confusion_matrix(y_val, y_val_pred)

      # Visualizing the Confusion Matrix
      class_names = [0, 1]  # Our diagnosis categories

      fig, ax = plt.subplots()
      # Setting up and visualizing the plot (do not worry about the code below!)
      tick_marks = np.arange(len(class_names))
      plt.xticks(tick_marks, class_names)
      plt.yticks(tick_marks, class_names)
      sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')  # Creating heatmap
      ax.xaxis.set_label_position('top')
      plt.tight_layout()
      plt.title('Confusion matrix', y=1.1)
      plt.ylabel('Actual Labels')
      plt.xlabel('Predicted Labels')

  # other metrics
  def print_metrics(y_val, y_val_pred):
      prf = precision_recall_fscore_support(y_val, y_val_pred)
      return {'Accuracy': accuracy_score(y_val, y_val_pred), 'Precision': prf[0][1], 'Recall': prf[1][1],
              'F-1 Score': prf[2][1]}

  # gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword to lowercase).
  def get_normalized_keyword_count(html, keyword):
      # only concern words inside the body, to speed things up
      try:
          necessary_html = html.split('<body')[1].split('</body>')[0]
      except:
          necessary_html = html  # if it doesn't have a body...

      return math.log(1 + necessary_html.count(keyword.lower()))  # log is a good normalizer

  # count the number of words in a URL
  def count_words_in_url(url):
      for i in range(len(url), 2, -1):  # don't count the first letter, because sometimes that might be a word by itself
          if url[:i].lower() in vocab:  # if it's a word
              return 1 + count_words_in_url(url[i:])  # get more words, and keep counting
      return 0  # no words in URL (or at least, it doesn't start with a word, such as NYTimes)

  def url_extension_featurizer(url, html):
      features = {}

      extensions = ['.com', '.org', '.edu', '.net', '.co', '.nz', '.media', '.za', '.fr', '.is', '.tv', '.press',
                    '.news', '.uk', '.info', '.ca', '.agency', '.us', '.ru', '.su', '.biz', '.ir']

      for extension in extensions:
          features[extension] = url.endswith(extension)

      return features

  def keyword_featurizer(url, html):
      features = {}

      keywords = ['vertical', 'news', 'section', 'light', 'data', 'eq', 'medium', 'large', 'ad', 'header', 'text', 'js',
                  'nav', 'analytics', 'article', 'menu', 'tv', 'cnn', 'button', 'icon', 'edition', 'span', 'item', 'label',
                  'link', 'world', 'politics', 'president', 'donald', 'business', 'food', 'tech', 'style', 'amp', 'vr',
                  'watch', 'search', 'list', 'media', 'wrapper', 'div', 'zn', 'card', 'var', 'prod', 'true', 'window', 'new',
                  'color', 'width', 'container', 'mobile', 'fixed', 'flex', 'aria', 'tablet', 'desktop', 'type', 'size',
                  'tracking', 'heading', 'logo', 'svg', 'path', 'fill', 'content', 'ul', 'li', 'shop', 'home', 'static',
                  'wrap', 'main', 'img', 'celebrity', 'lazy', 'image', 'high', 'noscript', 'inner', 'margin', 'headline',
                  'child', 'interest', 'john', 'movies', 'music', 'parents', 'real', 'warren', 'opens', 'share', 'people',
                  'max', 'min', 'state', 'event', 'story', 'click', 'time', 'trump', 'elizabeth', 'year', 'visit', 'post',
                  'public', 'module', 'latest', 'star', 'skip', 'imagesvc', 'posted', 'ltc', 'summer', 'square', 'solid',
                  'default', 'super', 'house', 'pride', 'week', 'america', 'man', 'day', 'wp', 'york', 'id', 'gallery',
                  'inside', 'calls', 'big', 'daughter', 'photo', 'joe', 'deal', 'app', 'special', 'source', 'red', 'table',
                  'money', 'family', 'featured', 'makes', 'pete', 'michael', 'video', 'case', 'says', 'popup', 'carousel',
                  'category', 'script', 'helvetica', 'feature', 'dark', 'extra', 'small', 'horizontal', 'bg', 'hierarchical',
                  'paginated', 'siblings', 'grid', 'active', 'demand', 'background', 'height', 'cn', 'cd', 'src', 'cnnnext',
                  'dam', 'report', 'trade', 'images', 'file', 'huawei', 'mueller', 'impeachment', 'retirement', 'tealium',
                  'col', 'immigration', 'china', 'flag', 'track', 'tariffs', 'sanders', 'staff', 'fn', 'srcset', 'green',
                  'orient', 'iran', 'morning', 'jun', 'debate', 'ocasio', 'cortez', 'voters', 'pelosi', 'barr', 'buttigieg',
                  'american', 'object', 'javascript', 'uppercase', 'omtr', 'chris', 'dn', 'hfs', 'rachel', 'maddow', 'lh',
                  'teasepicture', 'db', 'xl', 'articletitlesection', 'founders', 'mono', 'ttu', 'biden', 'boston', 'bold',
                  'anglerfish', 'jeffrey', 'radius']

      for keyword in keywords:
          features[keyword] = get_normalized_keyword_count(html, keyword)

      return features

  def url_word_count_featurizer(url, html):
      return count_words_in_url(url.split('.')[-2])
      # for example, www.google.com will return google and nytimes.com will return nytimes

  compiled_featurizer = create_featurizer(
      url_extension=url_extension_featurizer,
      keyword=keyword_featurizer,
      url_word_count=url_word_count_featurizer,
      html_length=lambda url, html: len(html),
      url_length=lambda url, html: len(url))

  print('Beginning to train model.')
  model, X_train, X_val, feature_descriptions = instantiate_model(compiled_featurizer, train_data, val_data)
  print('Trained model.')
    
  return model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data

model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data = load()


[nltk_data] Downloading package words to /home/azureuser/nltk_data...
[nltk_data]   Package words is already up-to-date!


Beginning to train model.
{'Accuracy': 0.8252032520325203, 'Precision': 0.7225806451612903, 'Recall': 1.0, 'F-1 Score': 0.8389513108614232}
Trained model.


In [3]:
def load():
  # prepare data
  def prepare_data(data, featurizer, is_train):
      X = []
      for index, datapoint in enumerate(data):
          url, html, label = datapoint
          html = html.lower()

          features = featurizer(url, html)

          # Gets the keys of the dictionary as descriptions, gets the values as the numerical features.
          feature_descriptions, feature_values = zip(*features.items())

          X.append(feature_values)

      return X, feature_descriptions

  # train model
  def train_model(X_train, y_train):
      model = LogisticRegression(multi_class='ovr' ,solver='sag')
      model.fit(X_train, y_train)

      return model

  # wrapper function for everything above
  def instantiate_model(compiled_featurizer, train_data, val_data):
      X_train, feature_descriptions = prepare_data(train_data, compiled_featurizer, True)
      X_val, _ = prepare_data(val_data, compiled_featurizer, False)
      X_test, feature_descriptions = prepare_data(test_data, compiled_featurizer, True)
      Y_test =[label for url, html, label in test_data]
      model = train_model(X_train, y_train)
      evaluate_model(model,X_test,Y_test)
      return model, X_train, X_val, feature_descriptions

  # a wrapper function that takes in named a list of keyword argument functions
  # each of those functions are given the URL and HTML and expected to return a list or dictionary with the appropriate features
  def create_featurizer(**featurizers):
      def featurizer(url, html):
          features = {}

          for group_name, featurizer in featurizers.items():
              group_features = featurizer(url, html)

              if type(group_features) == type([]):
                  for feature_name, feature_value in zip(range(len(group_features)), group_features):
                      features[group_name + ' [' + str(feature_name) + ']'] = feature_value
              elif type(group_features) == type({}):
                  for feature_name, feature_value in group_features.items():
                      features[group_name + ' [' + feature_name + ']'] = feature_value
              else:
                  features[group_name] = feature_value

          return features

      return featurizer

  # evaluate model
  def evaluate_model(model, X_val, y_val):
      y_val_pred = model.predict(X_val)

      print(print_metrics(y_val, y_val_pred))
      confusion_matrix(y_val, y_val_pred)

      return y_val_pred

  # confusion matrices
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt

  def plot_confusion_matrix(y_val, y_val_pred):
      # Create the Confusion Matrix
      cnf_matrix = confusion_matrix(y_val, y_val_pred)

      # Visualizing the Confusion Matrix
      class_names = [0, 1]  # Our diagnosis categories

      fig, ax = plt.subplots()
      # Setting up and visualizing the plot (do not worry about the code below!)
      tick_marks = np.arange(len(class_names))
      plt.xticks(tick_marks, class_names)
      plt.yticks(tick_marks, class_names)
      sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')  # Creating heatmap
      ax.xaxis.set_label_position('top')
      plt.tight_layout()
      plt.title('Confusion matrix', y=1.1)
      plt.ylabel('Actual Labels')
      plt.xlabel('Predicted Labels')

  # other metrics
  def print_metrics(y_val, y_val_pred):
      prf = precision_recall_fscore_support(y_val, y_val_pred)
      return {'Accuracy': accuracy_score(y_val, y_val_pred), 'Precision': prf[0][1], 'Recall': prf[1][1],
              'F-1 Score': prf[2][1]}

  # gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword to lowercase).
  def get_normalized_keyword_count(html, keyword):
      # only concern words inside the body, to speed things up
      try:
          necessary_html = html.split('<body')[1].split('</body>')[0]
      except:
          necessary_html = html  # if it doesn't have a body...

      return math.log(1 + necessary_html.count(keyword.lower()))  # log is a good normalizer

  # count the number of words in a URL
  def count_words_in_url(url):
      for i in range(len(url), 2, -1):  # don't count the first letter, because sometimes that might be a word by itself
          if url[:i].lower() in vocab:  # if it's a word
              return 1 + count_words_in_url(url[i:])  # get more words, and keep counting
      return 0  # no words in URL (or at least, it doesn't start with a word, such as NYTimes)

  def url_extension_featurizer(url, html):
      features = {}

      extensions = ['.com', '.org', '.edu', '.net', '.co', '.nz', '.media', '.za', '.fr', '.is', '.tv', '.press',
                    '.news', '.uk', '.info', '.ca', '.agency', '.us', '.ru', '.su', '.biz', '.ir']

      for extension in extensions:
          features[extension] = url.endswith(extension)

      return features

  def keyword_featurizer(url, html):
      features = {}

      keywords = ['vertical', 'news', 'section', 'light', 'data', 'eq', 'medium', 'large', 'ad', 'header', 'text', 'js',
                  'nav', 'analytics', 'article', 'menu', 'tv', 'cnn', 'button', 'icon', 'edition', 'span', 'item', 'label',
                  'link', 'world', 'politics', 'president', 'donald', 'business', 'food', 'tech', 'style', 'amp', 'vr',
                  'watch', 'search', 'list', 'media', 'wrapper', 'div', 'zn', 'card', 'var', 'prod', 'true', 'window', 'new',
                  'color', 'width', 'container', 'mobile', 'fixed', 'flex', 'aria', 'tablet', 'desktop', 'type', 'size',
                  'tracking', 'heading', 'logo', 'svg', 'path', 'fill', 'content', 'ul', 'li', 'shop', 'home', 'static',
                  'wrap', 'main', 'img', 'celebrity', 'lazy', 'image', 'high', 'noscript', 'inner', 'margin', 'headline',
                  'child', 'interest', 'john', 'movies', 'music', 'parents', 'real', 'warren', 'opens', 'share', 'people',
                  'max', 'min', 'state', 'event', 'story', 'click', 'time', 'trump', 'elizabeth', 'year', 'visit', 'post',
                  'public', 'module', 'latest', 'star', 'skip', 'imagesvc', 'posted', 'ltc', 'summer', 'square', 'solid',
                  'default', 'super', 'house', 'pride', 'week', 'america', 'man', 'day', 'wp', 'york', 'id', 'gallery',
                  'inside', 'calls', 'big', 'daughter', 'photo', 'joe', 'deal', 'app', 'special', 'source', 'red', 'table',
                  'money', 'family', 'featured', 'makes', 'pete', 'michael', 'video', 'case', 'says', 'popup', 'carousel',
                  'category', 'script', 'helvetica', 'feature', 'dark', 'extra', 'small', 'horizontal', 'bg', 'hierarchical',
                  'paginated', 'siblings', 'grid', 'active', 'demand', 'background', 'height', 'cn', 'cd', 'src', 'cnnnext',
                  'dam', 'report', 'trade', 'images', 'file', 'huawei', 'mueller', 'impeachment', 'retirement', 'tealium',
                  'col', 'immigration', 'china', 'flag', 'track', 'tariffs', 'sanders', 'staff', 'fn', 'srcset', 'green',
                  'orient', 'iran', 'morning', 'jun', 'debate', 'ocasio', 'cortez', 'voters', 'pelosi', 'barr', 'buttigieg',
                  'american', 'object', 'javascript', 'uppercase', 'omtr', 'chris', 'dn', 'hfs', 'rachel', 'maddow', 'lh',
                  'teasepicture', 'db', 'xl', 'articletitlesection', 'founders', 'mono', 'ttu', 'biden', 'boston', 'bold',
                  'anglerfish', 'jeffrey', 'radius']

      for keyword in keywords:
          features[keyword] = get_normalized_keyword_count(html, keyword)

      return features

  def url_word_count_featurizer(url, html):
      return count_words_in_url(url.split('.')[-2])
      # for example, www.google.com will return google and nytimes.com will return nytimes

  compiled_featurizer = create_featurizer(
      url_extension=url_extension_featurizer,
      keyword=keyword_featurizer,
      url_word_count=url_word_count_featurizer,
      html_length=lambda url, html: len(html),
      url_length=lambda url, html: len(url))

  print('Beginning to train model.')
  model, X_train, X_val, feature_descriptions = instantiate_model(compiled_featurizer, train_data, val_data)
  print('Trained model.')
    
  return model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data

model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data = load()


Beginning to train model.
{'Accuracy': 0.8252032520325203, 'Precision': 0.7225806451612903, 'Recall': 1.0, 'F-1 Score': 0.8389513108614232}
Trained model.




In [4]:
def load():
  # prepare data
  def prepare_data(data, featurizer, is_train):
      X = []
      for index, datapoint in enumerate(data):
          url, html, label = datapoint
          html = html.lower()

          features = featurizer(url, html)

          # Gets the keys of the dictionary as descriptions, gets the values as the numerical features.
          feature_descriptions, feature_values = zip(*features.items())

          X.append(feature_values)

      return X, feature_descriptions

  # train model
  def train_model(X_train, y_train):
      model = LogisticRegression(multi_class='ovr' ,solver='liblinear',n_jobs=9)
      model.fit(X_train, y_train)

      return model

  # wrapper function for everything above
  def instantiate_model(compiled_featurizer, train_data, val_data):
      X_train, feature_descriptions = prepare_data(train_data, compiled_featurizer, True)
      X_val, _ = prepare_data(val_data, compiled_featurizer, False)
      X_test, feature_descriptions = prepare_data(test_data, compiled_featurizer, True)
      Y_test =[label for url, html, label in test_data]
      model = train_model(X_train, y_train)
      evaluate_model(model,X_test,Y_test)
      return model, X_train, X_val, feature_descriptions

  # a wrapper function that takes in named a list of keyword argument functions
  # each of those functions are given the URL and HTML and expected to return a list or dictionary with the appropriate features
  def create_featurizer(**featurizers):
      def featurizer(url, html):
          features = {}

          for group_name, featurizer in featurizers.items():
              group_features = featurizer(url, html)

              if type(group_features) == type([]):
                  for feature_name, feature_value in zip(range(len(group_features)), group_features):
                      features[group_name + ' [' + str(feature_name) + ']'] = feature_value
              elif type(group_features) == type({}):
                  for feature_name, feature_value in group_features.items():
                      features[group_name + ' [' + feature_name + ']'] = feature_value
              else:
                  features[group_name] = feature_value

          return features

      return featurizer

  # evaluate model
  def evaluate_model(model, X_val, y_val):
      y_val_pred = model.predict(X_val)

      print(print_metrics(y_val, y_val_pred))
      confusion_matrix(y_val, y_val_pred)

      return y_val_pred

  # confusion matrices
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt

  def plot_confusion_matrix(y_val, y_val_pred):
      # Create the Confusion Matrix
      cnf_matrix = confusion_matrix(y_val, y_val_pred)

      # Visualizing the Confusion Matrix
      class_names = [0, 1]  # Our diagnosis categories

      fig, ax = plt.subplots()
      # Setting up and visualizing the plot (do not worry about the code below!)
      tick_marks = np.arange(len(class_names))
      plt.xticks(tick_marks, class_names)
      plt.yticks(tick_marks, class_names)
      sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')  # Creating heatmap
      ax.xaxis.set_label_position('top')
      plt.tight_layout()
      plt.title('Confusion matrix', y=1.1)
      plt.ylabel('Actual Labels')
      plt.xlabel('Predicted Labels')

  # other metrics
  def print_metrics(y_val, y_val_pred):
      prf = precision_recall_fscore_support(y_val, y_val_pred)
      return {'Accuracy': accuracy_score(y_val, y_val_pred), 'Precision': prf[0][1], 'Recall': prf[1][1],
              'F-1 Score': prf[2][1]}

  # gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword to lowercase).
  def get_normalized_keyword_count(html, keyword):
      # only concern words inside the body, to speed things up
      try:
          necessary_html = html.split('<body')[1].split('</body>')[0]
      except:
          necessary_html = html  # if it doesn't have a body...

      return math.log(1 + necessary_html.count(keyword.lower()))  # log is a good normalizer

  # count the number of words in a URL
  def count_words_in_url(url):
      for i in range(len(url), 2, -1):  # don't count the first letter, because sometimes that might be a word by itself
          if url[:i].lower() in vocab:  # if it's a word
              return 1 + count_words_in_url(url[i:])  # get more words, and keep counting
      return 0  # no words in URL (or at least, it doesn't start with a word, such as NYTimes)

  def url_extension_featurizer(url, html):
      features = {}

      extensions = ['.com', '.org', '.edu', '.net', '.co', '.nz', '.media', '.za', '.fr', '.is', '.tv', '.press',
                    '.news', '.uk', '.info', '.ca', '.agency', '.us', '.ru', '.su', '.biz', '.ir']

      for extension in extensions:
          features[extension] = url.endswith(extension)

      return features

  def keyword_featurizer(url, html):
      features = {}

      keywords = ['vertical', 'news', 'section', 'light', 'data', 'eq', 'medium', 'large', 'ad', 'header', 'text', 'js',
                  'nav', 'analytics', 'article', 'menu', 'tv', 'cnn', 'button', 'icon', 'edition', 'span', 'item', 'label',
                  'link', 'world', 'politics', 'president', 'donald', 'business', 'food', 'tech', 'style', 'amp', 'vr',
                  'watch', 'search', 'list', 'media', 'wrapper', 'div', 'zn', 'card', 'var', 'prod', 'true', 'window', 'new',
                  'color', 'width', 'container', 'mobile', 'fixed', 'flex', 'aria', 'tablet', 'desktop', 'type', 'size',
                  'tracking', 'heading', 'logo', 'svg', 'path', 'fill', 'content', 'ul', 'li', 'shop', 'home', 'static',
                  'wrap', 'main', 'img', 'celebrity', 'lazy', 'image', 'high', 'noscript', 'inner', 'margin', 'headline',
                  'child', 'interest', 'john', 'movies', 'music', 'parents', 'real', 'warren', 'opens', 'share', 'people',
                  'max', 'min', 'state', 'event', 'story', 'click', 'time', 'trump', 'elizabeth', 'year', 'visit', 'post',
                  'public', 'module', 'latest', 'star', 'skip', 'imagesvc', 'posted', 'ltc', 'summer', 'square', 'solid',
                  'default', 'super', 'house', 'pride', 'week', 'america', 'man', 'day', 'wp', 'york', 'id', 'gallery',
                  'inside', 'calls', 'big', 'daughter', 'photo', 'joe', 'deal', 'app', 'special', 'source', 'red', 'table',
                  'money', 'family', 'featured', 'makes', 'pete', 'michael', 'video', 'case', 'says', 'popup', 'carousel',
                  'category', 'script', 'helvetica', 'feature', 'dark', 'extra', 'small', 'horizontal', 'bg', 'hierarchical',
                  'paginated', 'siblings', 'grid', 'active', 'demand', 'background', 'height', 'cn', 'cd', 'src', 'cnnnext',
                  'dam', 'report', 'trade', 'images', 'file', 'huawei', 'mueller', 'impeachment', 'retirement', 'tealium',
                  'col', 'immigration', 'china', 'flag', 'track', 'tariffs', 'sanders', 'staff', 'fn', 'srcset', 'green',
                  'orient', 'iran', 'morning', 'jun', 'debate', 'ocasio', 'cortez', 'voters', 'pelosi', 'barr', 'buttigieg',
                  'american', 'object', 'javascript', 'uppercase', 'omtr', 'chris', 'dn', 'hfs', 'rachel', 'maddow', 'lh',
                  'teasepicture', 'db', 'xl', 'articletitlesection', 'founders', 'mono', 'ttu', 'biden', 'boston', 'bold',
                  'anglerfish', 'jeffrey', 'radius']

      for keyword in keywords:
          features[keyword] = get_normalized_keyword_count(html, keyword)

      return features

  def url_word_count_featurizer(url, html):
      return count_words_in_url(url.split('.')[-2])
      # for example, www.google.com will return google and nytimes.com will return nytimes

  compiled_featurizer = create_featurizer(
      url_extension=url_extension_featurizer,
      keyword=keyword_featurizer,
      url_word_count=url_word_count_featurizer,
      html_length=lambda url, html: len(html),
      url_length=lambda url, html: len(url))

  print('Beginning to train model.')
  model, X_train, X_val, feature_descriptions = instantiate_model(compiled_featurizer, train_data, val_data)
  print('Trained model.')
    
  return model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data

model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data = load()


Beginning to train model.
{'Accuracy': 0.8252032520325203, 'Precision': 0.7225806451612903, 'Recall': 1.0, 'F-1 Score': 0.8389513108614232}
Trained model.




In [5]:
def load():
  # prepare data
  def prepare_data(data, featurizer, is_train):
      X = []
      for index, datapoint in enumerate(data):
          url, html, label = datapoint
          html = html.lower()

          features = featurizer(url, html)

          # Gets the keys of the dictionary as descriptions, gets the values as the numerical features.
          feature_descriptions, feature_values = zip(*features.items())

          X.append(feature_values)

      return X, feature_descriptions

  from sklearn.naive_bayes import GaussianNB
  # train model
  def train_model(X_train, y_train):
      model = GaussianNB()
      model.fit(X_train, y_train)

      return model

  # wrapper function for everything above
  def instantiate_model(compiled_featurizer, train_data, val_data):
      X_train, feature_descriptions = prepare_data(train_data, compiled_featurizer, True)
      X_val, _ = prepare_data(val_data, compiled_featurizer, False)
      X_test, feature_descriptions = prepare_data(test_data, compiled_featurizer, True)
      Y_test =[label for url, html, label in test_data]
      model = train_model(X_train, y_train)
      evaluate_model(model,X_test,Y_test)
      return model, X_train, X_val, feature_descriptions

  # a wrapper function that takes in named a list of keyword argument functions
  # each of those functions are given the URL and HTML and expected to return a list or dictionary with the appropriate features
  def create_featurizer(**featurizers):
      def featurizer(url, html):
          features = {}

          for group_name, featurizer in featurizers.items():
              group_features = featurizer(url, html)

              if type(group_features) == type([]):
                  for feature_name, feature_value in zip(range(len(group_features)), group_features):
                      features[group_name + ' [' + str(feature_name) + ']'] = feature_value
              elif type(group_features) == type({}):
                  for feature_name, feature_value in group_features.items():
                      features[group_name + ' [' + feature_name + ']'] = feature_value
              else:
                  features[group_name] = feature_value

          return features

      return featurizer

  # evaluate model
  def evaluate_model(model, X_val, y_val):
      y_val_pred = model.predict(X_val)

      print(print_metrics(y_val, y_val_pred))
      confusion_matrix(y_val, y_val_pred)

      return y_val_pred

  # confusion matrices
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt

  def plot_confusion_matrix(y_val, y_val_pred):
      # Create the Confusion Matrix
      cnf_matrix = confusion_matrix(y_val, y_val_pred)

      # Visualizing the Confusion Matrix
      class_names = [0, 1]  # Our diagnosis categories

      fig, ax = plt.subplots()
      # Setting up and visualizing the plot (do not worry about the code below!)
      tick_marks = np.arange(len(class_names))
      plt.xticks(tick_marks, class_names)
      plt.yticks(tick_marks, class_names)
      sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')  # Creating heatmap
      ax.xaxis.set_label_position('top')
      plt.tight_layout()
      plt.title('Confusion matrix', y=1.1)
      plt.ylabel('Actual Labels')
      plt.xlabel('Predicted Labels')

  # other metrics
  def print_metrics(y_val, y_val_pred):
      prf = precision_recall_fscore_support(y_val, y_val_pred)
      return {'Accuracy': accuracy_score(y_val, y_val_pred), 'Precision': prf[0][1], 'Recall': prf[1][1],
              'F-1 Score': prf[2][1]}

  # gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword to lowercase).
  def get_normalized_keyword_count(html, keyword):
      # only concern words inside the body, to speed things up
      try:
          necessary_html = html.split('<body')[1].split('</body>')[0]
      except:
          necessary_html = html  # if it doesn't have a body...

      return math.log(1 + necessary_html.count(keyword.lower()))  # log is a good normalizer

  # count the number of words in a URL
  def count_words_in_url(url):
      for i in range(len(url), 2, -1):  # don't count the first letter, because sometimes that might be a word by itself
          if url[:i].lower() in vocab:  # if it's a word
              return 1 + count_words_in_url(url[i:])  # get more words, and keep counting
      return 0  # no words in URL (or at least, it doesn't start with a word, such as NYTimes)

  def url_extension_featurizer(url, html):
      features = {}

      extensions = ['.com', '.org', '.edu', '.net', '.co', '.nz', '.media', '.za', '.fr', '.is', '.tv', '.press',
                    '.news', '.uk', '.info', '.ca', '.agency', '.us', '.ru', '.su', '.biz', '.ir']

      for extension in extensions:
          features[extension] = url.endswith(extension)

      return features

  def keyword_featurizer(url, html):
      features = {}

      keywords = ['vertical', 'news', 'section', 'light', 'data', 'eq', 'medium', 'large', 'ad', 'header', 'text', 'js',
                  'nav', 'analytics', 'article', 'menu', 'tv', 'cnn', 'button', 'icon', 'edition', 'span', 'item', 'label',
                  'link', 'world', 'politics', 'president', 'donald', 'business', 'food', 'tech', 'style', 'amp', 'vr',
                  'watch', 'search', 'list', 'media', 'wrapper', 'div', 'zn', 'card', 'var', 'prod', 'true', 'window', 'new',
                  'color', 'width', 'container', 'mobile', 'fixed', 'flex', 'aria', 'tablet', 'desktop', 'type', 'size',
                  'tracking', 'heading', 'logo', 'svg', 'path', 'fill', 'content', 'ul', 'li', 'shop', 'home', 'static',
                  'wrap', 'main', 'img', 'celebrity', 'lazy', 'image', 'high', 'noscript', 'inner', 'margin', 'headline',
                  'child', 'interest', 'john', 'movies', 'music', 'parents', 'real', 'warren', 'opens', 'share', 'people',
                  'max', 'min', 'state', 'event', 'story', 'click', 'time', 'trump', 'elizabeth', 'year', 'visit', 'post',
                  'public', 'module', 'latest', 'star', 'skip', 'imagesvc', 'posted', 'ltc', 'summer', 'square', 'solid',
                  'default', 'super', 'house', 'pride', 'week', 'america', 'man', 'day', 'wp', 'york', 'id', 'gallery',
                  'inside', 'calls', 'big', 'daughter', 'photo', 'joe', 'deal', 'app', 'special', 'source', 'red', 'table',
                  'money', 'family', 'featured', 'makes', 'pete', 'michael', 'video', 'case', 'says', 'popup', 'carousel',
                  'category', 'script', 'helvetica', 'feature', 'dark', 'extra', 'small', 'horizontal', 'bg', 'hierarchical',
                  'paginated', 'siblings', 'grid', 'active', 'demand', 'background', 'height', 'cn', 'cd', 'src', 'cnnnext',
                  'dam', 'report', 'trade', 'images', 'file', 'huawei', 'mueller', 'impeachment', 'retirement', 'tealium',
                  'col', 'immigration', 'china', 'flag', 'track', 'tariffs', 'sanders', 'staff', 'fn', 'srcset', 'green',
                  'orient', 'iran', 'morning', 'jun', 'debate', 'ocasio', 'cortez', 'voters', 'pelosi', 'barr', 'buttigieg',
                  'american', 'object', 'javascript', 'uppercase', 'omtr', 'chris', 'dn', 'hfs', 'rachel', 'maddow', 'lh',
                  'teasepicture', 'db', 'xl', 'articletitlesection', 'founders', 'mono', 'ttu', 'biden', 'boston', 'bold',
                  'anglerfish', 'jeffrey', 'radius']

      for keyword in keywords:
          features[keyword] = get_normalized_keyword_count(html, keyword)

      return features

  def url_word_count_featurizer(url, html):
      return count_words_in_url(url.split('.')[-2])
      # for example, www.google.com will return google and nytimes.com will return nytimes

  compiled_featurizer = create_featurizer(
      url_extension=url_extension_featurizer,
      keyword=keyword_featurizer,
      url_word_count=url_word_count_featurizer,
      html_length=lambda url, html: len(html),
      url_length=lambda url, html: len(url))

  print('Beginning to train model.')
  model, X_train, X_val, feature_descriptions = instantiate_model(compiled_featurizer, train_data, val_data)
  print('Trained model.')
    
  return model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data

model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data = load()


Beginning to train model.
{'Accuracy': 0.6097560975609756, 'Precision': 0.5625, 'Recall': 0.6428571428571429, 'F-1 Score': 0.6000000000000001}
Trained model.


In [6]:
def load():
  # prepare data
  def prepare_data(data, featurizer, is_train):
      X = []
      for index, datapoint in enumerate(data):
          url, html, label = datapoint
          html = html.lower()

          features = featurizer(url, html)

          # Gets the keys of the dictionary as descriptions, gets the values as the numerical features.
          feature_descriptions, feature_values = zip(*features.items())

          X.append(feature_values)

      return X, feature_descriptions

  from sklearn.naive_bayes import GaussianNB
  # train model
  def train_model(X_train, y_train):
      model = GaussianNB()
      model.fit(X_train, y_train)

      return model

  # wrapper function for everything above
  def instantiate_model(compiled_featurizer, train_data, val_data):
      X_train, feature_descriptions = prepare_data(train_data, compiled_featurizer, True)
      X_val, _ = prepare_data(val_data, compiled_featurizer, False)
      X_test, feature_descriptions = prepare_data(test_data, compiled_featurizer, True)
      Y_test =[label for url, html, label in test_data]
      model = train_model(X_train, y_train)
      evaluate_model(model,X_test,Y_test)
      return model, X_train, X_val, feature_descriptions

  # a wrapper function that takes in named a list of keyword argument functions
  # each of those functions are given the URL and HTML and expected to return a list or dictionary with the appropriate features
  def create_featurizer(**featurizers):
      def featurizer(url, html):
          features = {}

          for group_name, featurizer in featurizers.items():
              group_features = featurizer(url, html)

              if type(group_features) == type([]):
                  for feature_name, feature_value in zip(range(len(group_features)), group_features):
                      features[group_name + ' [' + str(feature_name) + ']'] = feature_value
              elif type(group_features) == type({}):
                  for feature_name, feature_value in group_features.items():
                      features[group_name + ' [' + feature_name + ']'] = feature_value
              else:
                  features[group_name] = feature_value

          return features

      return featurizer

  # evaluate model
  def evaluate_model(model, X_val, y_val):
      y_val_pred = model.predict(X_val)

      print(print_metrics(y_val, y_val_pred))
      confusion_matrix(y_val, y_val_pred)

      return y_val_pred

  # confusion matrices
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt

  def plot_confusion_matrix(y_val, y_val_pred):
      # Create the Confusion Matrix
      cnf_matrix = confusion_matrix(y_val, y_val_pred)

      # Visualizing the Confusion Matrix
      class_names = [0, 1]  # Our diagnosis categories

      fig, ax = plt.subplots()
      # Setting up and visualizing the plot (do not worry about the code below!)
      tick_marks = np.arange(len(class_names))
      plt.xticks(tick_marks, class_names)
      plt.yticks(tick_marks, class_names)
      sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap='YlGnBu', fmt='g')  # Creating heatmap
      ax.xaxis.set_label_position('top')
      plt.tight_layout()
      plt.title('Confusion matrix', y=1.1)
      plt.ylabel('Actual Labels')
      plt.xlabel('Predicted Labels')

  # other metrics
  def print_metrics(y_val, y_val_pred):
      prf = precision_recall_fscore_support(y_val, y_val_pred)
      return {'Accuracy': accuracy_score(y_val, y_val_pred), 'Precision': prf[0][1], 'Recall': prf[1][1],
              'F-1 Score': prf[2][1]}

  # gets the log count of a phrase/keyword in HTML (transforming the phrase/keyword to lowercase).
  def get_normalized_keyword_count(html, keyword):
      # only concern words inside the body, to speed things up
      try:
          necessary_html = html.split('<body')[1].split('</body>')[0]
      except:
          necessary_html = html  # if it doesn't have a body...

      return math.log(1 + necessary_html.count(keyword.lower()))  # log is a good normalizer

  # count the number of words in a URL
  def count_words_in_url(url):
      for i in range(len(url), 2, -1):  # don't count the first letter, because sometimes that might be a word by itself
          if url[:i].lower() in vocab:  # if it's a word
              return 1 + count_words_in_url(url[i:])  # get more words, and keep counting
      return 0  # no words in URL (or at least, it doesn't start with a word, such as NYTimes)

  def url_extension_featurizer(url, html):
      features = {}

      extensions = ['.com', '.org', '.edu', '.net', '.co', '.nz', '.media', '.za', '.fr', '.is', '.tv', '.press',
                    '.news', '.uk', '.info', '.ca', '.agency', '.us', '.ru', '.su', '.biz', '.ir']

      for extension in extensions:
          features[extension] = url.endswith(extension)

      return features

  def keyword_featurizer(url, html):
      features = {}

      keywords = ['vertical', 'news', 'section', 'light', 'data', 'eq', 'medium', 'large', 'ad', 'header', 'text', 'js',
                  'nav', 'analytics', 'article', 'menu', 'tv', 'cnn', 'button', 'icon', 'edition', 'span', 'item', 'label',
                  'link', 'world', 'politics', 'president', 'donald', 'business', 'food', 'tech', 'style', 'amp', 'vr',
                  'watch', 'search', 'list', 'media', 'wrapper', 'div', 'zn', 'card', 'var', 'prod', 'true', 'window', 'new',
                  'color', 'width', 'container', 'mobile', 'fixed', 'flex', 'aria', 'tablet', 'desktop', 'type', 'size',
                  'tracking', 'heading', 'logo', 'svg', 'path', 'fill', 'content', 'ul', 'li', 'shop', 'home', 'static',
                  'wrap', 'main', 'img', 'celebrity', 'lazy', 'image', 'high', 'noscript', 'inner', 'margin', 'headline',
                  'child', 'interest', 'john', 'movies', 'music', 'parents', 'real', 'warren', 'opens', 'share', 'people',
                  'max', 'min', 'state', 'event', 'story', 'click', 'time', 'trump', 'elizabeth', 'year', 'visit', 'post',
                  'public', 'module', 'latest', 'star', 'skip', 'imagesvc', 'posted', 'ltc', 'summer', 'square', 'solid',
                  'default', 'super', 'house', 'pride', 'week', 'america', 'man', 'day', 'wp', 'york', 'id', 'gallery',
                  'inside', 'calls', 'big', 'daughter', 'photo', 'joe', 'deal', 'app', 'special', 'source', 'red', 'table',
                  'money', 'family', 'featured', 'makes', 'pete', 'michael', 'video', 'case', 'says', 'popup', 'carousel',
                  'category', 'script', 'helvetica', 'feature', 'dark', 'extra', 'small', 'horizontal', 'bg', 'hierarchical',
                  'paginated', 'siblings', 'grid', 'active', 'demand', 'background', 'height', 'cn', 'cd', 'src', 'cnnnext',
                  'dam', 'report', 'trade', 'images', 'file', 'huawei', 'mueller', 'impeachment', 'retirement', 'tealium',
                  'col', 'immigration', 'china', 'flag', 'track', 'tariffs', 'sanders', 'staff', 'fn', 'srcset', 'green',
                  'orient', 'iran', 'morning', 'jun', 'debate', 'ocasio', 'cortez', 'voters', 'pelosi', 'barr', 'buttigieg',
                  'american', 'object', 'javascript', 'uppercase', 'omtr', 'chris', 'dn', 'hfs', 'rachel', 'maddow', 'lh',
                  'teasepicture', 'db', 'xl', 'articletitlesection', 'founders', 'mono', 'ttu', 'biden', 'boston', 'bold',
                  'anglerfish', 'jeffrey', 'radius']

      for keyword in keywords:
          features[keyword] = get_normalized_keyword_count(html, keyword)

      return features

  def url_word_count_featurizer(url, html):
      return count_words_in_url(url.split('.')[-2])
      # for example, www.google.com will return google and nytimes.com will return nytimes

  compiled_featurizer = create_featurizer(
      url_extension=url_extension_featurizer,
      keyword=keyword_featurizer,
      url_word_count=url_word_count_featurizer,
      html_length=lambda url, html: len(html),
      url_length=lambda url, html: len(url))

  print('Beginning to train model.')
  model, X_train, X_val, feature_descriptions = instantiate_model(compiled_featurizer, train_data, val_data)
  print('Trained model.')
    
  return model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data

model, feature_descriptions, compiled_featurizer, requests, confusion_matrix, print_metrics, train_data, val_data = load()


Beginning to train model.
{'Accuracy': 0.6097560975609756, 'Precision': 0.5625, 'Recall': 0.6428571428571429, 'F-1 Score': 0.6000000000000001}
Trained model.
