In [None]:
!pip install textstat



In [None]:
import re as re

import numpy as np
import pandas as pd

import spacy
from nltk.stem import SnowballStemmer

from textblob import TextBlob

import textstat

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import hashlib

from pathlib import Path

# load the small English spaCy pipeline once; the same object is handed to the
# tokenization and sentence-count transformers when the pipeline is built
nlp = spacy.load("en_core_web_sm")

In [None]:
# --- input locations ---------------------------------------------------------
# spreadsheet holding the original news records
RAW_DATA_URL = "https://github.com/INTERTECHNICA-BUSINESS-SOLUTIONS-SRL/COVID-Fake-News-Analysis/raw/main/data/original/fake_new_dataset.xlsx"
# CSV holding the custom stop-word tokens
STOP_WORDS_DATA_URL = "https://github.com/INTERTECHNICA-BUSINESS-SOLUTIONS-SRL/COVID-Fake-News-Analysis/raw/main/data/support/stop_words_tokens.csv"

# --- output locations --------------------------------------------------------
# name given to the CSV member stored inside the archive
PROCESSED_DATA_FILE_NAME = "./COVIDFakeNewsProcessedData.csv"
# directory that receives the archive, and the archive's file name
ARCHIVED_DATA_FILE_PATH = "./data/processed"
ARCHIVED_DATA_FILE_NAME = "COVIDFakeNewsProcessedData.zip"

In [None]:
# load the news records; the first spreadsheet column is used as the row index
raw_data = pd.read_excel(RAW_DATA_URL, index_col=0)

# load the stop-word table (first row is the header, no index column)
# and keep only the values of its "tokens" column
stop_words_raw_data = pd.read_csv(
    STOP_WORDS_DATA_URL,
    index_col=None,
    header=0,
)
stop_words_data = stop_words_raw_data["tokens"].values

In [None]:
# creates numeric label values: 1 - false news, 0 - true news
# creates numeric subcategory values: 2 - false news, 1 - partially false news, 0 - true news
class LabelPreparationTransformer(BaseEstimator, TransformerMixin):
  """Replace the raw "label" and "subcategory" columns with numeric targets.

  The output frame carries two new columns, "binary_target" and
  "multi_target", and no longer contains "label" or "subcategory".
  """

  def binary_target_mapper(self, x):
    """Invert a raw label: 1 becomes 0, 0 becomes 1, anything else None."""
    if x == 1:
      return 0
    if x == 0:
      return 1
    return None

  def multi_target_mapper(self, x):
    """Map a subcategory name to its numeric code, or None when unknown."""
    if x == 'false news':
      return 2
    if x == 'partially false':
      return 1
    if x == 'true':
      return 0
    return None

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def transform(self, X, Y=None):
    """Return a new frame with the numeric targets; the input frame is not modified.

    Parameters:
      X: DataFrame with "label" and "subcategory" columns.
      Y: ignored.
    """
    binary_target = X["label"].apply(self.binary_target_mapper)
    multi_target = X["subcategory"].apply(self.multi_target_mapper)

    # drop() and assign() both return new frames, so the caller's data stays intact
    return X.drop(["label", "subcategory"], axis=1).assign(
        binary_target=binary_target,
        multi_target=multi_target,
    )

In [None]:
# concatenates title and text for generating the content
# minimal cleanup for empty values
class BasicPreparationTransformer(BaseEstimator, TransformerMixin):
  """Merge the "title" and "text" columns into a single "content" column.

  Missing titles/texts are treated as empty strings, the two parts are joined
  with one space and surrounding whitespace is stripped. The output frame has
  "content" and no longer has "title" or "text".
  """

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def transform(self, X, Y=None):
    """Return a new frame with the "content" column; the input frame is not modified.

    Parameters:
      X: DataFrame with "title" and "text" columns.
      Y: ignored.
    """
    # fillna() replaces the np.NaN-based replace(): the np.NaN alias no longer
    # exists in NumPy 2.x. astype(str) keeps the concatenation from failing on
    # a stray non-string cell (for example a purely numeric title).
    title = X["title"].fillna("").astype(str)
    text = X["text"].fillna("").astype(str)

    content = (title + " " + text).str.strip()

    return X.drop(["title", "text"], axis=1).assign(content=content)

In [None]:
# removes duplicate news from the dataset
class DuplicateRemoverTransformer(BaseEstimator, TransformerMixin):
  """Keep only the first record among records whose normalised content is equal.

  Two records count as duplicates when their "content" values are identical
  after lower-casing and removing every character other than a-z.
  """

  def __init__(self):
    super().__init__()
    # matches every character that is NOT a lowercase ASCII letter
    self._lowercase_letters_only = r'[^a-z]'

  def hash(self, x):
    """Return the SHA-1 hex digest of `x` reduced to its lowercase a-z letters.

    Note: texts that contain no ASCII letters all reduce to the empty string
    and therefore share one digest.
    """
    letters_only = re.sub(self._lowercase_letters_only, '', x.lower())
    return hashlib.sha1(letters_only.encode("utf-8")).hexdigest()

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def transform(self, X, Y=None):
    """Return a new frame without duplicate records; the input frame is not modified.

    Parameters:
      X: DataFrame with a "content" column of strings.
      Y: ignored.
    """
    content_hashes = X["content"].apply(self.hash)
    # positional boolean mask: True for the first occurrence of each digest
    is_first_occurrence = ~content_hashes.duplicated(keep="first").to_numpy()

    # no temporary "hash" column is written onto the caller's frame
    return X.loc[is_first_occurrence].copy()

In [None]:
# tokenizes the text
# removes stop words, numbers and other superfluous information
class TokenizationTransformer(BaseEstimator, TransformerMixin):
  """Tokenize the "content" column and add four space-joined token columns.

  Added columns (one string per record, built from the retained tokens):
    tokens_text_joined            - original token texts
    tokens_text_lemma_joined      - token lemmas
    tokens_pos_joined             - token part-of-speech tags
    tokens_text_processed_joined  - normalised and stemmed token texts

  Parameters:
    nlp_support: callable that turns a string into an iterable of tokens
      exposing the attributes read in `is_valid` plus `text`, `lemma_`, `pos_`.
    custom_stop_words_tokens: collection of processed tokens to discard.
  """

  def __init__(self, nlp_support, custom_stop_words_tokens):
    super().__init__()
    # constructor arguments are stored under their own names so that
    # get_params(), clone() and repr() of the estimator work
    self.nlp_support = nlp_support
    self.custom_stop_words_tokens = custom_stop_words_tokens
    # underscore aliases kept for code that reads the earlier attribute names
    self._nlp_support = nlp_support
    self._custom_stop_words_tokens = custom_stop_words_tokens

    self._covid_19_regexp = r"-19"
    self._covid_regexp = r"covid"
    self._number_regexp = r"[0-9]+"
    self._stemmer = SnowballStemmer("english")

  def is_valid(self, token):
    """Return True unless the token is flagged as noise by its lexical attributes."""
    # remove tokens based on their syntactic information
    is_invalid = (
        token.is_stop
        or token.is_punct
        or token.is_left_punct
        or token.is_right_punct
        or token.is_space
        or token.is_bracket
        or token.is_quote
        or token.is_currency
        or token.like_url
        or token.like_num
        or token.like_email
    )

    return not is_invalid

  def prepare(self, text, stop_words=None):
    """Normalise one token text; return None when nothing usable remains.

    The text is stripped, lower-cased, cleared of "-19" and of digit runs, and
    stemmed. None is returned when the result is empty or is a stop word.

    Parameters:
      text: raw token text.
      stop_words: optional container used for the stop-word test; defaults to
        the collection given to the constructor.
    """
    if stop_words is None:
      stop_words = self.custom_stop_words_tokens
      if stop_words is None:
        stop_words = ()

    prepared_text = text.strip().lower()
    prepared_text = re.sub(self._covid_19_regexp, "", prepared_text)
    prepared_text = re.sub(self._number_regexp, "", prepared_text)
    prepared_text = self._stemmer.stem(prepared_text)

    if not prepared_text:
      return None

    if prepared_text in stop_words:
      return None

    return prepared_text

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def transform(self, X, Y=None):
    """Return a new frame with the four token columns; the input frame is not modified.

    Parameters:
      X: DataFrame with a "content" column of strings.
      Y: ignored.
    """
    # build the lookup once per call so each membership test is a hash lookup
    # instead of a scan over the whole stop-word collection
    configured_stop_words = self.custom_stop_words_tokens
    stop_words = frozenset(configured_stop_words) if configured_stop_words is not None else frozenset()

    joined_text = []
    joined_lemma = []
    joined_pos = []
    joined_processed = []

    for content in X["content"]:
      tokens_text = []
      tokens_text_lemma = []
      tokens_text_pos = []
      tokens_text_processed = []

      for token in self.nlp_support(content):
        if not self.is_valid(token):
          continue
        prepared_text = self.prepare(token.text, stop_words)
        if prepared_text is None:
          continue
        # plain list appends: np.append would copy the whole array per token
        tokens_text.append(token.text)
        tokens_text_lemma.append(token.lemma_)
        tokens_text_pos.append(token.pos_)
        tokens_text_processed.append(prepared_text)

      joined_text.append(" ".join(tokens_text))
      joined_lemma.append(" ".join(tokens_text_lemma))
      joined_pos.append(" ".join(tokens_text_pos))
      joined_processed.append(" ".join(tokens_text_processed))

    # the columns are attached in one step, in the same order as before
    return X.assign(
        tokens_text_joined=joined_text,
        tokens_text_lemma_joined=joined_lemma,
        tokens_pos_joined=joined_pos,
        tokens_text_processed_joined=joined_processed,
    )

In [None]:
# determine the sentence count for news records
class SentencesCountTransformer(BaseEstimator, TransformerMixin):
  """Add a "sentences_count" column holding the number of sentences per record.

  Parameters:
    nlp_support: callable that turns a string into a document exposing a
      `sents` iterable.
  """

  def __init__(self, nlp_support):
    super().__init__()
    # stored under the constructor argument's own name so that get_params(),
    # clone() and repr() of the estimator work
    self.nlp_support = nlp_support
    # underscore alias kept for code that reads the earlier attribute name
    self._nlp_support = nlp_support

  def get_sentences_count(self, x):
    """Return how many sentences the NLP support finds in the text `x`."""
    doc = self.nlp_support(x)
    # count while iterating instead of materialising a list of sentences
    return sum(1 for _ in doc.sents)

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def transform(self, X, Y=None):
    """Return a new frame with "sentences_count"; the input frame is not modified.

    Parameters:
      X: DataFrame with a "content" column of strings.
      Y: ignored.
    """
    sentences_count = X["content"].apply(self.get_sentences_count)

    return X.assign(sentences_count=sentences_count)


In [None]:
# determine the sentiment for news records
class SentimentTransformer(BaseEstimator, TransformerMixin):
  """Add "sentiment_polarity" and "sentiment_subjectivity" columns from TextBlob."""

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def get_sentiment_subjectivity(self, x):
    """Return the TextBlob sentiment subjectivity of the text `x`."""
    return TextBlob(x).sentiment.subjectivity

  def get_sentiment_polarity(self, x):
    """Return the TextBlob sentiment polarity of the text `x`."""
    return TextBlob(x).sentiment.polarity

  def transform(self, X, Y=None):
    """Return a new frame with both sentiment columns; the input frame is not modified.

    Parameters:
      X: DataFrame with a "content" column of strings.
      Y: ignored.
    """
    # analyse every text once and read both values from the same result,
    # instead of building two TextBlob objects per record
    sentiments = [TextBlob(content).sentiment for content in X["content"]]

    return X.assign(
        sentiment_polarity=[sentiment.polarity for sentiment in sentiments],
        sentiment_subjectivity=[sentiment.subjectivity for sentiment in sentiments],
    )

In [None]:
# determine the readability score (Flesch reading ease) for news records
class ReadabilityScoreTransformer(BaseEstimator, TransformerMixin):
  """Add a "readability_score" column computed with textstat.flesch_reading_ease."""

  def fit(self, X, y=None):
    """Nothing to learn; `y` is accepted so the step works in `Pipeline.fit(X, y)`."""
    return self

  def get_readability_score(self, x):
    """Return the Flesch reading-ease score textstat computes for the text `x`."""
    return textstat.flesch_reading_ease(x)

  def transform(self, X, Y=None):
    """Return a new frame with "readability_score"; the input frame is not modified.

    Parameters:
      X: DataFrame with a "content" column of strings.
      Y: ignored.
    """
    readability_score = X["content"].apply(self.get_readability_score)

    return X.assign(readability_score=readability_score)

In [None]:
# create the data processing pipeline; the steps run in the order listed
transformation_pipeline = Pipeline([
    ("LabelPreparationTransformer", LabelPreparationTransformer()),
    ("BasicPreparationTransformer", BasicPreparationTransformer()),
    ("DuplicateRemoverTransformer", DuplicateRemoverTransformer()),
    ("TokenizationTransformer", TokenizationTransformer(nlp, stop_words_data)),
    ("SentencesCountTransformer", SentencesCountTransformer(nlp)),
    ("SentimentTransformer", SentimentTransformer()),
    ("ReadabilityScoreTransformer", ReadabilityScoreTransformer())
])

# process the pipeline on a copy of the raw data: a step that assigns columns
# to the frame it receives would otherwise modify raw_data, and re-running
# this cell would then start from an altered frame
processed_data = transformation_pipeline.fit_transform(raw_data.copy())

In [None]:
# data processing report: records kept by the pipeline and records it dropped
processed_records_count = processed_data.shape[0]
removed_records_count = raw_data.shape[0] - processed_records_count

print("Processed records: \t {}".format(processed_records_count))
print("Removed records: \t {}".format(removed_records_count))

Processed records: 	 3002
Removed records: 	 117


In [None]:
# write the processed data as a single CSV stored inside a zip archive
output_path = Path(ARCHIVED_DATA_FILE_PATH)
# create the target directory (and any missing parents) when it does not exist yet
output_path.mkdir(parents=True, exist_ok=True)

# archive_name is the name of the CSV member inside the zip file
compression_options = {
    "method": "zip",
    "archive_name": PROCESSED_DATA_FILE_NAME,
}
processed_data.to_csv(
    output_path / ARCHIVED_DATA_FILE_NAME,
    header=True,
    index=False,
    compression=compression_options,
)