In [1]:
# install the necessary packages
!pip3 install requests_cache

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# import standard libraries
import io
import zipfile
import requests as rq
import requests_cache as rqc
import numpy as np
import pandas as pd

In [3]:
# report the versions of the numeric libraries this notebook runs against
environment_report = [
    "ENVIRONMENT INFORMATION",
    "Using numpy version %s" % np.__version__,
    "Using pandas version %s" % pd.__version__,
]
for report_line in environment_report:
  print(report_line)

ENVIRONMENT INFORMATION
Using numpy version 1.22.4
Using pandas version 1.5.3


In [4]:
# reads the data from a remote zip file
def get_data_from_remote_zip_file(file_url, file_name):
  """Download a zip archive and load one CSV member as a DataFrame.

  The archive is fetched through a requests_cache ``CachedSession``, so a
  repeated run may be served from the local cache instead of the network.

  Args:
    file_url: URL of the remote zip archive.
    file_name: name of the CSV member to read from the archive.

  Returns:
    The pandas DataFrame produced by ``pd.read_csv`` on the archive member.

  Raises:
    requests.HTTPError: if the server answers with a 4xx/5xx status.
    zipfile.BadZipFile: if the downloaded payload is not a zip archive.
    KeyError: if ``file_name`` is not a member of the archive.
  """
  # the context manager closes the session when the download is done
  with rqc.CachedSession() as session:
    response = session.get(file_url)
    # fail on HTTP errors here instead of handing an error page to zipfile,
    # which would otherwise surface as a misleading BadZipFile
    response.raise_for_status()
    binary_data = io.BytesIO(response.content)

  with zipfile.ZipFile(binary_data) as z:
    with z.open(file_name) as f:
      return pd.read_csv(f)

# gets the fake news dataframe from raw data
def get_fake_news_from_raw_data(raw_data):
  """Extract the cleaned title/text pairs of the rows labelled as fake.

  Rows whose "label" equals 0 are kept. Within those rows, any cell holding
  the literal string 'None' is treated as missing, and rows with a missing
  title or text are dropped. The index each surviving row had in
  ``raw_data`` is kept in the "original_index" column.

  Args:
    raw_data: DataFrame with "title", "text" and "label" columns.

  Returns:
    A new DataFrame with columns "original_index", "title", "text" and a
    fresh positional index; ``raw_data`` itself is not modified.
  """
  fake_mask = raw_data["label"] == 0

  # select only the fake rows and the two content columns in one step
  fake_rows = raw_data.loc[fake_mask, ["title", "text"]]

  # the placeholder string 'None' counts as a missing value
  cleaned_rows = fake_rows.replace(to_replace='None', value=np.nan).dropna()

  # reset_index moves the original row labels into the first column
  result = cleaned_rows.reset_index()
  result.columns = ["original_index", "title", "text"]

  return result

In [5]:
# location of the zipped fake news dataset and the CSV member inside it
FAKE_NEWS_URL = "https://github.com/INTERTECHNICA-BUSINESS-SOLUTIONS-SRL/NATO-Article-COVID-Fake-News-Content-Enhancement/raw/main/data/original/fake_news_dataset.zip"
FAKE_NEWS_FILE_NAME = "fake_news_dataset.csv"

# download the raw dataset, then keep only the cleaned fake news rows
raw_data = get_data_from_remote_zip_file(
    file_url=FAKE_NEWS_URL,
    file_name=FAKE_NEWS_FILE_NAME,
)
fake_news_data_frame = get_fake_news_from_raw_data(raw_data=raw_data)

In [6]:
# write the processed frame as a single CSV member inside a zip archive
fake_news_data_frame.to_csv(
    path_or_buf="fake_news_data_frame.zip",
    compression=dict(method="zip", archive_name="fake_news_data_frame.csv"),
)