In [None]:
import os
import re

import pandas as pd

In [None]:
################
# Data: Tweets #
################

# Kaggle sentiment dataset:
# https://www.kaggle.com/datasets/tariqsays/sentiment-dataset-with-1-million-tweets // 09/22/2020 - 10/10/2022 // 937854 tweets
# Only the tweet text is kept: the "Language" and "Label" columns are removed
# and "Text" is renamed to "tweet" so it lines up with the UCI tweets frame.
tweets_kaggle = (
    pd.read_csv("data/kaggle-20-22.csv")
    .drop(columns=["Language", "Label"])
    .rename(columns={"Text": "tweet"})
)


# UCI "Health News in Twitter" dataset:
# https://archive.ics.uci.edu/ml/datasets/Health+News+in+Twitter // 2011-06-13 - 2015-04-09 // 62316 tweets
# Every ".txt" file in the directory is read as a pipe-separated, header-less
# table of (tweet_id, date, tweet); the per-file frames are combined once at
# the end into `tweets_uci` with columns "date" and "tweet".
directory = "data/uci-2015/"
uci_frames = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame reproducible between runs and machines.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    try:
        df = pd.read_csv(
            os.path.join(directory, filename),
            sep="|",
            header=None,
            # Lines the parser rejects are dropped instead of aborting the load.
            on_bad_lines="skip",
            engine="python",
        )
    except Exception as err:
        # Name the offending file but keep the original exception chained so
        # the real cause (encoding, parsing, permissions, ...) stays visible.
        raise RuntimeError("Error reading file: " + filename) from err
    df.columns = ["tweet_id", "date", "tweet"]
    # Timestamps follow the pattern given in `format` below, including a
    # UTC offset (%z).
    df["date"] = pd.to_datetime(df["date"], format="%a %b %d %H:%M:%S %z %Y")
    uci_frames.append(df.drop(columns=["tweet_id"]))

# A single concat avoids re-copying the accumulated frame on every iteration.
# With no matching files the result is an empty frame, as before.
tweets_uci = pd.concat(uci_frames) if uci_frames else pd.DataFrame()

# Stack both tweet sources into one frame (1,000,170 tweets in total).
# The original row labels of each source are kept, so index values repeat.
tweets = pd.concat(objs=[tweets_kaggle, tweets_uci])

In [None]:
#######################
# Data: News Articles #
#######################

# "All the News 2" article dataset:
# https://components.one/datasets/all-the-news-2-news-articles-dataset/ // 2017-08-01 - 2018-02-01 // 2688878 articles
# NOTE(review): the filtered articles displayed further down carry dates from
# 2016 through 2020, so the date range quoted above looks too narrow - confirm.
# The "date" column is parsed into datetimes while loading.
news = pd.read_csv(
    filepath_or_buffer="data/all-the-news.csv",
    parse_dates=["date"],
)

In [70]:
#####################
# Data: Pre-process #
#####################

# Search terms that mark a tweet or a news headline as relevant. They are
# matched as case-insensitive substrings.
keywords = [
    "Alzheimer",
    "Dementia",
    "Ageing",
    "Memory loss",
    "Cognitive impairment",
    "Neurodegenerative disease",
    "Brain health",
    "Aging population",
    "Mild cognitive impairment",
    "Tau protein",
    "Beta-amyloid protein",
    "Brain imaging",
    "Neuropsychological testing",
    "Caregiving",
    "Risk factors",
    "Genetics",
    "Lifestyle interventions",
    "Pharmacotherapy",
    "Rehabilitation",
    "Social support",
    "Quality of life",
    "Long-term care",
    "Epidemiology",
]

# One alternation pattern shared by both filters. Each keyword is escaped so
# it is always matched literally, even if a future keyword contains regex
# metacharacters such as "(", "+" or ".".
keyword_pattern = "|".join(re.escape(keyword) for keyword in keywords)

os.makedirs("out", exist_ok=True)


def _load_or_filter(cache_path, label, get_frame, text_column, pattern):
    """Return the keyword-filtered rows of a frame, using a CSV cache.

    Parameters
    ----------
    cache_path : str
        CSV file that stores the filtered rows.
    label : str
        Human-readable name used in the progress messages.
    get_frame : callable
        Zero-argument callable returning the full DataFrame. It is only
        called on a cache miss, so the (large) source frame does not need
        to be loaded in the kernel when the cache file already exists.
    text_column : str
        Name of the column searched for the keywords.
    pattern : str
        Regular expression matched case-insensitively against the column.

    Returns
    -------
    pandas.DataFrame
        Rows whose `text_column` matches `pattern`.

    Notes
    -----
    The cache is keyed only on the file's existence: delete the file after
    changing the keywords or the source data to force a re-filter.
    """
    if os.path.exists(cache_path):
        print(f"Loading cached {label}...")
        cached = pd.read_csv(cache_path)
        # CSV stores timestamps as text; convert them back so a cache hit
        # yields datetimes just like a fresh filter does.
        if "date" in cached.columns:
            cached["date"] = pd.to_datetime(cached["date"])
        return cached

    print(f"Filtering {label}...")
    frame = get_frame()
    # na=False treats missing text as "no match" instead of propagating NaN.
    matches = frame[text_column].str.contains(pattern, case=False, na=False)
    relevant = frame[matches]
    relevant.to_csv(cache_path, header=True, index=False)
    return relevant


# Cache filtered tweets
# 1987 relevant tweets
relevant_tweets = _load_or_filter(
    "out/relevant_tweets.csv", "tweets", lambda: tweets, "tweet", keyword_pattern
)

# Cache filtered news articles
# 1533 relevant news articles
relevant_news = _load_or_filter(
    "out/relevant_news.csv", "news articles", lambda: news, "title", keyword_pattern
)

Loading cached tweets...
Loading cached news articles...


In [71]:
# Show the filtered news articles as this cell's output; the notebook renders
# a truncated preview (first and last rows) rather than the whole frame.
relevant_news

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2018-07-24 12:26:00,2018,7.0,24,Julia Belluz,Dementia prevention: 9 behaviors could cut you...,Dementia has long been thought of as an inevit...,https://www.vox.com/health-care/2017/7/31/1604...,,Vox
1,2016-02-23 13:30:02,2016,2.0,23,Clayton Aldern,Meet the scientist connecting the dots between...,"Originally published as part of Grist's ""Clima...",https://www.vox.com/2016/2/23/11094686/air-pol...,,Vox
2,2019-06-23 00:00:00,2019,6.0,23,"Hyun Young Yi, Hyonhee Shin",'Don't ask my age': Ageing South Koreans begin...,SEOUL (Reuters) - Boasting an overgrown beard ...,https://www.reuters.com/article/us-southkorea-...,Business News,Reuters
3,2018-07-25 22:35:54,2018,7.0,25,Julia Belluz,Dementia drug research: Biogen and Eisai unvei...,"Alzheimer’s is one of the deadliest, costliest...",https://www.vox.com/2018/7/25/17607376/dementi...,,Vox
4,2016-09-15 08:35:54,2016,9.0,15,Charles Clark,The cities with the best quality of life in th...,Global design and consultancy firm Arcadis has...,https://www.businessinsider.com/the-cities-wit...,,Business Insider
...,...,...,...,...,...,...,...,...,...,...
1528,2020-03-06 00:00:00,2020,3.0,6,,Vichy + Vichy Liftactiv Specialist Peptide-C A...,,https://www.refinery29.com/en-us/shop/product/...,,Refinery 29
1529,2020-03-30 00:00:00,2020,3.0,30,"By Sandee LaMotte, CNN",Heart disease and air pollution skyrockets ris...,(CNN)People with heart conditions who live in...,https://www.cnn.com/2020/03/30/health/air-poll...,health,CNN
1530,2020-03-18 15:39:26,2020,3.0,18,"Patricia Mazzei, Frances Robles, Audra D. S. B...",A Deadly Coronavirus Mix in Florida: An Aging ...,"In Florida, where a quarter of the population ...",https://www.nytimes.com/2020/03/18/us/coronavi...,us,The New York Times
1531,2020-03-27 10:30:17,2020,3.0,27,Nicholas Bakalar,Daily Aspirin Does Not Lower Alzheimer’s Risk,Older men and women who took low-dose aspirin ...,https://www.nytimes.com/2020/03/27/well/mind/b...,well,The New York Times
