In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [32]:
#########################
# Data: Tweets / Weibos #
#########################

# ENGLISH
# Source: https://www.kaggle.com/datasets/tariqsays/sentiment-dataset-with-1-million-tweets
# Coverage: 09/22/2020 - 10/10/2022 // 937854 tweets
# Load the Kaggle export, discard the columns that are not used below, and
# rename the text column to "tweet" so it lines up with the UCI frame.
tweets_kaggle = (
    pd.read_csv("data/kaggle-20-22.csv")
    .drop(columns=["Language", "Label"])
    .rename(columns={"Text": "tweet"})
)


# https://archive.ics.uci.edu/ml/datasets/Health+News+in+Twitter // 2011-06-13 - 2015-04-09 // 62316 tweets
# Every "*.txt" file in the directory is a pipe-separated dump with three
# fields per line: tweet id, timestamp, tweet text.
directory = "data/uci-2015/"
uci_frames = []
# Sorted so the resulting row order is reproducible across runs/machines
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".txt"):
        continue
    try:
        df = pd.read_csv(
            os.path.join(directory, filename),
            sep="|",
            header=None,
            on_bad_lines="skip",  # lines with an unexpected number of fields are dropped
            engine="python",
        )
    except Exception as exc:
        # Fail loudly and keep the original cause attached instead of hiding
        # it behind a bare `except` + `assert False`.
        raise RuntimeError("Error reading file: " + filename) from exc
    df.columns = ["tweet_id", "date", "tweet"]
    df["date"] = pd.to_datetime(df["date"], format="%a %b %d %H:%M:%S %z %Y")
    uci_frames.append(df.drop(columns=["tweet_id"]))

# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# all previously loaded rows on every iteration.
tweets_uci = pd.concat(uci_frames) if uci_frames else pd.DataFrame()

# 1,000,170 tweets in total
tweets = pd.concat([tweets_kaggle, tweets_uci])

# CHINESE
# Source: https://github.com/brightmart/nlp_chinese_corpus (webtext2019zh dataset)
# Coverage: 2015 - 2016 // 41,000,000 posts
# The file is JSON Lines (one record per line). Only the first record seen for
# each "qid" is kept.
weibo = pd.read_json("data/webtext15-16.json", lines=True).drop_duplicates(
    subset=["qid"]
)

In [None]:
#######################
# Data: News Articles #
#######################

# ENGLISH
# Source: https://components.one/datasets/all-the-news-2-news-articles-dataset/
# Coverage: 2017-08-01 - 2018-02-01 // 2688878 articles
# "date" is parsed to datetime at load time; the "url" column is not needed.
en_news = pd.read_csv("data/all-the-news.csv", parse_dates=["date"]).drop(
    columns=["url"]
)

# CHINESE
# Source: https://www.kaggle.com/datasets/ceshine/yet-another-chinese-news-dataset
# Same treatment as the English set; "url" and "image" are discarded.
ch_news = pd.read_csv("data/ch-news.csv", parse_dates=["date"]).drop(
    columns=["url", "image"]
)

KeyError: Index(['qid'], dtype='object')

In [None]:
#####################
# Data: Pre-process #
#####################

# English search terms used to select relevant tweets and news articles.
# The order of the entries is kept exactly as originally specified.
en_keywords = [
    # conditions / symptoms
    "Alzheimer", "Dementia", "Ageing", "Memory loss",
    "Cognitive impairment", "Neurodegenerative disease",
    "Brain health", "Aging population", "Mild cognitive impairment",
    # biomarkers / diagnostics
    "Tau protein", "Beta-amyloid protein",
    "Brain imaging", "Neuropsychological testing",
    # care, risk and treatment
    "Caregiving", "Risk factors", "Genetics",
    "Lifestyle interventions", "Pharmacotherapy", "Rehabilitation",
    "Social support", "Quality of life", "Long-term care",
    "Epidemiology",
]

# Make sure the cache directory exists (no-op when it is already there).
os.makedirs("out", exist_ok=True)

print("Loading English data...")

# Single case-insensitive regex alternation over all English keywords,
# built once and shared by the tweet and news filters.
en_pattern = "|".join(en_keywords)

# Cache filtered tweets
if os.path.exists("out/relevant_tweets.csv"):
    print("Loading cached tweets...")
    # Parse "date" on reload so the cached frame has the same dtype as the
    # freshly filtered one (a plain read_csv would return strings).
    relevant_tweets = pd.read_csv("out/relevant_tweets.csv", parse_dates=["date"])
else:
    # 1987 relevant tweets
    print("Filtering tweets...")
    relevant_tweets = tweets[
        tweets["tweet"].str.contains(en_pattern, case=False, na=False)
    ]
    relevant_tweets.to_csv("out/relevant_tweets.csv", header=True, index=False)


# Cache filtered news articles
if os.path.exists("out/relevant_en_news.csv"):
    print("Loading cached news articles...")
    relevant_en_news = pd.read_csv("out/relevant_en_news.csv", parse_dates=["date"])
else:
    # 1533 relevant news articles
    print("Filtering news articles...")
    relevant_en_news = en_news[
        en_news["title"].str.contains(en_pattern, case=False, na=False)
    ]
    relevant_en_news.to_csv("out/relevant_en_news.csv", header=True, index=False)

# The two branches used to bind different names (`relevant_news` when cached,
# `relevant_en_news` when computed), so one of them was always undefined.
# Both names now refer to the same frame regardless of which branch ran.
relevant_news = relevant_en_news

In [None]:
# Chinese search terms used to select relevant weibo posts and news articles.
# The trailing comment on each entry gives the English term it stands for.
ch_keywords = [
    "阿尔茨海默",  # "Alzheimer"
    "失智",  # "Dementia"
    "老化",  # "Ageing"
    "减退",  # "Memory loss" -- NOTE(review): literally "decline", a broad term; confirm intended
    "认知障碍",  # "Cognitive impairment"
    "神经退化",  # "Neurodegenerative disease"
    "大脑健康",  # "Brain health"
    "人口老龄化",  # "Aging population"
    "轻度认知障碍",  # "Mild cognitive impairment"
    "Tau蛋白",  # "Tau protein"
    # The next two entries were previously fused into the single, never-matching
    # string "β-淀粉样蛋,大脑成像" (comma typed inside the literal followed by
    # implicit string concatenation).
    "β-淀粉样蛋白",  # "Beta-amyloid protein"
    "大脑成像",  # "Brain imaging"
    "神经心理学测试",  # "Neuropsychological testing"
    "照顾",  # "Caregiving"
    "风险因素",  # "Risk factors"
    "遗传",  # "Genetics"
    "药物",  # "Pharmacotherapy"
    "康复",  # "Rehabilitation"
    "社会支持",  # "Social support"
    "生活质量",  # "Quality of life"
    "长期护理",  # "Long-term care"
]

# Make sure the cache directory exists (no-op when it is already there).
os.makedirs("out", exist_ok=True)

print("Loading Chinese data...")

# Single regex alternation over all Chinese keywords, built once and shared
# by every filter below.
ch_pattern = "|".join(ch_keywords)

# Cache filtered weibos
if os.path.exists("out/relevant_weibo.csv"):
    print("Loading cached weibos...")
    relevant_weibo = pd.read_csv("out/relevant_weibo.csv")
else:
    # 72776 relevant weibos
    print("Filtering weibos...")
    # A post is kept when either its title or its description matches.
    relevant_weibo = weibo[
        weibo["title"].str.contains(ch_pattern, case=False, na=False)
        | weibo["desc"].str.contains(ch_pattern, case=False, na=False)
    ]
    relevant_weibo.to_csv("out/relevant_weibo.csv", header=True, index=False)


# Cache filtered news articles
if os.path.exists("out/relevant_ch_news.csv"):
    print("Loading cached news articles...")
    # Parse "date" on reload so the cached frame has the same dtype as the
    # freshly filtered one (ch_news is loaded with parse_dates=["date"]).
    relevant_ch_news = pd.read_csv("out/relevant_ch_news.csv", parse_dates=["date"])
else:
    # 467 relevant news articles
    print("Filtering news articles...")
    # An article is kept when either its title or its description matches.
    relevant_ch_news = ch_news[
        ch_news["title"].str.contains(ch_pattern, case=False, na=False)
        | ch_news["desc"].str.contains(ch_pattern, case=False, na=False)
    ]
    relevant_ch_news.to_csv("out/relevant_ch_news.csv", header=True, index=False)

Unnamed: 0,qid,title,desc,topic,star,content,answer_id,answerer_tags
299,29077330,相比初进门时，你收养的流浪动物如今有了哪些变化？,看着一只只原本受伤可怜的小动物在精心照顾下都变成了家里的“老大”：有可能是外表的变化，也有可...,宠物饲养,18,前年过年我朋友捡到一只狗奄奄一息 她没办法了 我有养过狗，说你抱来吧。看看能不能活 抱回来一...,44963240,
411,29077330,相比初进门时，你收养的流浪动物如今有了哪些变化？,看着一只只原本受伤可怜的小动物在精心照顾下都变成了家里的“老大”：有可能是外表的变化，也有可...,宠物饲养,4,先上图 我家芝麻，2016.7.5捡的，那是我放假第一天，一回家就跟麻麻说做梦都想养只猫。那...,114188435,猫奴进行时
577,43321220,如何对待三岁男孩喜欢摸妈妈的乳房这件事？,曾经，我有个小表弟，记得他都上小学了，总是去摸外婆的乳房，她妈妈不让摸。她妈妈也用提醒外婆不...,幼儿教育,6,谢邀。我儿子一岁多两岁的时候就不摸了。有时候逗他，他说才不要呢，人家都长大了！ 她老妈在房里...,95389544,问答社区人称晒娃狂魔
789,20690585,抑郁症的本质是什么？人为什么会得抑郁症？,听说治疗抑郁症必须要借助药物，但抑郁症是心理疾病，而药物的本质是化学。抑郁症等心理疾病，难道...,心理学,6,每个人的基因里如果有一些“糊涂”，但是这种糊涂基因又不会影响你的工作的话，那么这是一种有益的...,123042170,占星/哲学/神学/心理学
878,30948512,因为当年的嫁妆，我姐恨我全家，我作为弟弟该怎么办？,先说说基本情况吧，我是男生，今年18岁，高二在读，马上升高三。我有个同母异父的姐姐，今年30...,亲子,7,既然你这么不爱看大家指责你父母，那我单独问你吧。告诉你一个表哥被硬塞进我家住了四年的故事。虽...,50201929,
...,...,...,...,...,...,...,...,...
4120747,24636155,张飞得夏侯氏的场合以及前因后果是怎样的？,魏略说：夏后氏是建安五年在本郡出行樵采，为张飞所得。<br>从地点和事件来说，这很可能是一起...,三国,47,谢邀。这个有两种玩法。一种是开脑洞，就有无数种编法。我提供几种编法： 故事一：张飞出了名的会...,28469777,公众号：张佳玮写字的地方
4120913,30983096,孤独的人适合养什么宠物？,我是男生，在公司的驻外机构工作，常年一个人。平时除了工作上的合作伙伴，就很少有其他人交流了。...,生活,20,黄皮子大仙 1卷卷有一些不可抗拒的强迫症，比如它喜欢进我卧室，而我不准，一趁开门就钻进去，怎...,54076986,卖原单，公众号：纯原宝鉴
4121034,19656997,是否子女很容易重复父母的婚姻模式？为什么？怎么办？,之前看到国外有研究统计表明“在家庭暴力中长大的男孩，成年后虐待女性的几率反而比其他人高”。（...,心理学,3,我不太认同这种说法， 虽然有概率上的问题，我发现人们自我设定和心理暗示的力量实在太强大， 你...,12938554,Coder
4121222,41851663,从古至今中国人为什么重男轻女呢？,如果说现代人重男轻女是因为祖传的Y染色体，那古代人呢，他们不懂这些遗传学知识呀，又怎么出现的...,性别,4,谢邀。看到这个问题第一反应就是Ortner的文章，'Is Female to Male as...,92581335,Et in Arcadia ego


In [None]:
# Distribution of tweets and news articles over time

# Explicit figure/axes so both histograms are drawn on the same, known Axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Distribution of Dates")

for label, frame in (("Tweets", relevant_tweets), ("News articles", relevant_news)):
    # Normalise the column before plotting: frames reloaded from the CSV cache
    # may hold dates as strings, and rows without a date must be dropped.
    # Everything is converted to UTC and then made timezone-naive so that
    # both series share one comparable time axis.
    dates = (
        pd.to_datetime(frame["date"], errors="coerce", utc=True)
        .dropna()
        .dt.tz_localize(None)
    )
    if dates.empty:
        continue
    # Semi-transparent bars keep the first histogram visible under the second.
    ax.hist(dates, bins=100, alpha=0.6, label=label)

ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.legend()

# Display the plot
plt.show()