In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [3]:
#########################
# Data: Tweets / Weibos #
#########################

# ENGLISH
# https://www.kaggle.com/datasets/tariqsays/sentiment-dataset-with-1-million-tweets // 09/22/2020 - 10/10/2022 // 937854 tweets
tweets_kaggle = pd.read_csv("data/kaggle-20-22.csv")
tweets_kaggle.drop(columns=["Language", "Label"], inplace=True)
tweets_kaggle = tweets_kaggle.rename(columns={"Text": "tweet"})


# https://archive.ics.uci.edu/ml/datasets/Health+News+in+Twitterw // 2011-06-13 - 2015-04-09 // 62316 tweets
tweets_uci = pd.DataFrame()
directory = "data/uci-2015/"
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        try:
            df = pd.read_csv(
                directory + filename,
                sep="|",
                header=None,
                on_bad_lines="skip",
                engine="python",
            )
        except:
            print("Error reading file: " + filename)
            assert False
        df.columns = ["tweet_id", "date", "tweet"]
        df["date"] = pd.to_datetime(df["date"], format="%a %b %d %H:%M:%S %z %Y")
        df["date"] = df["date"].dt.tz_convert(None)
        df.drop(columns=["tweet_id"], inplace=True)
        tweets_uci = pd.concat([tweets_uci, df])

# 1,000,170 tweets in total
tweets = pd.concat([tweets_kaggle, tweets_uci])

# CHINESE
# https://github.com/brightmart/nlp_chinese_corpus, webtext2019zh dataset // 2015 - 2016 // 41,000,000 posts
weibo = pd.read_json("data/webtext15-16.json", lines=True)
weibo.drop_duplicates(subset=["qid"], inplace=True)

In [8]:
#######################
# Data: News Articles #
#######################

# ENGLISH
# https://components.one/datasets/all-the-news-2-news-articles-dataset/ // 2017-08-01 - 2018-02-01 // 2688878 articles
en_news = pd.read_csv("data/all-the-news.csv", parse_dates=["date"])
# en_news["date"] = pd.to_datetime(en_news["date"], format="YYYY-MM-DD HH:MM:SS")
en_news.drop(columns=["url"], inplace=True)

# CHINESE
# https://www.kaggle.com/datasets/ceshine/yet-another-chinese-news-dataset
ch_news = pd.read_csv("data/ch-news.csv", parse_dates=["date"])
# ch_news["date"] = pd.to_datetime(ch_news["date"], format="YYYYMMDD")
ch_news.drop(columns=["url", "image"], inplace=True)


In [10]:
en_news["date"].dtype

dtype('O')

In [9]:
#####################
# Data: Pre-process #
#####################

en_keywords = [
    "Alzheimer",
    "Dementia",
    "Ageing",
    "Memory loss",
    "Cognitive impairment",
    "Neurodegenerative disease",
    "Brain health",
    "Aging population",
    "Mild cognitive impairment",
    "Tau protein",
    "Beta-amyloid protein",
    "Brain imaging",
    "Neuropsychological testing",
    "Caregiving",
    "Risk factors",
    "Genetics",
    "Lifestyle interventions",
    "Pharmacotherapy",
    "Rehabilitation",
    "Social support",
    "Quality of life",
    "Long-term care",
    "Epidemiology",
]

if not os.path.exists("out"):
    os.makedirs("out")
    
print("Loading English data...")

# Cache filtered tweets
if os.path.exists("out/relevant_tweets.xlsx"):
    print("Loading cached tweets...")
    relevant_tweets = pd.read_excel("out/relevant_tweets.xlsx")
else:
    # 1987 relevant tweets
    print("Filtering tweets...")
    relevant_tweets = tweets[
        tweets.tweet.str.contains("|".join(en_keywords), case=False, na=False)
    ]
    relevant_tweets.to_excel("out/relevant_tweets.xlsx", header=True, index=False)


# Cache filtered news articles
if os.path.exists("out/relevant_en_news.xlsx"):
    print("Loading cached news articles...")
    relevant_en_news = pd.read_excel("out/relevant_en_news.csv")
else:
    # 1533 relevant news articles
    print("Filtering news articles...")
    relevant_en_news = en_news[
        en_news.title.str.contains("|".join(en_keywords), case=False, na=False)
    ]
    relevant_en_news.to_excel("out/relevant_en_news.xlsx", header=True, index=False)

Loading English data...
Loading cached tweets...
Filtering news articles...


In [17]:
ch_keywords = [
    "阿尔茨海默",  # "Alzheimer"
    "失智",  # "Dementia"
    "老化",  # "Ageing"
    "减退",  # "Memory loss"
    "认知障碍",  # "Cognitive impairment"
    "神经退化",  # "Neurodegenerative disease"
    "大脑健康",  # "Brain health"
    "人口老龄化",  # "Aging population"
    "轻度认知障碍",  # "Mild cognitive impairment"
    "Tau蛋白",  # "Tau protein"
    "β-淀粉样蛋," "大脑成像",  # "Beta-amyloid protein"  # "Brain imaging"
    "神经心理学测试",  # "Neuropsychological testing"
    "照顾",  # "Caregiving"
    "风险因素",  # "Risk factors"
    "遗传",  # "Genetics"
    "药物",  # "Pharmacotherapy"
    "康复",  # "Rehabilitation"
    "社会支持",  # "Social support"
    "生活质量",  # "Quality of life"
    "长期护理",  # "Long-term care"
]

if not os.path.exists("out"):
    os.makedirs("out")

print("Loading Chinese data...")

# Cache filtered tweets
if os.path.exists("out/relevant_weibo.csv"):
    print("Loading cached weibos...")
    relevant_weibo = pd.read_csv("out/relevant_weibo.csv")
else:
    # 7985 relevant weibos
    print("Filtering weibos...")
    relevant_weibo = weibo[
        weibo.title.str.contains("|".join(ch_keywords), case=False, na=False)
        | weibo.desc.str.contains("|".join(ch_keywords), case=False, na=False)
    ]
    relevant_weibo.to_csv("out/relevant_weibo.csv", header=True, index=False, encoding="utf_8_sig")


# Cache filtered news articles
if os.path.exists("out/relevant_ch_news.csv"):
    print("Loading cached news articles...")
    relevant_ch_news = pd.read_csv("out/relevant_ch_news.csv")
else:
    # 467 relevant news articles
    print("Filtering news articles...")
    relevant_ch_news = ch_news[
        ch_news.title.str.contains("|".join(ch_keywords), case=False, na=False)
        | ch_news.desc.str.contains("|".join(ch_keywords), case=False, na=False)
    ]
    relevant_ch_news.to_csv("out/relevant_ch_news.csv", header=True, index=False, encoding="utf-8-sig")

Loading Chinese data...
Filtering weibos...
Filtering news articles...


In [21]:
# relevant_en_news    # 1533 relevant news articles
# relevant_tweets     # 1987 relevant tweets
# relevant_ch_news    # 467 relevant news articles
relevant_weibo      # 7985 relevant weibos

Unnamed: 0,qid,title,desc,topic,star,content,answer_id,answerer_tags
299,29077330,相比初进门时，你收养的流浪动物如今有了哪些变化？,看着一只只原本受伤可怜的小动物在精心照顾下都变成了家里的“老大”：有可能是外表的变化，也有可...,宠物饲养,18,前年过年我朋友捡到一只狗奄奄一息 她没办法了 我有养过狗，说你抱来吧。看看能不能活 抱回来一...,44963240,
577,43321220,如何对待三岁男孩喜欢摸妈妈的乳房这件事？,曾经，我有个小表弟，记得他都上小学了，总是去摸外婆的乳房，她妈妈不让摸。她妈妈也用提醒外婆不...,幼儿教育,6,谢邀。我儿子一岁多两岁的时候就不摸了。有时候逗他，他说才不要呢，人家都长大了！ 她老妈在房里...,95389544,问答社区人称晒娃狂魔
789,20690585,抑郁症的本质是什么？人为什么会得抑郁症？,听说治疗抑郁症必须要借助药物，但抑郁症是心理疾病，而药物的本质是化学。抑郁症等心理疾病，难道...,心理学,6,每个人的基因里如果有一些“糊涂”，但是这种糊涂基因又不会影响你的工作的话，那么这是一种有益的...,123042170,占星/哲学/神学/心理学
878,30948512,因为当年的嫁妆，我姐恨我全家，我作为弟弟该怎么办？,先说说基本情况吧，我是男生，今年18岁，高二在读，马上升高三。我有个同母异父的姐姐，今年30...,亲子,7,既然你这么不爱看大家指责你父母，那我单独问你吧。告诉你一个表哥被硬塞进我家住了四年的故事。虽...,50201929,
1041,24226804,如何自己一个人带孩子?,事情是这样的 今年十月孩子就要出生了 可孩子生出来由谁来带成为我们夫妻俩的烦恼了 由于各方面...,育儿,10,我月子之后上班之前都是自己带，上班之后也是晚上带小孩睡觉。我认为：要借助现代科学，具体看我的...,35019755,
...,...,...,...,...,...,...,...,...
4116942,30557532,父母感情不和会真的让子女对家庭婚姻生活失去信心吗?,我父母感情不好，经常吵架冷战，而且常以离婚来作为威胁。<br> 说说父母的性格吧，父亲勤劳踏...,生活,12,听说孙俪曾经因为父母婚姻问题决定做一名不婚主义者，直到后来遇到了邓超，才有了现在的“114”...,55478473,
4117097,31611036,目前的B超技术能诊断出哪些胎儿缺陷？有哪些疾病是容易漏检？有哪些是无法检测只有出生后才会发现？,一个新生儿妈妈，经历各种B超检查，显示正常，但出生婴儿耳朵畸形。类似的还有无手无脚。查了一些...,B超,6,谢谢邀请 首先新生儿是否残疾或有疾病会受到产前，产时和产后的影响。可以说即使产前做了百分百的...,52671913,特殊教育
4117103,34590028,无赖亲戚欠债不还反咬一口怎么办？,大家好，写下这篇文章实属无奈，想来微博向大家寻找解决办法，怎样对付欠钱不还的无赖，万分感谢<...,纠纷,3,两招： 1直接起诉。用银行流水单和他的微信记录来当做证据。2夜里带一桶汽油，从他家门缝里倒进...,59293621,男，智能机械人，图灵满分
4118331,31910555,能不能用药物、手术或其它方法提升弹钢琴的手指机能？,,钢琴,11,能 一，PHp基础机能训练 10分钟提升一 httpm.v.qq.compagentzn01...,53818699,


In [None]:
# Distribution of tweets and news articles over time

# Set figure size and title
plt.figure(figsize=(20, 10))
plt.title("Distribution of Dates")

# Create histogram of date column
plt.hist(relevant_en_news["date"], bins=10, label="English News")
plt.hist(relevant_ch_news["date"], bins=10, label="Chinese News")
plt.legend()

# Display the plot
plt.show()