# Diabetes on Twitter

In [3]:
import pandas as pd
from os.path import join
import numpy as np
import os

## Tweets from users who were just diagnosed with diabetes

### Query tweets from the Twitter v2 API

Search terms:  
* "I was just diagnosed with diabetes"
* "today I was diagnosed with diabetes"
* "I just learned I have diabetes"
* "learned I got diabetes"
* "heard I got diabetes" 
* "learned I have diabetes"
* "heard I have diabetes"
* "I was recently diagnosed with diabetes"
* "I recently learned I have diabetes"
* "I recently learned that I have diabetes"

In [None]:
# Query data from the Twitter API.
# The first command marks the query script as executable; the second runs it.
# NOTE(review): the script itself is not shown here — presumably it issues the
# search terms listed above and writes ../data/diagnosed_diabetes.jsonl, which
# the following conversion cell reads. Confirm against the script.
! chmod +x queries/diagnosed_diabetes.sh
! ./queries/diagnosed_diabetes.sh

In [None]:
# Convert the line-delimited JSON file (one API response per line) to a flat CSV
# using the twarc2 `csv` plugin.
#   --no-inline-referenced-tweets : do not emit referenced tweets as extra rows
#   --show-stats                  : print a summary of the conversion when done
! twarc2 csv --no-inline-referenced-tweets --show-stats  ../data/diagnosed_diabetes.jsonl ../data/diagnosed_diabetes.csv

In [None]:
# Convert the line-delimited JSON file to a flat CSV.
# NOTE(review): this cell is an exact duplicate of the conversion cell directly
# above it (same input, same output, same flags). Running it simply regenerates
# the same CSV; it can most likely be deleted.
! twarc2 csv --no-inline-referenced-tweets --show-stats  ../data/diagnosed_diabetes.jsonl ../data/diagnosed_diabetes.csv

### Clean the data

In [1]:
# Directory (relative to this notebook) that holds all input and output data files.
src = "../data"

In [4]:
# Columns from the twarc2 CSV export that are dropped before further processing.
unused_columns = [
    '__twarc.retrieved_at',
    '__twarc.url',
    '__twarc.version',
    'attachments.poll.duration_minutes',
    'attachments.poll.end_datetime',
    'attachments.poll.id',
    'attachments.poll.options',
    'attachments.poll.voting_status',
    'attachments.poll_ids',
    'author.entities.description.cashtags',
    'author.pinned_tweet_id',
    'author.profile_image_url',
    'author.protected',
    'in_reply_to_user.description',
    'in_reply_to_user.entities.description.cashtags',
    'in_reply_to_user.created_at',
    'in_reply_to_user.entities.description.hashtags',
    'in_reply_to_user.entities.description.mentions',
    'in_reply_to_user.entities.description.urls',
    'in_reply_to_user.entities.url.urls',
    'in_reply_to_user.id',
    'in_reply_to_user.location',
    'in_reply_to_user.name',
    'in_reply_to_user.pinned_tweet_id',
    'in_reply_to_user.profile_image_url',
    'in_reply_to_user.protected',
    'in_reply_to_user.public_metrics.followers_count',
    'in_reply_to_user.public_metrics.following_count',
    'in_reply_to_user.public_metrics.listed_count',
    'in_reply_to_user.public_metrics.tweet_count',
    'in_reply_to_user.url',
    'in_reply_to_user.username',
    'in_reply_to_user.verified',
    'in_reply_to_user.withheld.country_codes',
    'in_reply_to_user_id',
    'withheld.scope',
    'withheld.copyright',
    'withheld.country_codes',
    'reply_settings',
    'attachments.media_keys',
    'author.verified',
    'geo.coordinates.coordinates',
    'geo.coordinates.type',
    'geo.country',
    'geo.country_code',
    'geo.full_name',
    'geo.geo.bbox',
    'geo.geo.type',
    'geo.id',
    'geo.name',
    'geo.place_id',
    'geo.place_type',
    'entities.cashtags',
    'possibly_sensitive',
    'type',
    'author.withheld.country_codes',
    'author.entities.description.hashtags',
    'author.entities.description.mentions',
    'source',
    'author.entities.description.urls',
    'author.entities.url.urls',
]

# Load the flattened tweets (both timestamp columns parsed as datetimes) and
# discard the unused columns in a single chained expression.
data = (
    pd.read_csv(
        join(src, 'diagnosed_diabetes.csv'),
        parse_dates=['created_at', 'author.created_at'],
    )
    .drop(columns=unused_columns)
)

In [5]:
# Keep only original tweets: rows with no entry in 'referenced_tweets'.
is_original = data['referenced_tweets'].isna()
data = data.loc[is_original].reset_index(drop=True)

# Many users post about a diagnosis anniversary. A tweet is flagged as "recent"
# only if its text contains none of these (case-sensitive) substrings.
anniversary_markers = (
    'years ago',
    'yrs ago',
    'year ago',
    'YEARS AGO',
    'years today',
    'flashback',
)
data['recent'] = data['text'].apply(
    lambda text: not any(marker in text for marker in anniversary_markers)
)

# Split into anniversary ("past") and recent diagnoses; copies keep the two
# frames independent of `data`.
past = data.loc[data['recent'].eq(False)].copy()
recent = data.loc[data['recent'].eq(True)].copy()

n_past = len(past)
n_recent = len(recent)
print('{} tweets from past diabetes diagnoses'.format(n_past))
print('{} tweets from recent diabetes diagnoses'.format(n_recent))

315 tweets from past diabetes diagnoses
224 tweets from recent diabetes diagnoses


In [6]:
# Write the recent-diagnosis tweets in full, and their tweet IDs as a separate CSV.
clean_tweets_path = join(src, 'diagnosed_diabetes_clean.csv')
clean_ids_path = join(src, 'diagnosed_diabetes_clean_IDs.csv')
recent.to_csv(clean_tweets_path, index=False)
recent['id'].to_csv(clean_ids_path, index=False)

# Write the unique author IDs and usernames as plain text, one value per line.
unique_author_ids = recent['author.id'].unique()
unique_author_names = recent['author.username'].unique()
np.savetxt(join(src, 'diagnosed_user_IDs.txt'), unique_author_ids, fmt='%d')
np.savetxt(join(src, 'diagnosed_user_names.txt'), unique_author_names, fmt='%s')

## User profiles

### Query user timelines from the Twitter v2 API

In [None]:
# excludes replies and retweets
! chmod -x /queries/diagnosed_user_timelines.sh
! ./diagnosed_user_timelines.sh

### Convert JSONL files to CSV

In [36]:
! cat ../data/diagnosed_user_IDs_161_to_175.txt | xargs -i sh -c "twarc2 csv --no-inline-referenced-tweets --show-stats ../data/user_timelines/{}.jsonl > ../data/user_timelines_csv/{}.csv"

In [7]:
# Columns of interest that are read from each per-user timeline CSV.
cols = [
    # author profile
    'author.created_at',
    'author.description',
    'author.id',
    'author.location',
    'author.name',
    'author.pinned_tweet_id',
    'author.public_metrics.followers_count',
    'author.public_metrics.following_count',
    'author.public_metrics.listed_count',
    'author.public_metrics.tweet_count',
    'author.url',
    'author.username',
    # tweet content and metadata
    'context_annotations',
    'conversation_id',
    'created_at',
    'entities.annotations',
    'entities.cashtags',
    'entities.hashtags',
    'id',
    'lang',
    'possibly_sensitive',
    # tweet engagement
    'public_metrics.like_count',
    'public_metrics.quote_count',
    'public_metrics.reply_count',
    'public_metrics.retweet_count',
    'referenced_tweets',
    'text',
]

In [8]:
# Names of all entries in the directory holding the per-user timeline CSV files.
timelines_csv_dir = join(src, 'user_timelines_csv')
userfiles = os.listdir(timelines_csv_dir)

In [None]:
# Load every per-user timeline CSV and stack them into one DataFrame.
# - Only '*.csv' entries are read, so stray directory entries (e.g.
#   '.ipynb_checkpoints', '.DS_Store') no longer break the int() conversion.
# - The listing is sorted so the row order is reproducible across runs
#   (os.listdir returns entries in arbitrary order).
# - Frames are collected in a list and concatenated once, instead of growing
#   the DataFrame inside the loop, which copies all prior rows on every pass.
userfiles = sorted(
    f for f in os.listdir(join(src, 'user_timelines_csv')) if f.endswith('.csv')
)
timeline_frames = []
for i, f in enumerate(userfiles):
    print(i)
    # file names have the form '<user ID>.csv'
    user_ID = int(f.split('.')[0])
    timeline = pd.read_csv(join(src, 'user_timelines_csv', f), low_memory=False,
                           usecols=cols)
    timeline['user_ID'] = user_ID
    timeline_frames.append(timeline)

# pd.concat raises on an empty list, so fall back to an empty frame as before.
user_timelines = pd.concat(timeline_frames) if timeline_frames else pd.DataFrame()

In [45]:
# Write the combined timelines in full, then the tweet-ID column on its own.
timelines_path = join(src, 'twitter_user_timelines.csv')
timeline_ids_path = join(src, 'twitter_user_timelines_IDs.csv')
user_timelines.to_csv(timelines_path, index=False)
user_timelines['id'].to_csv(timeline_ids_path, index=False)