In [1]:
from langdetect import detect
import json_lines

import pandas as pd

In [2]:
def _extract_full_text(tweet):
    """Return the untruncated text of a single tweet record.

    A retweet carries the original tweet under 'retweeted_status'; in that
    case the text is taken from the original tweet. For either kind of
    record the text under 'extended_tweet' -> 'full_text' is preferred,
    because the plain 'text' field may hold a truncated version.

    Raises:
        KeyError: if the record has neither an extended text nor 'text'.
    """
    source = tweet['retweeted_status'] if 'retweeted_status' in tweet else tweet
    try:
        return source['extended_tweet']['full_text']
    except (KeyError, TypeError):
        # No (usable) 'extended_tweet': the text has not been truncated.
        # Only lookup failures are handled here so that unrelated errors
        # (and KeyboardInterrupt) are no longer silently swallowed.
        return source['text']


def load_jsonl(file):
    """Load tweets from a JSON-lines file and keep only their id and text.

    Args:
        file: path of the JSON-lines file; it is opened in binary mode and
            parsed with ``json_lines.reader(..., broken=True)``.

    Returns:
        A list with one ``{'id': ..., 'text': ...}`` dict per record, in
        file order. 'text' is the untruncated text (see
        ``_extract_full_text``).

    Raises:
        KeyError: if a record lacks 'id' or any usable text field.

    Note:
        Filtering out non-English tweets with ``langdetect.detect`` was
        tried but found to be too slow, so no language filter is applied.
    """
    with open(file, 'rb') as f:
        return [
            {'id': tweet['id'], 'text': _extract_full_text(tweet)}
            for tweet in json_lines.reader(f, broken=True)
        ]

In [3]:
# Load the tweet dump into a DataFrame; load_jsonl returns one
# {'id', 'text'} dict per tweet, so the frame has the columns 'id' and 'text'.
tweets_pre = pd.DataFrame(load_jsonl('data/preGreta.jsonl'))

In [4]:
# Preview the first rows to sanity-check the loaded ids and texts.
tweets_pre.head()

Unnamed: 0,id,text
0,1006327664976617472,Our new paper in @NatureClimate shows that tem...
1,1006327640226189318,“The transition to accessible and clean energy...
2,1006327582235688961,"The federal government won't lead on climate, ..."
3,1006327581556084736,Extreme Hurricane Rainfall Expected to Increas...
4,1006327528397697025,"""UN Women: Women To Be Heard When Tackling Cli..."


In [5]:
# tag tweets
from resources.CMUTweetTagger import runtagger_parse

ARK_TWEET_NLP_PATH = 'java -XX:ParallelGCThreads=2 -Xmx500m -jar resources/ark-tweet-nlp-0.3.2/ark-tweet-nlp-0.3.2.jar'

tagged_tweets = runtagger_parse(tweets_pre['text'].values, run_tagger_cmd=ARK_TWEET_NLP_PATH)

In [6]:
# Inspect the tagger output for the first tweet: a list of
# (token, tag, confidence) tuples.
tagged_tweets[0]

[('Our', 'D', 0.9918),
 ('new', 'A', 0.998),
 ('paper', 'N', 0.9881),
 ('in', 'P', 0.9948),
 ('@NatureClimate', '@', 0.978),
 ('shows', 'V', 0.987),
 ('that', 'D', 0.8964),
 ('temperature', 'N', 0.9922),
 ('goals', 'N', 0.9671),
 ('alone', 'R', 0.7253),
 ('are', 'V', 0.9867),
 ('not', 'R', 0.9995),
 ('sufficient', 'A', 0.9398),
 ('for', 'P', 0.9987),
 ('understanding', 'V', 0.8635),
 ('future', 'A', 0.4472),
 ('#extremeweather', '#', 0.4919),
 ('-', ',', 0.8946),
 ('composition', 'N', 0.9941),
 ('limits', 'N', 0.9563),
 ('also', 'R', 0.9916),
 ('needed', 'V', 0.9335),
 ('@BristolUni', '@', 0.9985),
 ('@ecioxford', '@', 0.9984),
 ('@cabotinstitute', '@', 0.9984),
 ('@ssparrow01', '@', 0.9978),
 ('#climatechange', '#', 0.9048),
 ('https://t.co/nUhw7XtRcM', 'U', 0.9931)]

In [7]:
def create_words_df(df, tagged_tweets, keep_tags=("#", 'N', 'V', 'R', 'A')):
    """Build a long-format DataFrame with one row per kept token.

    Args:
        df: DataFrame with an 'id' column; row ``i`` (by position) must be
            the tweet that produced ``tagged_tweets[i]``.
        tagged_tweets: iterable with, per tweet, an iterable of
            ``(word, tag, confidence)`` tuples.
        keep_tags: tags whose tokens are kept. The default is the original
            hard-coded whitelist.

    Returns:
        DataFrame with the columns 'id', 'word' (lower-cased), 'tag' and
        'conf'. The columns are present even when no token is kept.

    Raises:
        IndexError: if ``tagged_tweets`` has more entries than ``df`` has rows.
    """
    columns = ['id', 'word', 'tag', 'conf']
    tweet_ids = df['id']
    tweets_words = list()
    for i, tags in enumerate(tagged_tweets):
        # Positional lookup: a label lookup (.loc) would pick the wrong row, or
        # fail, whenever df does not carry the default 0..n-1 index
        # (e.g. after filtering rows).
        tweet_id = tweet_ids.iloc[i]
        for word, tag, conf in tags:
            if tag in keep_tags:
                tweets_words.append({
                    'id': tweet_id,
                    'word': word.lower(),
                    'tag': tag,
                    'conf': conf
                })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(tweets_words, columns=columns)

In [8]:
# Flatten the tagged tweets into one row per kept token (columns: id, word,
# tag, conf), then preview the first rows.
words_pre = create_words_df(tweets_pre, tagged_tweets)
words_pre.head()

Unnamed: 0,id,word,tag,conf
0,1006327664976617472,new,A,0.998
1,1006327664976617472,paper,N,0.9881
2,1006327664976617472,shows,V,0.987
3,1006327664976617472,temperature,N,0.9922
4,1006327664976617472,goals,N,0.9671


In [9]:
import matplotlib.pyplot as plt

# Number of most frequent words shown in the chart.
TOP_N_WORDS = 50

# value_counts() sorts by descending frequency, so head() keeps the
# TOP_N_WORDS most common words.
word_counts = words_pre.word.value_counts().head(TOP_N_WORDS)

# Plot words count
fig, ax = plt.subplots(figsize=(15, 5))
ax.set_title('Words distribution')
ax.set_xlabel('Word')
ax.set_ylabel('Number of occurrences')
ax.bar(x=word_counts.index.tolist(), height=word_counts.tolist())
# Rotate the word labels so they do not overlap.
ax.tick_params(axis='x', rotation=90)
# Render the figure. The original called plt.plot() with no data, which draws
# nothing, and used `_ = ...` assignments that clobber IPython's `_`.
plt.show()