# Twitter Corpus 

By: Iris Luden
Last edited: March 2023

Corpus 1: 
- Start date: 7-2015
- End date: 4-2019 (included)

Corpus 2: 
- Start date:    5-2019
- End date: 2-2023 (included)

# Data 

The tweets have been collected using Loureiro et al. (2022): "TimeLMs: Diachronic Language models from Twitter"

Code can be found here: https://github.com/cardiffnlp/timelms

Changes were made only  to 

They are stored in data/cleaned/tweets-{yyyy}-{months specification}.cleaned.jl 



### Tweet cleaning 

- lower case
- tokenized using TweetTokenizer from NLTK
- stripped of most `string.punctuation` characters and the ”“ quotation marks

#### Capping number of retrieved tweets
Randomly remove some tweets from each year such that each month yields at most 52000 tweets.

The reason is that, when scraping with the API, some months return more tweets than others, and we want the months to remain roughly equal in size.

In [None]:
import os

import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

In [None]:
# Characters removed from both ends of a token: most ASCII punctuation plus the
# curly double quotes. The characters ! ' . ? are not in the set, so they are kept.
# Raw string: the backslash is a literal member of the set, not an escape.
_STRIP_CHARS = r'#"$%&()*+,-/:;<=>@[\]^_`{|}~”“'


def clean_word(word):
    ''' Lower-case a token and strip punctuation from its edges.

    Only leading and trailing characters are removed; punctuation inside the
    token (e.g. the hyphen in "a-b") is left untouched.

    word: the token to normalise (a string)
    returns: the normalised token, possibly the empty string '''
    return word.lower().strip(_STRIP_CHARS)
    
def cleanup_text(list_of_words):
    ''' Normalise every token of an already split text with clean_word
     and keep only the tokens that are not empty afterwards.
     Returns a new list, in the original token order. '''
    normalised = (clean_word(token) for token in list_of_words)
    return [token for token in normalised if token != '']


def collect_clean_twitter(filename, min_words=10):
    ''' Load a JSON-lines tweet file and return a cleaned, tokenized data frame.

    Steps:
    - drop tweets with fewer than min_words whitespace-separated terms
    - replace the ’ character with a plain apostrophe
    - tokenize with TweetTokenizer of NLTK
    - lower-case the tokens and strip punctuation (cleanup_text)
    - drop tweets with fewer than min_words tokens left after cleaning
    - register the month each tweet was posted in (column 'month')

    filename: path of the .jl file; the code reads its 'text', 'created_at'
        and 'username' fields
    min_words: minimum tweet length, applied before and after cleaning
    returns: the filtered frame with the extra columns 'Number of words',
        'text tokenized', 'text cleaned' and 'month', and without the
        'created_at' and 'username' columns '''

    # read data
    df = pd.read_json(filename, lines=True)
    print(f"There are {len(df)} tweets in total")

    # remove tweets that are too small - saves computation time
    df['Number of words'] = df['text'].map(lambda x: len(x.split()))
    # .copy() because columns are added to the filtered frame below; assigning
    # to a filtered slice triggers pandas' SettingWithCopyWarning
    df = df[df['Number of words'] >= min_words].copy()
    print(f'After removing tweets that are too small there are {len(df)} tweets left.')

    # tokenize the tweets, and replace ’ character to the normal one
    tokenizer = TweetTokenizer()
    df['text tokenized'] = df['text'].map(lambda x: tokenizer.tokenize(x.replace("’", "'")))
    print("Completed tokenizing the tweets")

    # clean the texts; the word count now refers to the cleaned tokens
    df['text cleaned'] = df['text tokenized'].map(cleanup_text)
    df['Number of words'] = df['text cleaned'].map(len)

    # remove the tweets that became too short once punctuation tokens were dropped
    df = df[df['Number of words'] >= min_words].copy()
    print(f'After removing tweets that are too small there are {len(df)} tweets left.')

    # collect monthly data
    # assumes read_json parsed 'created_at' into timestamps exposing .month - TODO confirm
    df['month'] = df['created_at'].map(lambda x: x.month)

    return df.drop(columns=['created_at', 'username'])

In [None]:
def cap_tweets_per_month(df, cap=52000, random_state=None):
    ''' Reduce the data frame to at most "cap" tweets per month.

    Months with more than cap rows are randomly down-sampled to exactly cap
    rows; smaller months are kept unchanged. All columns (including the
    tweet IDs) are preserved for the rows that remain.

    df: data frame with a 'month' column
    cap: maximum number of rows to keep per month
    random_state: optional seed passed to DataFrame.sample so that the
        selection can be reproduced; None keeps the sampling random
    returns: a new data frame, months in order of first appearance '''

    months = df['month'].unique()
    if len(months) == 0:
        # pd.concat raises ValueError on an empty list, so an empty frame
        # is returned as is
        return df.copy()

    all_dfs = []
    for month in months:
        df_month = df[df['month'] == month]
        print(len(df_month))
        if len(df_month) > cap:
            df_month = df_month.sample(n=cap, random_state=random_state)
            print(len(df_month))
            print()
        all_dfs.append(df_month)

    # rejoin the dataframes back together
    return pd.concat(all_dfs)

# Write the tweet sentences to the corpus files 

In the form: 

      sentence1 \n
      sentence2 \n 
      sentence3 \n
      ...


In [None]:
def write_to_file(df, outfile):
    ''' Write the cleaned tweets and their IDs to two parallel text files.

    The corpus file holds one tweet per line with tokens separated by single
    spaces (the layout gensim's PathLineSentences reads, used for LSCD).
    The ID file holds the matching tweet ID on the same line number.

    df: data frame with the columns 'text cleaned' (list of tokens) and 'id'
    outfile: corpus-relative path such as 'Twitter_C1/tweets-2015.txt'.
        The corpus is written to 'Twitter_Corpus/' + outfile; the IDs go to
        'Twitter_IDs/' + outfile[11:], i.e. the first 11 characters (the
        length of 'Twitter_C1/') are cut off. '''
    corpus_path = 'Twitter_Corpus/' + outfile
    ids_path = 'Twitter_IDs/' + outfile[11:]

    # make sure the target folders exist so open() does not fail on a fresh checkout
    for target in (corpus_path, ids_path):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(corpus_path, 'w', encoding='utf-8') as o:
        print(outfile)
        o.writelines(' '.join(text) + '\n' for text in df['text cleaned'])

    with open(ids_path, 'w', encoding='utf-8') as o2:
        # report the path that is really written (there is no 'ids_' prefix)
        print(ids_path)
        o2.writelines(str(ID) + '\n' for ID in df['id'])

    print("Completed writing", outfile)

In [None]:
# Clean and reduce the tweets of each year and month

for root, _, files in os.walk('data/cleaned'):
    for file in files:
        print(file)

        # clean; build the path from the folder os.walk is currently visiting,
        # so that files inside sub-folders resolve correctly as well
        df = collect_clean_twitter(os.path.join(root, file))

        # data sets at most 52000 tweets per month, randomly remove some.
        df_reduced = cap_tweets_per_month(df, cap=52000)

        # determine outfile: drop the 'tweets-' prefix (7 characters) and the
        # '.cleaned.jl' suffix (11 characters); a trailing '-all' is cut off too
        year = file[7:-11]
        if 'all' in year:
            year = year[:-4]
        # NOTE(review): the target folder is fixed to Twitter_C1 here - confirm
        # how the files of corpus 2 are meant to be routed
        outfile = f'Twitter_C1/tweets-{year}.txt'

        # writing is switched off; uncomment to save the reduced and cleaned file
#         write_to_file(df_reduced, outfile)

#### Count how many tweets are used per year 

In [None]:
# Print the number of stored tweet IDs for every 'tweets-{year}.txt' file
for path, _, files in os.walk('Twitter_IDs/'):
    # os.walk returns the names in arbitrary order; sort for a stable listing
    for filename in sorted(files):
        # select the ID files by name instead of skipping whichever entry
        # happens to come first (hidden/system files are ignored this way)
        if not (filename.startswith('tweets-') and filename.endswith('.txt')):
            continue
        year = filename[7:-4]
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as o:
            ids = o.readlines()
            print(year, len(ids))

In [None]:
# Tweet counts that make up each corpus
# (presumably one entry per calendar year - TODO confirm)
C1 = [288000, 568609, 566447, 569045, 190000]
C2 = [416000, 624000, 521669, 518255, 96084]

# total size of corpus 1 and corpus 2, printed on one line
corpus_totals = [sum(counts) for counts in (C1, C2)]
print(*corpus_totals)