In [1]:
import pandas as pd
import numpy as np

In [2]:
builders = []

def builder(func):
    builders.append(func)
    return func

In [3]:
@builder
def covid1():
    indata = pd.read_csv('./raw_data/covid/Corona_NPL_train.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['OriginalTweet']
    ds['Topic'] = ['covid'] * len(ds)
    return ds

In [4]:
@builder
def covid2():
    indata = pd.read_csv('./raw_data/covid/covidvaccine.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['covid'] * len(ds)
    return ds

In [5]:
import json

@builder 
def political1():
    df = pd.DataFrame(columns=['Tweet', 'Topic'],)
    for figure in ('biden', 'harris', 'pence', 'trump'):
        indata = pd.read_json(f'./raw_data/political/{figure}_timeline.json')
        # df['Tweet'] = df['Tweet'] + indata['text']
        df['Tweet'] = pd.concat([df['Tweet'],indata['text']],ignore_index=True)

    df['Topic'] = ['political'] * len(df)

    return df

In [6]:
@builder
def political2():
    indata = pd.read_csv('./raw_data/political/kenya_political_tweets.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['political'] * len(ds)
    return ds

In [7]:
@builder
def political3():
    indata = pd.read_csv('./raw_data/political/Political_tweets.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['political']*len(ds)
    return ds

In [8]:
@builder
def political4():
    indata = pd.read_csv('./raw_data/political/tweets.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['tweets']
    ds['Topic'] = ['political']*len(ds)
    ds['Tweet'].replace('ENGLISH TRANSLATION:', '', regex=True, inplace=True)
    return ds

In [9]:
@builder
def hate1():
    indata = pd.read_csv('./raw_data/hate/sexism_data.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    # ds['Topic'] = 
    ds['Topic'] = ['hate' if value else 'none' for value in indata['sexist']]
    MENTION_REGEX = r'MENTION\d*'
    ds['Tweet'].replace(MENTION_REGEX, '', regex=True, inplace=True)
    return ds

In [10]:
@builder
def hate2():
    indata = pd.read_csv('./raw_data/hate/labeled_data.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['tweet']
    ds['Topic'] = ['hate'] * len(ds)
    return ds

In [11]:
@builder
def hate3():
    indata = pd.read_csv('./raw_data/hate/TwitterHate.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['tweet']
    ds['Topic'] = ['hate' if label==1 else 'none' for label in indata['label']]
    return ds

In [12]:
# @builder
# def spam1():
#     indata = pd.read_csv('./raw_data/spam/train.csv')
#     ds = pd.DataFrame()
#     ds['Tweet'] = indata['Tweet']
#     ds['Topic'] = ['spam']*len(ds)
#     return ds

In [13]:
@builder
def finance1():
    indata = pd.read_csv('./raw_data/finance/stockerbot-export.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['finance']*len(ds)
    return ds

In [14]:
@builder
def finance2():
    indata = pd.read_csv('./raw_data/finance/labelled.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['Column3']
    ds['Topic'] = ['finance']*len(ds)
    return ds

In [15]:
@builder
def none1():
    indata = pd.read_csv('./raw_data/none/gender-classifier.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['none']*len(ds)
    return ds

In [16]:
@builder
def none2():
    return pd.read_csv('./niki/random-tweets.csv')[['Tweet', 'Topic']]

In [22]:
@builder
def tweetval():
    ds = pd.DataFrame()

    dirs = ['hate', 'emotion', 'irony', 'offensive', 'sentiment', 'stance/abortion' , 'stance/atheism' , 'stance/climate', 'stance/feminist', 'stance/hillary']
    mappings = [
        ['none','hate'],
        ['hate', 'none', 'none', 'none'],
        ['none', 'none'],
        ['none', 'hate'],
        ['none', 'none', 'none'],
        ['none', 'hate', 'none'],
        ['none', 'none', 'none'],
        ['none', 'political', 'none'],
        ['none', 'hate', 'none'],
        ['political', 'political', 'political'],
        ]

    tweet = []
    topic = []
    for i, path in enumerate(dirs):
        labels = open(f'./raw_data/tweetval/{path}/train_labels.txt', encoding='utf-8').read().split('\n');
        text = open(f'./raw_data/tweetval/{path}/train_text.txt', encoding='utf-8').read().split('\n');
        labels += open(f'./raw_data/tweetval/{path}/test_labels.txt', encoding='utf-8').read().split('\n');
        text += open(f'./raw_data/tweetval/{path}/test_text.txt', encoding='utf-8').read().split('\n');
        labels += open(f'./raw_data/tweetval/{path}/val_labels.txt', encoding='utf-8').read().split('\n');
        text += open(f'./raw_data/tweetval/{path}/val_text.txt', encoding='utf-8').read().split('\n');

        fixed_text = []
        fixed_labels = []
        for j in range(len(labels)):
            t = text[j]
            l = labels[j]

            if t and l:
                fixed_text.append(t)
                fixed_labels.append(mappings[i][int(l)])

        tweet.extend(fixed_text)
        topic.extend(fixed_labels)

    ds['Tweet'] = tweet
    ds['Topic'] = topic

    return ds

Unnamed: 0,Tweet,Topic
0,@user nice new signage. Are you not concerned ...,none
1,A woman who you fucked multiple times saying y...,hate
2,@user @user real talk do you have eyes or were...,hate
3,your girlfriend lookin at me like a groupie in...,hate
4,Hysterical woman like @user,none
...,...,...
100772,HillaryClinton Next up for legalization: Your ...,political
100773,You can't grow an economy from the bottom up. ...,political
100774,Too many women in too many countries speak the...,political
100775,"In case of Emergency, Push THIS BUTTON to scru...",political


In [18]:
df = pd.DataFrame(columns=['Tweet', 'Topic'])

for i, builder in enumerate(builders):
    print(f'\rBuilding {i+1}/{len(builders)}', end="")
    df = pd.concat([df, builder()])

print('\nDone building')

print('Removing links')
LINK_REGEX = r'http(s)?:\/\/\S*'
df['Tweet'].replace(LINK_REGEX, '', regex=True, inplace=True)

print('Removing hashtags')
df['Tweet'].replace('#', '', inplace=True)

print('Removing @s and RTs')
df['Tweet'].replace(r'@\S+', '', regex=True, inplace=True)
df['Tweet'].replace('RT', '', inplace=True)

print('Removing extra whitespace')
WHITESPACE_REGEX = r'[\s\n*]'
df['Tweet'].replace(WHITESPACE_REGEX, ' ', regex=True, inplace=True)

print('Removing special characters')
df['Tweet'].replace(r'[^\x00-\x7F]*', '', regex=True, inplace=True)

print('Removing short tweets')
MIN_LENGTH = 16
df = df[df['Tweet'].str.len() > MIN_LENGTH]

print('Making all lowercase')
df['Tweet'] = df['Tweet'].apply(lambda tweet: str(tweet).lower())

print('Dataset info:')
print(df['Topic'].value_counts())
print(df.head())

null_counts = df['Topic'].value_counts()['none']
print(f'Target accuracy: {1-(null_counts/len(df))}')
    

Building 14/14
Done building
Removing links
Removing hashtags
Removing @s and RTs
Removing extra whitespace
Removing special characters
Removing short tweets
Making all lowercase
Dataset info:
covid        409030
political    186534
none          70094
1             42815
0             33343
finance       32925
hate          27993
2             22518
3              1291
Name: Topic, dtype: int64
                                               Tweet  Topic
1  advice talk to your neighbours family to excha...  covid
2  coronavirus australia: woolworths to give elde...  covid
3  my food stock is not the only one which is emp...  covid
4  me, ready to go at supermarket during the #cov...  covid
5  as news of the regions first confirmed covid-1...  covid
Target accuracy: 0.9151961845905173


In [19]:
save_path = 'train-data.csv'
print(f'Saving to {save_path}')
df.to_csv(save_path)

Saving to train-data.csv


In [20]:
df['Topic'].value_counts().min()

1291