In [1]:
import pandas as pd
import numpy as np

In [2]:
builders = []

def builder(func):
    builders.append(func)
    return func

In [3]:
@builder
def covid1():
    indata = pd.read_csv('./raw_data/covid/Corona_NPL_train.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['OriginalTweet']
    ds['Topic'] = ['covid'] * len(ds)
    return ds

In [4]:
@builder
def covid2():
    indata = pd.read_csv('./raw_data/covid/covidvaccine.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['covid'] * len(ds)
    return ds

In [5]:
import json

@builder 
def political1():
    df = pd.DataFrame(columns=['Tweet', 'Topic'],)
    for figure in ('biden', 'harris', 'pence', 'trump'):
        indata = pd.read_json(f'./raw_data/political/{figure}_timeline.json')
        # df['Tweet'] = df['Tweet'] + indata['text']
        df['Tweet'] = pd.concat([df['Tweet'],indata['text']],ignore_index=True)

    df['Topic'] = ['political'] * len(df)

    return df

In [6]:
@builder
def political2():
    indata = pd.read_csv('./raw_data/political/kenya_political_tweets.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['political'] * len(ds)
    return ds

In [7]:
@builder
def political3():
    indata = pd.read_csv('./raw_data/political/Political_tweets.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['political']*len(ds)
    return ds

In [8]:
@builder
def political4():
    indata = pd.read_csv('./raw_data/political/tweets.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['tweets']
    ds['Topic'] = ['political']*len(ds)
    ds['Tweet'].replace('ENGLISH TRANSLATION:', '', regex=True, inplace=True)
    return ds

In [9]:
@builder
def hate1():
    indata = pd.read_csv('./raw_data/hate/sexism_data.csv', low_memory=False)
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    # ds['Topic'] = 
    ds['Topic'] = ['hate' if value else 'none' for value in indata['sexist']]
    MENTION_REGEX = r'MENTION\d*'
    ds['Tweet'].replace(MENTION_REGEX, '', regex=True, inplace=True)
    return ds

In [10]:
@builder
def hate2():
    indata = pd.read_csv('./raw_data/hate/labeled_data.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['tweet']
    ds['Topic'] = ['hate'] * len(ds)
    return ds

In [11]:
@builder
def spam1():
    indata = pd.read_csv('./raw_data/spam/train.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['Tweet']
    ds['Topic'] = ['spam']*len(ds)
    return ds

In [12]:
@builder
def finance1():
    indata = pd.read_csv('./raw_data/finance/stockerbot-export.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['finance']*len(ds)
    return ds

In [13]:
@builder
def finance2():
    indata = pd.read_csv('./raw_data/finance/labelled.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['Column3']
    ds['Topic'] = ['finance']*len(ds)
    return ds

In [14]:
@builder
def none1():
    indata = pd.read_csv('./raw_data/none/gender-classifier.csv')
    ds = pd.DataFrame()
    ds['Tweet'] = indata['text']
    ds['Topic'] = ['none']*len(ds)
    return ds

In [19]:
# @builder
def none2():
    return pd.read_csv('./niki/random-tweets.csv')[['Tweet', 'Topic']]

6015

In [16]:
df = pd.DataFrame(columns=['Tweet', 'Topic'])

for i, builder in enumerate(builders):
    print(f'\rBuilding {i+1}/{len(builders)}', end="")
    df = pd.concat([df, builder()])

print('\nDone building')

print('Removing links')
LINK_REGEX = r'http(s)?:\/\/\S*'
df['Tweet'].replace(LINK_REGEX, '', regex=True, inplace=True)

print('Removing hashtags')
HASHTAG_REGEX = r'#\S*'
df['Tweet'].replace(HASHTAG_REGEX, '', regex=True, inplace=True)

print('Removing @s and RTs')
df['Tweet'].replace(r'@\S+', '', regex=True, inplace=True)
df['Tweet'].replace('RT', '', inplace=True)

print('Removing extra whitespace')
WHITESPACE_REGEX = r'[\s\n*]'
df['Tweet'].replace(WHITESPACE_REGEX, ' ', regex=True, inplace=True)

print('Removing special characters')
df['Tweet'].replace(r'[^\x00-\x7F]*', '', regex=True, inplace=True)

print('Removing short tweets')
MIN_LENGTH = 16
df = df[df['Tweet'].str.len() > MIN_LENGTH]

print('Making all lowercase')
df['Tweet'] = df['Tweet'].apply(lambda tweet: str(tweet).lower())

print('Dataset info:')
print(df['Topic'].value_counts())
print(df.head())
    

Building 13/13
Done building
Removing links
Removing hashtags
Removing @s and RTs
Removing extra whitespace
Removing special characters
Removing short tweets
Making all lowercase
Dataset info:
covid        403997
political    185553
none          36717
finance       32884
hate          25627
spam          11314
Name: Topic, dtype: int64
                                               Tweet  Topic
1  advice talk to your neighbours family to excha...  covid
2  coronavirus australia: woolworths to give elde...  covid
3  my food stock is not the only one which is emp...  covid
4  me, ready to go at supermarket during the  out...  covid
5  as news of the regions first confirmed covid-1...  covid


In [17]:
save_path = 'train-data.csv'
print(f'Saving to {save_path}')
df.to_csv(save_path)

Saving to train-data.csv


In [18]:
df['Topic'].value_counts().min()

11314