# Dataset Preparation

## Imports

In [None]:
import os, glob, re
import pandas as pd
from sklearn.model_selection import train_test_split

## Constants

In [None]:
# constants

# Length threshold (in characters) used by the length filters in this notebook.
MIN_LENGTH = 40

# Root directory that holds one sub-directory per pipeline stage.
DATASET_PREFIX = '../dataset/'


def _stage_path(sub_dir):
    """Return the path of the stage directory `sub_dir` under DATASET_PREFIX."""
    return DATASET_PREFIX + sub_dir


ENGLISH_PATH = _stage_path('1_english/')
URLS_MENTIONS_REMOVED_PATH = _stage_path('2_filtered_urls_mentions/')
# the directory name embeds the threshold, e.g. 3_length_greater_than_40/
MIN_LENGTH_PATH = _stage_path('3_length_greater_than_{0}/'.format(MIN_LENGTH))
PREPROCESSED_PATH = _stage_path('4_pre-processed/')
PROCESSED_PATH = _stage_path('5_processed/')

## Load Data

In [None]:
def get_file_pairs(files_dir):
    """Load the ``Text`` column of every CSV file matching ``files_dir + '*.csv'``.

    Args:
        files_dir: Path prefix the ``*.csv`` glob is appended to; the callers in
            this notebook pass a directory path ending in ``/``.

    Returns:
        A list of ``(file_name, DataFrame)`` tuples ordered by path. Each frame
        contains only the ``Text`` column, cast to ``str``. The list is empty
        when no file matches.

    Raises:
        ValueError: Propagated from ``pd.read_csv`` when a file has no ``Text``
            column.
    """
    file_pairs = []

    # glob.glob returns matches in arbitrary order; sort so that every run
    # processes the files in the same sequence.
    for path in sorted(glob.glob(files_dir + '*.csv')):
        # os.path.basename handles the platform's separator, unlike split('/')
        file_name = os.path.basename(path)

        dF = pd.read_csv(path, usecols=['Text'])
        # force string dtype so the .str accessors used downstream always work
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))

        print('filename: {0}'.format(path))
        print('size:     {0}'.format(dF.size))
        print('----------------------------------------------------\n')

    return file_pairs

## Filter out URLs and @ Mentions

In [None]:
# Load the Text column of every CSV under ENGLISH_PATH as (file name, DataFrame) pairs.
english_file_pairs = get_file_pairs(ENGLISH_PATH)

In [None]:
# Strip URLs and @-mentions from every English file and write the cleaned
# copies to URLS_MENTIONS_REMOVED_PATH.
#
# NOTE(review): `.+` in the URL pattern is greedy, so the match runs from the
# URL to the last word boundary on the same line, i.e. text following a URL is
# removed as well. Left unchanged here; confirm this is intended.
regex_urls = r'(https?://.+\b\/?|.{3}\.twitter.+\b)'
regex_mentions = r'@\w+'
regex = re.compile(regex_urls)
mention_regex = re.compile(regex_mentions)

# exist_ok avoids the separate existence check
os.makedirs(URLS_MENTIONS_REMOVED_PATH, exist_ok=True)

for filename, frame in english_file_pairs:
    # work on a copy so the frames held in english_file_pairs stay untouched
    dF = frame.copy()
    output_file = URLS_MENTIONS_REMOVED_PATH + filename

    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would remove nothing.
    # remove URLs
    dF['Text'] = dF['Text'].str.replace(regex, ' ', regex=True)
    # remove mentions
    dF['Text'] = dF['Text'].str.replace(mention_regex, ' ', regex=True)
    # save
    dF.to_csv(output_file, index=False, columns=['Text'])
    print(dF, '\n----------------------------------------------------')

## Filter for MIN_LENGTH

In [None]:
# Reload the files written to URLS_MENTIONS_REMOVED_PATH as (file name, DataFrame) pairs.
url_filtered_file_pairs = get_file_pairs(URLS_MENTIONS_REMOVED_PATH)

In [None]:
# Keep only rows whose text is longer than MIN_LENGTH characters and write one
# filtered CSV per input file into MIN_LENGTH_PATH.
if not os.path.exists(MIN_LENGTH_PATH):
    os.makedirs(MIN_LENGTH_PATH)

for filename, frame in url_filtered_file_pairs:
    output_file = MIN_LENGTH_PATH + filename
    print(output_file)

    # boolean mask: True where the text has more than MIN_LENGTH characters
    is_long_enough = frame['Text'].str.len() > MIN_LENGTH
    dF = frame.loc[is_long_enough]

    # save
    dF.to_csv(output_file, index=False, columns=['Text'])

    print(dF, '\n----------------------------------------------------')

## Prepare Dataset
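- convert text to lower case
- replace punctuation with spaces (letters, digits, `_`, `#` and whitespace are kept)
- collapse runs of whitespace into a single space
- keep only tweets longer than MIN_LENGTH x 2 characters
- add class labels (taken from each file's name)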

In [None]:
# Load the length-filtered files from MIN_LENGTH_PATH as (file name, DataFrame) pairs.
file_pairs = get_file_pairs(MIN_LENGTH_PATH)

In [None]:
# Normalise the text of every file, drop short rows, attach the class label and
# write the result to PREPROCESSED_PATH.
os.makedirs(PREPROCESSED_PATH, exist_ok=True)

for file_name, frame in file_pairs:
    # the class label is the file name up to its '.csv' suffix
    label = file_name.split('.csv')[0]
    output_file = PREPROCESSED_PATH + file_name

    # convert to lower-case
    text = frame['Text'].str.lower()
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would leave the text as is.
    # remove punctuation (keeps letters, digits, '_', '#' and whitespace)
    text = text.str.replace(r'[^a-zA-Z_\s0-9#]', ' ', regex=True)
    # remove extra spaces
    text = text.str.replace(r'\s{2,}', ' ', regex=True)

    # assign() returns a new frame, leaving the one in file_pairs untouched
    dF = frame.assign(Text=text)
    # filter for len(tweets) > MIN_LENGTH x 2 to get more meaningful tweets;
    # .copy() so that adding the Label column does not write into a slice
    dF = dF[dF['Text'].str.len() > MIN_LENGTH * 2].copy()

    # add label
    dF['Label'] = label

    dF.to_csv(output_file, index=False, columns=['Label', 'Text'])

    print(dF)

In [None]:
# Print the number of rows in every pre-processed file.
# The files are read directly here because get_prefixed_file_pairs is only
# defined in a later cell, so calling it at this point fails with NameError
# when the notebook is run top to bottom.
print('==========FINAL DATASET STATS==========\n')
for path in sorted(glob.glob(PREPROCESSED_PATH + '*.csv')):
    stats_frame = pd.read_csv(path, usecols=['Label', 'Text'])
    print('{0}\t\tLength: {1}'.format(os.path.basename(path), len(stats_frame)))

### Create Dataset Files
- combine all pre-processed files into a single dataset file
- split it into train, validation, and test sets

#### dataset.csv

In [None]:
def get_prefixed_file_pairs(files_dir):
    """Load the ``Label`` and ``Text`` columns of every CSV matching ``files_dir + '*.csv'``.

    Args:
        files_dir: Path prefix the ``*.csv`` glob is appended to; the callers in
            this notebook pass a directory path ending in ``/``.

    Returns:
        A list of ``(file_name, DataFrame)`` tuples ordered by path. Each frame
        contains the ``Label`` and ``Text`` columns, with ``Text`` cast to
        ``str``. The list is empty when no file matches.

    Raises:
        ValueError: Propagated from ``pd.read_csv`` when a file lacks one of
            the two columns.
    """
    file_pairs = []

    # glob.glob returns matches in arbitrary order; sort so the frames (and
    # anything concatenated from them) come out in the same order on every run.
    for path in sorted(glob.glob(files_dir + '*.csv')):
        # os.path.basename handles the platform's separator, unlike split('/')
        file_name = os.path.basename(path)

        dF = pd.read_csv(path, usecols=['Label', 'Text'])
        # force string dtype so the .str accessors always work on Text
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))

        print('filename: {0}'.format(path))
        # DataFrame.size counts cells, i.e. rows x 2 columns here
        print('size:     {0}'.format(dF.size))
        print('----------------------------------------------------\n')

    return file_pairs

In [None]:
# Reload the pre-processed files with both Label and Text columns (rebinds file_pairs).
file_pairs = get_prefixed_file_pairs(PREPROCESSED_PATH)

In [None]:
# Stack all labelled frames into one DataFrame and store it as dataset.csv.
# Each frame is copied so the ones held in file_pairs are left alone.
frames_list = [frame.copy() for _, frame in file_pairs]

# rows keep the index they had in their source frame
bigDF = pd.concat(frames_list)

if not os.path.exists(PROCESSED_PATH):
    os.makedirs(PROCESSED_PATH)

dataset_csv = PROCESSED_PATH + 'dataset.csv'
bigDF.to_csv(dataset_csv, index=False, columns=['Label', 'Text'])

print(bigDF)

#### Train, Test, Val Split

In [None]:
# Read the combined dataset back from disk and report its number of rows.
input_file = PROCESSED_PATH + 'dataset.csv'
datasetFrame = pd.read_csv(input_file)
print('dataset length:', datasetFrame.shape[0])

In [None]:
# The same seed is used for both splits so the partition is repeatable.
split_seed = 18030010

# First hold out 20% of all rows as the test set ...
non_test_set, test_set = train_test_split(
    datasetFrame,
    test_size=0.2,
    random_state=split_seed,
)
# ... then take 15% of the remaining rows as the validation set.
train_set, val_set = train_test_split(
    non_test_set,
    test_size=0.15,
    random_state=split_seed,
)

# last expression of the cell: the notebook displays the three set sizes
len(train_set), len(val_set), len(test_set)

In [None]:
# Write each split to its own CSV under PROCESSED_PATH and echo it.
# Entries are (printed heading, output file name, frame), in train/val/test order.
split_outputs = (
    ('Train Set\n', 'train.csv', train_set),
    ('Val Set\n', 'val.csv', val_set),
    ('Test Set\n', 'test.csv', test_set),
)

for heading, csv_name, subset in split_outputs:
    subset.to_csv(PROCESSED_PATH + csv_name, index=False, columns=['Label', 'Text'])
    print(heading, subset)