# Dataset Preparation

## Imports

In [52]:
import os, glob, re
import pandas as pd
from sklearn.model_selection import train_test_split

## Constants

In [53]:
# constants
MIN_LENGTH = 40

DATASET_PREFIX = '../dataset/'
ENGLISH_PATH = DATASET_PREFIX + '1_english/'
URLS_MENTIONS_REMOVED_PATH = DATASET_PREFIX + '2_filtered_urls_mentions/'
MIN_LENGTH_PATH = DATASET_PREFIX + '3_length_greater_than_{0}/'.format(MIN_LENGTH)
PREPROCESSED_PATH = DATASET_PREFIX + '4_pre-processed/'
PROCESSED_PATH = DATASET_PREFIX + '5_processed/'

## Load Data

In [54]:
def get_file_pairs(files_dir):
    files = glob.glob(files_dir + '*.csv')
    file_pairs = []

    for each in files:
        file_name = each.split('/')[-1:][0]

        dF = pd.read_csv(each, usecols=['Text'])
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))
        print('filename: {0}'.format(each))
        print('size:     {0}'.format(dF.size))
        print('----------------------------------------------------\n')
    
    return file_pairs

## Filter out URLs and @ Mentions

In [55]:
english_file_pairs = get_file_pairs(ENGLISH_PATH)

filename: ../dataset/1_english/imrankhanpti.csv
size:     5519
----------------------------------------------------

filename: ../dataset/1_english/bbhuttozardari.csv
size:     4339
----------------------------------------------------

filename: ../dataset/1_english/narendramodi.csv
size:     20173
----------------------------------------------------

filename: ../dataset/1_english/mjibrannasir.csv
size:     19275
----------------------------------------------------

filename: ../dataset/1_english/maryamnsharif.csv
size:     27681
----------------------------------------------------

filename: ../dataset/1_english/sherryrehman.csv
size:     18469
----------------------------------------------------

filename: ../dataset/1_english/marvimemon.csv
size:     5974
----------------------------------------------------

filename: ../dataset/1_english/fawadchaudhry.csv
size:     23525
----------------------------------------------------



In [56]:
# URLS_REMOVED_PATH
regex_urls = r'(https?://.+\b\/?|.{3}\.twitter.+\b)'
regex_mentions = r'@\w+'
regex = re.compile(regex_urls)

if not os.path.exists(URLS_MENTIONS_REMOVED_PATH):
    os.makedirs(URLS_MENTIONS_REMOVED_PATH)

for each in english_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = URLS_MENTIONS_REMOVED_PATH + filename

    # remove URLs
    dF.Text = dF.apply(lambda x: x.str.replace(regex_urls, ' '))
    # remove mentions
    dF.Text = dF.apply(lambda x: x.str.replace(regex_mentions, ' '))
    # save
    dF.to_csv(output_file, index=False, columns=['Text'])
    print(dF, '\n----------------------------------------------------')    

                                                   Text
0     Overseas Pakistani have always been Pakistan's...
1     My appeal to the international community, the ...
2     Wishing all our Christian citizens a happy Eas...
3     Pakistan innovates in the time of COVID19. Wit...
4     With tax refunds & opening up the construction...
...                                                 ...
5514  There is not one statement I have ever given s...
5515  How does opposing military operations which ha...
5516  Pakistan needs to form an independent and sove...
5517  The carnage in Lahore today is a direct result...
5518  Drone attacks are immoral. They cause deaths o...

[5519 rows x 1 columns] 
----------------------------------------------------
                                                   Text
0                       Message from   #Sindh #Covid_19
1     PPP, Pakistan Sweet Homes set up dastarkhwan -...
2                         So cool #PakistanFightsCorona
3     Opinion | The Huge 

## Filter for MIN_LENGTH

In [57]:
url_filtered_file_pairs = get_file_pairs(URLS_MENTIONS_REMOVED_PATH)

filename: ../dataset/2_filtered_urls_mentions/imrankhanpti.csv
size:     5513
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/bbhuttozardari.csv
size:     4292
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/narendramodi.csv
size:     20170
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/mjibrannasir.csv
size:     19206
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/maryamnsharif.csv
size:     27460
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/sherryrehman.csv
size:     18218
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/marvimemon.csv
size:     5671
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/fawadchaudhry.csv
size:  

In [58]:
if not os.path.exists(MIN_LENGTH_PATH):
    os.makedirs(MIN_LENGTH_PATH)


for each in url_filtered_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = MIN_LENGTH_PATH + filename
    print(output_file)

    dF = dF[ dF['Text'].str.len() > MIN_LENGTH]
    
    # save
    dF.to_csv(output_file, index=False, columns=['Text'])

    print(dF, '\n----------------------------------------------------')

../dataset/3_length_greater_than_40/imrankhanpti.csv
                                                   Text
0     Overseas Pakistani have always been Pakistan's...
1     My appeal to the international community, the ...
2     Wishing all our Christian citizens a happy Eas...
3     Pakistan innovates in the time of COVID19. Wit...
4     With tax refunds & opening up the construction...
...                                                 ...
5508  There is not one statement I have ever given s...
5509  How does opposing military operations which ha...
5510  Pakistan needs to form an independent and sove...
5511  The carnage in Lahore today is a direct result...
5512  Drone attacks are immoral. They cause deaths o...

[5357 rows x 1 columns] 
----------------------------------------------------
../dataset/3_length_greater_than_40/bbhuttozardari.csv
                                                   Text
1     PPP, Pakistan Sweet Homes set up dastarkhwan -...
3     Opinion | The Huge Cost

## Prepare Dataset
- change case to smaller-case
- remove punctuation
- replace multiple consecutive spaces with single space
- filter for MIN_LENGTH x 2
- add class labels

In [59]:
file_pairs = get_file_pairs(MIN_LENGTH_PATH)

filename: ../dataset/3_length_greater_than_40/imrankhanpti.csv
size:     5357
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/bbhuttozardari.csv
size:     3415
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/narendramodi.csv
size:     19153
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/mjibrannasir.csv
size:     17010
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/maryamnsharif.csv
size:     17578
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/sherryrehman.csv
size:     14226
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/marvimemon.csv
size:     3917
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/fawadchaudhry.csv
size:  

In [60]:
if not os.path.exists(PREPROCESSED_PATH):
    os.makedirs(PREPROCESSED_PATH)

for each in file_pairs:
    file_name = each[0]
    label = file_name.split('.csv')[0]
    dF = each[1].copy()
    output_file = PREPROCESSED_PATH + file_name

    # convert to lower-case
    dF.Text = dF.apply(lambda x: x.str.lower())
    # remove punctuation
    dF.Text = dF.apply(lambda x: x.str.replace(r'[^a-zA-Z_\s0-9#]', ' '))
    # remove extra spaces
    dF.Text = dF.apply(lambda x: x.str.replace(r'\s{2,}', ' '))
    # filter for len(tweets) > MIN_LENGTH x 2 to get more meaningful tweets
    dF = dF[ dF['Text'].str.len() > MIN_LENGTH * 2]

    # add label
    dF['Label'] = label

    dF.to_csv(output_file, index=False, columns=['Label','Text'])
    
    print(dF)

                                                   Text         Label
0     overseas pakistani have always been pakistan s...  imrankhanpti
1     my appeal to the international community the u...  imrankhanpti
2     wishing all our christian citizens a happy eas...  imrankhanpti
3     pakistan innovates in the time of covid19 with...  imrankhanpti
4     with tax refunds opening up the construction s...  imrankhanpti
...                                                 ...           ...
5350   my political role model is quaid e azam and i...  imrankhanpti
5351  there is an institutional collapse in pakistan...  imrankhanpti
5353  how does opposing military operations which ha...  imrankhanpti
5354  pakistan needs to form an independent and sove...  imrankhanpti
5356  drone attacks are immoral they cause deaths of...  imrankhanpti

[4843 rows x 2 columns]
                                                   Text           Label
3     i hope we emerge from this pandemic with a gre...  bbhutt

In [61]:
print('==========FINAL DATASET STATS==========\n')
for pair in get_file_pairs(PREPROCESSED_PATH):
    print('{0}\t\tLength: {1}'.format(pair[0], len(pair[1])))


filename: ../dataset/4_pre-processed/imrankhanpti.csv
size:     4843
----------------------------------------------------

filename: ../dataset/4_pre-processed/bbhuttozardari.csv
size:     2272
----------------------------------------------------

filename: ../dataset/4_pre-processed/narendramodi.csv
size:     16271
----------------------------------------------------

filename: ../dataset/4_pre-processed/mjibrannasir.csv
size:     12992
----------------------------------------------------

filename: ../dataset/4_pre-processed/maryamnsharif.csv
size:     9892
----------------------------------------------------

filename: ../dataset/4_pre-processed/sherryrehman.csv
size:     10206
----------------------------------------------------

filename: ../dataset/4_pre-processed/marvimemon.csv
size:     2834
----------------------------------------------------

filename: ../dataset/4_pre-processed/fawadchaudhry.csv
size:     9803
----------------------------------------------------

imrankhanp

### Create Dataset Files
- single dataset file
- split to train, test, validation sets

#### dataset.csv

In [62]:
def get_prefixed_file_pairs(files_dir):
    files = glob.glob(files_dir + '*.csv')
    file_pairs = []

    for each in files:
        file_name = each.split('/')[-1:][0]

        dF = pd.read_csv(each, usecols=['Label', 'Text'])
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))
        print('filename: {0}'.format(each))
        print('size:     {0}'.format(len(dF)))
        print('----------------------------------------------------\n')
    
    return file_pairs

In [63]:
file_pairs = get_prefixed_file_pairs(PREPROCESSED_PATH)

filename: ../dataset/4_pre-processed/imrankhanpti.csv
size:     4843
----------------------------------------------------

filename: ../dataset/4_pre-processed/bbhuttozardari.csv
size:     2272
----------------------------------------------------

filename: ../dataset/4_pre-processed/narendramodi.csv
size:     16271
----------------------------------------------------

filename: ../dataset/4_pre-processed/mjibrannasir.csv
size:     12992
----------------------------------------------------

filename: ../dataset/4_pre-processed/maryamnsharif.csv
size:     9892
----------------------------------------------------

filename: ../dataset/4_pre-processed/sherryrehman.csv
size:     10206
----------------------------------------------------

filename: ../dataset/4_pre-processed/marvimemon.csv
size:     2834
----------------------------------------------------

filename: ../dataset/4_pre-processed/fawadchaudhry.csv
size:     9803
----------------------------------------------------



In [64]:
frames_list = []
for each in file_pairs:
    dF = each[1].copy()
    frames_list.append(dF)

bigDF = pd.concat(frames_list)

if not os.path.exists(PROCESSED_PATH):
    os.makedirs(PROCESSED_PATH)

bigDF.to_csv(PROCESSED_PATH + 'dataset.csv', index=False, columns=['Label', 'Text'])

print(bigDF)

              Label                                               Text
0      imrankhanpti  overseas pakistani have always been pakistan s...
1      imrankhanpti  my appeal to the international community the u...
2      imrankhanpti  wishing all our christian citizens a happy eas...
3      imrankhanpti  pakistan innovates in the time of covid19 with...
4      imrankhanpti  with tax refunds opening up the construction s...
...             ...                                                ...
9798  fawadchaudhry  future of pakistan is to follow sufi tradition...
9799  fawadchaudhry  now naval base attacked dont remember what las...
9800  fawadchaudhry  our establishments policy of inflaming hatred ...
9801  fawadchaudhry  in the heat of dubai thinking whats biiger vio...
9802  fawadchaudhry  seems the favorite pass game in islamabad is b...

[69113 rows x 2 columns]


#### Train, Test, Val Split

In [65]:
input_file = PROCESSED_PATH + 'dataset.csv';
datasetFrame = pd.read_csv(input_file)
print('dataset length:', len(datasetFrame))

dataset length: 69113


In [66]:
non_test_set, test_set = train_test_split(datasetFrame, test_size=0.2, random_state=18030010)
train_set, val_set = train_test_split(non_test_set, test_size=0.15, random_state=18030010)

len(train_set), len(val_set), len(test_set)

(46996, 8294, 13823)

In [67]:
train_set.to_csv(PROCESSED_PATH + 'train.csv', index=False, columns=['Label', 'Text'])
print('Train Set\n',train_set)

val_set.to_csv(PROCESSED_PATH + 'val.csv', index=False, columns=['Label', 'Text'])
print('Val Set\n',val_set)

test_set.to_csv(PROCESSED_PATH + 'test.csv', index=False, columns=['Label', 'Text'])
print('Test Set\n',test_set)

Train Set
                Label                                               Text
14500   narendramodi  here is a newsletter covering my visits to kan...
56217   sherryrehman  great turnout for lunch at pak embassy in dc t...
51809   sherryrehman   isis just one mile from baghdad as al qaeda f...
46050  maryamnsharif   so the govt happily goes for re elections on ...
61615  fawadchaudhry   p rasheed kafir hai tou all anti taliban s ar...
...              ...                                                ...
14396   narendramodi  may these festivals bring abundance of happine...
21473   narendramodi  spoke to advani ji on the phone he gave me his...
17376   narendramodi  the overwhelming response along with a lot of ...
51500   sherryrehman  pleasure wish we could have chatted some next ...
12980   narendramodi  good to see you wield the broom and spread a s...

[46996 rows x 2 columns]
Val Set
                Label                                               Text
2810    imrankhanp