# Dataset Preparation

## Imports

In [334]:
import os, glob, re
import pandas as pd
from sklearn.model_selection import train_test_split

## Constants

In [335]:
# constants
MIN_LENGTH = 40

DATASET_PREFIX = '../dataset/'
ENGLISH_PATH = DATASET_PREFIX + '1_english/'
URLS_MENTIONS_REMOVED_PATH = DATASET_PREFIX + '2_filtered_urls_mentions/'
MIN_LENGTH_PATH = DATASET_PREFIX + '3_length_greater_than_{0}/'.format(MIN_LENGTH)
STOPWORDS_PATH = DATASET_PREFIX + '4_no-stopword/'
NONROMAN_PATH = DATASET_PREFIX + '5_non-roman/'
PREPROCESSED_PATH = DATASET_PREFIX + '6_pre-processed/'
PROCESSED_PATH = DATASET_PREFIX + '7_processed/'

## Load Data

In [336]:
def get_file_pairs(files_dir):
    files = glob.glob(files_dir + '*.csv')
    file_pairs = []

    for each in files:
        file_name = each.split('/')[-1:][0]

        dF = pd.read_csv(each, usecols=['Text'])
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))
        print('filename: {0}'.format(each))
        print('size:     {0}'.format(dF.size))
        print('----------------------------------------------------\n')
    
    return file_pairs

## Filter out URLs and @ Mentions

In [337]:
english_file_pairs = get_file_pairs(ENGLISH_PATH)

filename: ../dataset/1_english/narendramodi.csv
size:     20173
----------------------------------------------------

filename: ../dataset/1_english/mjibrannasir.csv
size:     19275
----------------------------------------------------

filename: ../dataset/1_english/maryamnsharif.csv
size:     27681
----------------------------------------------------

filename: ../dataset/1_english/sherryrehman.csv
size:     18469
----------------------------------------------------

filename: ../dataset/1_english/fawadchaudhry.csv
size:     23525
----------------------------------------------------



In [338]:
# URLS_REMOVED_PATH
regex_urls = r'(https?://.+\b\/?|.{3}\.twitter.+\b)'
regex_mentions = r'@\w+'
regex = re.compile(regex_urls)

if not os.path.exists(URLS_MENTIONS_REMOVED_PATH):
    os.makedirs(URLS_MENTIONS_REMOVED_PATH)

for each in english_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = URLS_MENTIONS_REMOVED_PATH + filename

    # remove URLs
    dF.Text = dF.apply(lambda x: x.str.replace(regex_urls, ' '))
    # remove mentions
    dF.Text = dF.apply(lambda x: x.str.replace(regex_mentions, ' '))
    # save
    dF.to_csv(output_file, index=False, columns=['Text'])
    print(dF, '\n----------------------------------------------------')    

                                                    Text
0      Ramzan Mubarak! I pray for everyone’s safety, ...
1      Best wishes to Goa CM   Ji on his birthday. Ma...
2      Today’s discussion with Panchayat Sarpanchs wa...
3      Interacting with Sarpanchs across the country ...
4      Some of the steps taken to help the most vulne...
...                                                  ...
20168  Narendrabhai Modi: Through water conservation ...
20169  Women would play a crucial role in the develop...
20170  Most awaited gujarati version of www.narendram...
20171  An inspiring address to Scouts & Guides in Jam...
20172  2nd Feb, I will be in Dahod for Gujarat Swarni...

[20173 rows x 1 columns] 
----------------------------------------------------
                                                    Text
0      When someone propagates and enforces ill infor...
1      Tonight as our   and   teams deliver ration to...
2      Alhumdulillah more than 5000+ families served ...
3      S

## Filter for MIN_LENGTH

In [339]:
url_filtered_file_pairs = get_file_pairs(URLS_MENTIONS_REMOVED_PATH)

filename: ../dataset/2_filtered_urls_mentions/narendramodi.csv
size:     20170
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/mjibrannasir.csv
size:     19206
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/maryamnsharif.csv
size:     27460
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/sherryrehman.csv
size:     18218
----------------------------------------------------

filename: ../dataset/2_filtered_urls_mentions/fawadchaudhry.csv
size:     23349
----------------------------------------------------



In [340]:
if not os.path.exists(MIN_LENGTH_PATH):
    os.makedirs(MIN_LENGTH_PATH)


for each in url_filtered_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = MIN_LENGTH_PATH + filename
    print(output_file)

    dF = dF[ dF['Text'].str.len() > MIN_LENGTH]
    
    # save
    dF.to_csv(output_file, index=False, columns=['Text'])

    print(dF, '\n----------------------------------------------------')

../dataset/3_length_greater_than_40/narendramodi.csv
                                                    Text
0      Ramzan Mubarak! I pray for everyone’s safety, ...
1      Best wishes to Goa CM   Ji on his birthday. Ma...
2      Today’s discussion with Panchayat Sarpanchs wa...
3      Interacting with Sarpanchs across the country ...
4      Some of the steps taken to help the most vulne...
...                                                  ...
20165  Narendrabhai Modi: Through water conservation ...
20166  Women would play a crucial role in the develop...
20167  Most awaited gujarati version of www.narendram...
20168  An inspiring address to Scouts & Guides in Jam...
20169  2nd Feb, I will be in Dahod for Gujarat Swarni...

[19153 rows x 1 columns] 
----------------------------------------------------
../dataset/3_length_greater_than_40/mjibrannasir.csv
                                                    Text
0      When someone propagates and enforces ill infor...
1      Tonight a

## Stopword Removal

In [341]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/inam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [342]:
min_length_file_pairs = get_file_pairs(MIN_LENGTH_PATH)

filename: ../dataset/3_length_greater_than_40/narendramodi.csv
size:     19153
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/mjibrannasir.csv
size:     17010
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/maryamnsharif.csv
size:     17578
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/sherryrehman.csv
size:     14226
----------------------------------------------------

filename: ../dataset/3_length_greater_than_40/fawadchaudhry.csv
size:     16934
----------------------------------------------------



In [343]:
if not os.path.exists(STOPWORDS_PATH):
    os.makedirs(STOPWORDS_PATH)

STOP_WORDS = stopwords.words('english')
print('STOP_WORDS', STOP_WORDS)
    
def remove_stopwords(texts):
    new_texts = []
    for tx in texts:
        splitted = tx.lower().split(' ')
        removed = [word for word in splitted if not word in STOP_WORDS]
        removed = ' '.join(removed)
        new_texts.append(removed)
    return new_texts

for each in min_length_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = STOPWORDS_PATH + filename
    print('output_file', output_file)

    # remove all stop_words
    dF.Text = remove_stopwords(dF.Text)

    # save
    dF.to_csv(output_file, index=False, columns=['Text'])

    print(dF, '\n----------------------------------------------------')

STOP_WORDS ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so'

## Roman English Removal
> Removes roman english tweets

In [344]:
from enchant.checker import SpellChecker

In [345]:
stopwords_file_pairs = get_file_pairs(STOPWORDS_PATH)

filename: ../dataset/4_no-stopword/narendramodi.csv
size:     19153
----------------------------------------------------

filename: ../dataset/4_no-stopword/mjibrannasir.csv
size:     17010
----------------------------------------------------

filename: ../dataset/4_no-stopword/maryamnsharif.csv
size:     17578
----------------------------------------------------

filename: ../dataset/4_no-stopword/sherryrehman.csv
size:     14226
----------------------------------------------------

filename: ../dataset/4_no-stopword/fawadchaudhry.csv
size:     16934
----------------------------------------------------



In [346]:
if not os.path.exists(NONROMAN_PATH):
    os.makedirs(NONROMAN_PATH)


ROMAN_THRESHOLD = .5

def remove_romans(texts):
    """
    Check if 50% of each sentence has valid english words
    """
    new_texts = []
    d = SpellChecker("en_US")
    
    for quote in texts:      
        d.set_text(quote)
        errors = [err.word for err in d]
        target_length = len(quote.split()) * ROMAN_THRESHOLD
        if len(errors) <= target_length:
            new_texts.append(quote)

    return pd.DataFrame({'Text': new_texts})

for each in stopwords_file_pairs:
    filename = each[0]
    dF = each[1].copy()
    output_file = NONROMAN_PATH + filename
    print('output_file', output_file)

    # filter sentences that have more than 50% non-english content
    dF = remove_romans(dF.Text)

    # save
    dF.to_csv(output_file, index=False, columns=['Text'])

    print(dF, '\n----------------------------------------------------')

output_file ../dataset/5_non-roman/narendramodi.csv
                                                    Text
0      ramzan mubarak! pray everyone’s safety, well-b...
1      best wishes goa cm   ji birthday. may blessed ...
2      today’s discussion panchayat sarpanchs insight...
3      interacting sarpanchs across country video-con...
4                         steps taken help vulnerable...
...                                                  ...
18521  reading: "modi dedicates cybre long distance e...
18522  narendrabhai modi: water conservation movement...
18523  women would play crucial role development guja...
18524  awaited gujarati version www.narendramodi.in l...
18525  inspiring address scouts & guides jamboree-200...

[18526 rows x 1 columns] 
----------------------------------------------------
output_file ../dataset/5_non-roman/mjibrannasir.csv
                                                    Text
0      someone propagates enforces ill informed polic...
1      tonight    

## Prepare Dataset
- change case to smaller-case
- remove punctuation
- replace multiple consecutive spaces with single space
- filter for MIN_LENGTH x 2
- add class labels

In [347]:
file_pairs = get_file_pairs(NONROMAN_PATH)

filename: ../dataset/5_non-roman/narendramodi.csv
size:     18526
----------------------------------------------------

filename: ../dataset/5_non-roman/mjibrannasir.csv
size:     14980
----------------------------------------------------

filename: ../dataset/5_non-roman/maryamnsharif.csv
size:     16085
----------------------------------------------------

filename: ../dataset/5_non-roman/sherryrehman.csv
size:     13888
----------------------------------------------------

filename: ../dataset/5_non-roman/fawadchaudhry.csv
size:     12451
----------------------------------------------------



In [348]:
if not os.path.exists(PREPROCESSED_PATH):
    os.makedirs(PREPROCESSED_PATH)

for each in file_pairs:
    file_name = each[0]
    label = file_name.split('.csv')[0]
    dF = each[1].copy()
    output_file = PREPROCESSED_PATH + file_name

    # convert to lower-case
    dF.Text = dF.apply(lambda x: x.str.lower())
    # remove punctuation
    dF.Text = dF.apply(lambda x: x.str.replace(r"[^a-zA-Z_\s0-9#]", ''))
    # remove extra spaces
    dF.Text = dF.apply(lambda x: x.str.replace(r'\s{2,}', ' '))
    # filter for len(tweets) > MIN_LENGTH x 2 to get more meaningful tweets
    dF = dF[ dF['Text'].str.len() > MIN_LENGTH * 1.5]

    # add label
    dF['Label'] = label

    dF.to_csv(output_file, index=False, columns=['Label','Text'])
    
    print(dF)

                                                    Text         Label
0      ramzan mubarak pray everyones safety wellbeing...  narendramodi
1      best wishes goa cm ji birthday may blessed lon...  narendramodi
2      todays discussion panchayat sarpanchs insightf...  narendramodi
3      interacting sarpanchs across country videoconf...  narendramodi
5      exchanged views covid19 pandemic pm thanked su...  narendramodi
...                                                  ...           ...
18521  reading modi dedicates cybre long distance edu...  narendramodi
18522  narendrabhai modi water conservation movement ...  narendramodi
18523  women would play crucial role development guja...  narendramodi
18524  awaited gujarati version wwwnarendramodiin lau...  narendramodi
18525  inspiring address scouts guides jamboree2009 w...  narendramodi

[15686 rows x 2 columns]
                                                    Text         Label
0      someone propagates enforces ill informed pol

In [349]:
print('==========FINAL DATASET STATS==========\n')
for pair in get_file_pairs(PREPROCESSED_PATH):
    print('{0}\t\tLength: {1}'.format(pair[0], len(pair[1])))


filename: ../dataset/6_pre-processed/narendramodi.csv
size:     15686
----------------------------------------------------

filename: ../dataset/6_pre-processed/mjibrannasir.csv
size:     11334
----------------------------------------------------

filename: ../dataset/6_pre-processed/maryamnsharif.csv
size:     8814
----------------------------------------------------

filename: ../dataset/6_pre-processed/sherryrehman.csv
size:     10046
----------------------------------------------------

filename: ../dataset/6_pre-processed/fawadchaudhry.csv
size:     7984
----------------------------------------------------

narendramodi.csv		Length: 15686
mjibrannasir.csv		Length: 11334
maryamnsharif.csv		Length: 8814
sherryrehman.csv		Length: 10046
fawadchaudhry.csv		Length: 7984


### Create Dataset Files
- single dataset file
- split to train, test, validation sets

#### dataset.csv

In [350]:
def get_prefixed_file_pairs(files_dir):
    files = glob.glob(files_dir + '*.csv')
    file_pairs = []

    for each in files:
        file_name = each.split('/')[-1:][0]

        dF = pd.read_csv(each, usecols=['Label', 'Text'])
        dF['Text'] = dF['Text'].astype('str')
        file_pairs.append((file_name, dF))
        print('filename: {0}'.format(each))
        print('size:     {0}'.format(len(dF)))
        print('----------------------------------------------------\n')
    
    return file_pairs

In [351]:
file_pairs = get_prefixed_file_pairs(PREPROCESSED_PATH)

filename: ../dataset/6_pre-processed/narendramodi.csv
size:     15686
----------------------------------------------------

filename: ../dataset/6_pre-processed/mjibrannasir.csv
size:     11334
----------------------------------------------------

filename: ../dataset/6_pre-processed/maryamnsharif.csv
size:     8814
----------------------------------------------------

filename: ../dataset/6_pre-processed/sherryrehman.csv
size:     10046
----------------------------------------------------

filename: ../dataset/6_pre-processed/fawadchaudhry.csv
size:     7984
----------------------------------------------------



In [352]:
frames_list = []
for each in file_pairs:
    dF = each[1].copy()
    frames_list.append(dF)

bigDF = pd.concat(frames_list)

if not os.path.exists(PROCESSED_PATH):
    os.makedirs(PROCESSED_PATH)

bigDF.to_csv(PROCESSED_PATH + 'dataset.csv', index=False, columns=['Label', 'Text'])

print(bigDF)

              Label                                               Text
0      narendramodi  ramzan mubarak pray everyones safety wellbeing...
1      narendramodi  best wishes goa cm ji birthday may blessed lon...
2      narendramodi  todays discussion panchayat sarpanchs insightf...
3      narendramodi  interacting sarpanchs across country videoconf...
4      narendramodi  exchanged views covid19 pandemic pm thanked su...
...             ...                                                ...
7979  fawadchaudhry   agree brings in democrats isnt democracy sans...
7980  fawadchaudhry  naval base attackeddont remember last good new...
7981  fawadchaudhry  establishments policy inflaming hatred usa may...
7982  fawadchaudhry  heat dubai thinking whats biiger violation fac...
7983  fawadchaudhry  seems favorite pass game islamabad blaming mus...

[53864 rows x 2 columns]


#### Train, Test, Val Split

In [353]:
input_file = PROCESSED_PATH + 'dataset.csv';
datasetFrame = pd.read_csv(input_file)
print('dataset length:', len(datasetFrame))

dataset length: 53864


In [354]:
non_test_set, test_set = train_test_split(datasetFrame, test_size=0.2, random_state=18030010)
train_set, val_set = train_test_split(non_test_set, test_size=0.15, random_state=18030010)

len(train_set), len(val_set), len(test_set)

(36627, 6464, 10773)

In [355]:
train_set.to_csv(PROCESSED_PATH + 'train.csv', index=False, columns=['Label', 'Text'])
print('Train Set\n',train_set)

val_set.to_csv(PROCESSED_PATH + 'val.csv', index=False, columns=['Label', 'Text'])
print('Val Set\n',val_set)

test_set.to_csv(PROCESSED_PATH + 'test.csv', index=False, columns=['Label', 'Text'])
print('Test Set\n',test_set)

Train Set
                Label                                               Text
30193  maryamnsharif  remove dark shades see historic jalsi jalsa pl...
17762   mjibrannasir  several issues connect us citizens growing can...
24841   mjibrannasir  taking effort forward sindh govt make prominen...
18156   mjibrannasir  2018 commonwealth games erratic india punished...
24583   mjibrannasir   people islamabad tell differently people g6 s...
...              ...                                                ...
40204   sherryrehman  na committee allowed gag cybercrime bill go th...
35301  maryamnsharif   unbecoming attitude warrants response v civil...
11686   narendramodi  thoughts prayers families lost lives board fli...
46868  fawadchaudhry  hi allah g ik said something bc tht ne er said...
3246    narendramodi  ekta bhyan started playing sports regular basi...

[36627 rows x 2 columns]
Val Set
                Label                                               Text
35943   sherryrehm