In [10]:
# Import libraries
import os
import pandas as pd

In [11]:
# Lift every pandas display limit (row count, column count, console width,
# per-cell text width) so frames are shown in full instead of truncated.

for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    # None means "no limit" for each of these options.
    pd.set_option(_display_option, None)

In [12]:
# Load dataset

def load_data(filename):
    """Read a CSV file, keeping only the 'Datetime' and 'Text' columns.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with the 'Datetime' and 'Text' columns; no CSV column is
        used as the index (``index_col=False``).
    """
    wanted_columns = ['Datetime', 'Text']
    return pd.read_csv(filename, usecols=wanted_columns, index_col=False)

In [13]:
# Load one raw dataframe per politician account.
# Every file lives under DATASET_PREFIX and is read through load_data().

DATASET_PREFIX = '../dataset/raw/'

df_bbhuttozardari = load_data(f'{DATASET_PREFIX}bbhuttozardari.csv')
df_ImranKhanPTI = load_data(f'{DATASET_PREFIX}imrankhanpti.csv')
df_MaryamNSharif = load_data(f'{DATASET_PREFIX}maryamnsharif.csv')
df_MJibranNasir = load_data(f'{DATASET_PREFIX}mjibrannasir.csv')
df_narendramodi = load_data(f'{DATASET_PREFIX}narendramodi.csv')
df_fawadChaudhry = load_data(f'{DATASET_PREFIX}fawadchaudhry.csv')
df_marviMemon = load_data(f'{DATASET_PREFIX}marvimemon.csv')
df_sherryRehman = load_data(f'{DATASET_PREFIX}sherryrehman.csv')

In [14]:
# Report the number of rows loaded for each raw dataframe, one per line.
# Labels are padded so the tab lands the counts in the same column.
print('\n'.join(
    '{} {}'.format(label, len(frame))
    for label, frame in (
        ('df_bbhuttozardari\t', df_bbhuttozardari),
        ('df_ImranKhanPTI \t', df_ImranKhanPTI),
        ('df_MaryamNSharif\t', df_MaryamNSharif),
        ('df_MJibranNasir \t', df_MJibranNasir),
        ('df_narendramodi \t', df_narendramodi),
        ('df_fawadChaudhry\t', df_fawadChaudhry),
        ('df_marviMemon   \t', df_marviMemon),
        ('df_sherryRehman \t', df_sherryRehman),
    )
))

df_bbhuttozardari	 4595
df_ImranKhanPTI 	 6254
df_MaryamNSharif	 31515
df_MJibranNasir 	 19861
df_narendramodi 	 23796
df_fawadChaudhry	 25075
df_marviMemon   	 6262
df_sherryRehman 	 20381


In [15]:
# Get filtered dataframes - filtering out tweets with non-english text

# Matches any single character that is NOT an ASCII letter, a digit,
# whitespace, ASCII punctuation, or the ’ character. A tweet containing at
# least one such character is treated as non-English.
# Written as a raw string so the regex escapes (\s, \-, \[, \], \\) reach the
# regex engine unchanged instead of relying on Python tolerating invalid
# string escape sequences.
NON_ENGLISH_PATTERN = r"""[^A-Za-z0-9\s!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~’\\]"""


def filter_english(df):
    """Return the rows of `df` whose 'Text' has no character outside the allowed set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Text' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `df` itself is not modified.
        Rows whose 'Text' is missing are dropped as well (``na=True`` counts
        them as non-English), which is also what the previous
        ``contains(...) == False`` comparison did.
    """
    has_non_english = df.Text.str.contains(NON_ENGLISH_PATTERN, na=True)
    return df[~has_non_english].reset_index(drop=True)


filtered_bbhuttozardari = filter_english(df_bbhuttozardari)
filtered_ImranKhanPTI = filter_english(df_ImranKhanPTI)
filtered_MaryamNSharif = filter_english(df_MaryamNSharif)
filtered_MJibranNasir = filter_english(df_MJibranNasir)
filtered_narendramodi = filter_english(df_narendramodi)
filtered_fawadChaudhry = filter_english(df_fawadChaudhry)
filtered_marviMemon = filter_english(df_marviMemon)
filtered_sherryRehman = filter_english(df_sherryRehman)

In [16]:
# Write filtered dataframes to csv file

# Output directory for the English-only tweets. This rebinds DATASET_PREFIX,
# which the loading cell had pointed at the raw-data directory.
DATASET_PREFIX = r'../dataset/1_english/'

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without a separate exists() check that could race with another
# process creating it in between.
os.makedirs(DATASET_PREFIX, exist_ok=True)

# index=False: the frames carry a plain 0..n-1 index that is not worth storing.
filtered_bbhuttozardari.to_csv(DATASET_PREFIX + 'bbhuttozardari.csv', index=False)
filtered_ImranKhanPTI.to_csv(DATASET_PREFIX + 'imrankhanpti.csv', index=False)
filtered_MaryamNSharif.to_csv(DATASET_PREFIX + 'maryamnsharif.csv', index=False)
filtered_MJibranNasir.to_csv(DATASET_PREFIX + 'mjibrannasir.csv', index=False)
filtered_narendramodi.to_csv(DATASET_PREFIX + 'narendramodi.csv', index=False)
filtered_fawadChaudhry.to_csv(DATASET_PREFIX + 'fawadchaudhry.csv', index=False)
filtered_marviMemon.to_csv(DATASET_PREFIX + 'marvimemon.csv', index=False)
filtered_sherryRehman.to_csv(DATASET_PREFIX + 'sherryrehman.csv', index=False)

In [17]:
# Summarise, per file, how many rows existed before filtering and how many
# remain afterwards. Each template is filled with (raw count, filtered count).
print('\n'.join(
    template.format(len(raw_frame), len(kept_frame))
    for template, raw_frame, kept_frame in (
        ('bbhuttozardari.csv:\tbefore: {0}\tafter: {1}', df_bbhuttozardari, filtered_bbhuttozardari),
        ('imrankhanpti.csv:\tbefore: {0}\tafter: {1}', df_ImranKhanPTI, filtered_ImranKhanPTI),
        ('maryamnsharif.csv:\tbefore: {0}\tafter: {1}', df_MaryamNSharif, filtered_MaryamNSharif),
        ('mjibrannasir.csv:\tbefore: {0}\tafter: {1}', df_MJibranNasir, filtered_MJibranNasir),
        ('narendramodi.csv:\tbefore: {0}\tafter: {1}', df_narendramodi, filtered_narendramodi),
        ('fawadchaudhry.csv:\tbefore: {0}\tafter: {1}', df_fawadChaudhry, filtered_fawadChaudhry),
        ('marvimemon.csv:\tbefore: {0}\tafter: {1}', df_marviMemon, filtered_marviMemon),
        ('sherryrehman.csv:\tbefore: {0}\tafter: {1}', df_sherryRehman, filtered_sherryRehman),
    )
))


bbhuttozardari.csv:	before: 4595	after: 4339
imrankhanpti.csv:	before: 6254	after: 5519
maryamnsharif.csv:	before: 31515	after: 27681
mjibrannasir.csv:	before: 19861	after: 19275
narendramodi.csv:	before: 23796	after: 20173
fawadchaudhry.csv:	before: 25075	after: 23525
marvimemon.csv:	before: 6262	after: 5974
sherryrehman.csv:	before: 20381	after: 18469
