In [None]:
# Import libraries
import os
import pandas as pd

In [None]:
# Lift every pandas display limit so dataframes render in full
# (all rows, all columns, unlimited line width and cell width).

for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [None]:
# Load dataset

def load_data(filename):
    """Read a tweet CSV and keep only its ``Datetime`` and ``Text`` columns.

    Args:
        filename: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        A DataFrame containing just the ``Datetime`` and ``Text`` columns,
        with a default integer index (no CSV column is used as the index).
    """
    wanted_columns = ['Datetime', 'Text']
    return pd.read_csv(filename, usecols=wanted_columns, index_col=False)

In [None]:
# Get dataframes for all politicians

# Folder holding the raw tweet CSVs. It has its own name (instead of the
# generic DATASET_PREFIX, which the export cell rebinds to the output folder)
# so that re-running this cell after the export cell still reads the raw data.
RAW_DATASET_PREFIX = '../dataset/raw/'

df_bbhuttozardari = load_data(RAW_DATASET_PREFIX + 'bbhuttozardari-tweets.csv')
df_ImranKhanPTI = load_data(RAW_DATASET_PREFIX + 'ImranKhanPTI-tweets.csv')
df_MaryamNSharif = load_data(RAW_DATASET_PREFIX + 'MaryamNSharif-tweets.csv')
df_MJibranNasir = load_data(RAW_DATASET_PREFIX + 'MJibranNasir-tweets.csv')
df_narendramodi = load_data(RAW_DATASET_PREFIX + 'narendramodi-tweets.csv')
df_fawadChaudhry = load_data(RAW_DATASET_PREFIX + 'fawadchaudhry-tweets.csv')

In [None]:
# Get filtered dataframes - filtering out tweets with non-English text

# Matches any single character that is NOT an ASCII letter, a digit,
# whitespace, an ASCII punctuation mark, or the typographic apostrophe (’).
# Written as a raw string so every backslash reaches the regex engine as-is
# and Python does not warn about invalid string escape sequences.
NON_ENGLISH_PATTERN = r"[^A-Za-z0-9\s!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~’\\]"


def filter_english(df):
    """Keep only the rows whose ``Text`` consists solely of allowed characters.

    A row is dropped when its ``Text`` contains at least one character matched
    by ``NON_ENGLISH_PATTERN``. Rows with a missing ``Text`` are dropped as
    well (``na=True`` treats them as non-English).

    Args:
        df: DataFrame with a string ``Text`` column.

    Returns:
        A new DataFrame with the surviving rows and a fresh 0..n-1 index;
        the input frame is left unmodified.
    """
    has_foreign_char = (
        df['Text']
        .str.contains(NON_ENGLISH_PATTERN, regex=True, na=True)
        .astype(bool)
    )
    return df[~has_foreign_char].reset_index(drop=True)


filtered_bbhuttozardari = filter_english(df_bbhuttozardari)
filtered_ImranKhanPTI = filter_english(df_ImranKhanPTI)
filtered_MaryamNSharif = filter_english(df_MaryamNSharif)
filtered_MJibranNasir = filter_english(df_MJibranNasir)
filtered_narendramodi = filter_english(df_narendramodi)
filtered_fawadChaudhry = filter_english(df_fawadChaudhry)

In [None]:
# Write filtered dataframes to csv file

DATASET_PREFIX = r'../dataset/1_english/'

# exist_ok=True creates the folder (and any missing parents) when needed and
# is a no-op when it already exists, so no separate existence check is needed.
os.makedirs(DATASET_PREFIX, exist_ok=True)

# index=False: the 0..n-1 row index carries no information, so it is not written.
filtered_bbhuttozardari.to_csv(DATASET_PREFIX + 'bbhuttozardari.csv', index=False)
filtered_ImranKhanPTI.to_csv(DATASET_PREFIX + 'imrankhanpti.csv', index=False)
filtered_MaryamNSharif.to_csv(DATASET_PREFIX + 'maryamnsharif.csv', index=False)
filtered_MJibranNasir.to_csv(DATASET_PREFIX + 'mjibrannasir.csv', index=False)
filtered_narendramodi.to_csv(DATASET_PREFIX + 'narendramodi.csv', index=False)
filtered_fawadChaudhry.to_csv(DATASET_PREFIX + 'fawadchaudhry.csv', index=False)

In [None]:
# Report how many tweets each account had before and after the English filter.
_row_count_report = (
    ('bbhuttozardari.csv', df_bbhuttozardari, filtered_bbhuttozardari),
    ('imrankhanpti.csv', df_ImranKhanPTI, filtered_ImranKhanPTI),
    ('maryamnsharif.csv', df_MaryamNSharif, filtered_MaryamNSharif),
    ('mjibrannasir.csv', df_MJibranNasir, filtered_MJibranNasir),
    ('narendramodi.csv', df_narendramodi, filtered_narendramodi),
    ('fawadchaudhry.csv', df_fawadChaudhry, filtered_fawadChaudhry),
)

for _csv_name, _raw_frame, _english_frame in _row_count_report:
    print('{0}:\tbefore: {1}\tafter: {2}'.format(_csv_name, len(_raw_frame), len(_english_frame)))