In [1]:
import pandas as pd
import numpy as np
import chardet
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import string
import utils.data_processing_utils as util

In [2]:

#Processing data for modelling
def process_data(
    data_frame,
    text_label='raw_text',
    y_label="username",
    remove_url=True,
    remove_text_with_substrings=None,
    remove_characters_from_text=None,
    remove_column_indexs=None,
    drop_na=False,
    remove_word_count_outlier=(False, 0, 0),
    remove_word_length_outlier=(False, 0, 0),
    remove_white_space=False,
):
    """Clean a text data frame for modelling and return the cleaned copy.

    The input frame is never modified; every step works on a copy, so the
    function can be called repeatedly on the same raw frame.

    Steps, in order: drop columns by position, collapse white space, strip
    URLs, drop rows containing given substrings, strip given characters,
    drop word-count / word-length outliers, drop rows whose text is the
    empty string, and finally drop rows containing null values.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Raw data. Must contain the column named by ``text_label``.
    text_label : str
        Name of the text column to clean.
    y_label : str
        Name of the label column. Accepted for interface compatibility;
        not used by this function.
    remove_url : bool
        Apply ``util.remove_url`` to every text value.
    remove_text_with_substrings : iterable of str or None
        Each substring is passed to ``util.remove_text_with_substring``.
    remove_characters_from_text : iterable of str or None
        Passed to ``util.remove_character_from_text``.
    remove_column_indexs : list of int or None
        Positional indexes of columns to drop.
    drop_na : bool
        Drop rows that contain any null value.
    remove_word_count_outlier : sequence ``[enabled, low, high]``
        When ``enabled`` is true, rows whose word count is flagged by
        ``util.get_outlier_indexes`` as "less" than ``low`` or "greater"
        than ``high`` are removed.
    remove_word_length_outlier : sequence ``[enabled, low, high]``
        Same layout, applied to the average word length.
    remove_white_space : bool
        Apply ``util.remove_block_of_white_space`` to the text column.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # Work on a copy so the caller's frame keeps its columns and values.
    data_frame = data_frame.copy()

    # Remove columns based off column index
    if remove_column_indexs is not None:
        data_frame = data_frame.drop(columns=data_frame.columns[remove_column_indexs])

    # Remove white space
    if remove_white_space:
        data_frame[text_label] = util.remove_block_of_white_space(data_frame[text_label])

    if remove_url:
        data_frame[text_label] = data_frame[text_label].apply(util.remove_url)

    # Remove rows with substring from the data
    if remove_text_with_substrings is not None:
        for substring in remove_text_with_substrings:
            data_frame = util.remove_text_with_substring(data_frame, data_frame[text_label], substring)

    # Remove characters within the rows of the data
    if remove_characters_from_text is not None:
        data_frame[text_label] = util.remove_character_from_text(
            data_frame[text_label], remove_characters_from_text
        )

    # Remove outliers. Only the first element of each setting is the switch;
    # the sequence itself is always truthy and must not be tested directly.
    drop_by_word_count = bool(remove_word_count_outlier[0])
    drop_by_word_length = bool(remove_word_length_outlier[0])

    if drop_by_word_count or drop_by_word_length:
        # Per-row word counts and average word lengths (character counts unused).
        word_count, _char_count, ave_length = util.getTextMetaInformation(data_frame, text_label)
        outlier_groups = []

        if drop_by_word_count:
            outlier_groups.append(
                util.get_outlier_indexes(word_count, remove_word_count_outlier[1], "less"))
            outlier_groups.append(
                util.get_outlier_indexes(word_count, remove_word_count_outlier[2], "greater"))

        if drop_by_word_length:
            # Same [enabled, low, high] layout as the word-count setting.
            outlier_groups.append(
                util.get_outlier_indexes(ave_length, remove_word_length_outlier[1], "less"))
            outlier_groups.append(
                util.get_outlier_indexes(ave_length, remove_word_length_outlier[2], "greater"))

        # NOTE(review): assumes get_outlier_indexes returns row positions
        # (0..len-1) of the outlying rows - confirm against the util module.
        # Casting to int keeps empty results usable as an index array.
        outliers = np.unique(
            np.concatenate([np.asarray(group, dtype=int).ravel() for group in outlier_groups])
        )

        # Keep everything that is NOT an outlier.
        keep_mask = np.ones(len(data_frame), dtype=bool)
        keep_mask[outliers] = False
        data_frame = data_frame.iloc[keep_mask]

    # Remove any row with just empty text
    has_text = data_frame[text_label] != ""
    data_frame = data_frame[has_text]

    # Remove null values (dropna returns a new frame; keep the result)
    if drop_na:
        data_frame = data_frame.dropna()

    return data_frame


In [3]:
# Pipeline configuration (all values are passed to process_data or used for file I/O)

# Input / output files
FILE_NAME = "authorship_dataset.csv"
SAVE_FILE_NAME = "processed_data.csv"

# Column names: text to clean and label to keep
X_LABEL = "raw_text"
Y_LABEL = "username"

# Simple on/off cleaning switches
REMOVE_URLS = True
REMOVE_WHITE_SPACE = True
DROP_NA = True

# What to strip: column positions, row-level substrings, and individual characters
# (REMOVE_COLUMNN_INDEXES spelling is kept because the process_data call refers to it)
REMOVE_COLUMNN_INDEXES = [0, 1]
REMOVE_SUBSTRINGS_FROM_TEXT = ["RT", '"']
REMOVE_CHARACTERS = [
    'ï', 'é', 'ñ', 'è', 'ö', 'æ',
    'ô', 'â', 'á', 'à', 'ê', 'ë',
]

# Outlier settings: the first element is the on/off flag, the remaining
# elements are the thresholds read by process_data
REMOVE_WORD_COUNT_OUTLIER = [False, 0, 0]
REMOVE_WORD_LENGTH_OUTLIER = [False, 0, 0]


In [4]:
# Load the raw dataset named in the configuration cell and show its row count.
data = pd.read_csv(filepath_or_buffer=FILE_NAME)
data.shape[0]

63019

In [5]:
# Run the cleaning pipeline with the settings from the configuration cell.
# Keywords are listed in the same order as the process_data signature.
df = process_data(
    data_frame=data,
    text_label=X_LABEL,
    y_label=Y_LABEL,
    remove_url=REMOVE_URLS,
    remove_text_with_substrings=REMOVE_SUBSTRINGS_FROM_TEXT,
    remove_characters_from_text=REMOVE_CHARACTERS,
    remove_column_indexs=REMOVE_COLUMNN_INDEXES,
    drop_na=DROP_NA,
    remove_word_count_outlier=REMOVE_WORD_COUNT_OUTLIER,
    remove_word_length_outlier=REMOVE_WORD_LENGTH_OUTLIER,
    remove_white_space=REMOVE_WHITE_SPACE,
)


63019


In [6]:
# Number of rows left after cleaning.
df.shape[0]

27417

In [7]:
# Remove spaces from the label column.
# assign() builds a new frame instead of writing into `df`, which is a filtered
# selection returned by process_data; in-place column assignment on such a frame
# can raise SettingWithCopyWarning. regex=False makes the literal-space
# replacement explicit rather than relying on the pandas-version default.
df = df.assign(**{Y_LABEL: df[Y_LABEL].str.replace(" ", "", regex=False)})

In [9]:
# Keep only the text column and the label column, in that order.
clean_df = df.loc[:, [X_LABEL, Y_LABEL]]

In [256]:
# Save the cleaned frame as a CSV file (not Excel); to_csv also writes the index by default.
clean_df.to_csv(path_or_buf=SAVE_FILE_NAME)