# Data Cleaning and Preprocessing

## Import Libraries

In [1]:
# Import Dependencies
%matplotlib inline

# Begin Python Imports
import datetime, warnings, scipy
warnings.filterwarnings("ignore")

# Data Manipulation
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Progress bar
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm
tqdm_notebook.pandas()

# Text Cleaning & Normalization
import re
import pickle
import spacy
import nltk
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

print(spacy.__version__)

nltk.download('averaged_perceptron_tagger')
nlp = spacy.load("en_core_web_sm")

3.6.0


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Import data

In [2]:
# Load the raw (to-be-cleaned) dataset from a UTF-8 encoded CSV into a DataFrame
bully_data = pd.read_csv('bully_data_toclean_version.csv', encoding='utf8')

## Explore Dataset

In [3]:
# Report the dataset dimensions; DataFrame.shape is (rows, columns)
print("There are {} rows and {} columns from the dataset.".format(*bully_data.shape))

There are 123548 rows and 8 columns from the dataset.


In [4]:
# Inspect each column's dtype and non-null count
bully_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123548 entries, 0 to 123547
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         123548 non-null  int64  
 1   tag                123548 non-null  object 
 2   text               123548 non-null  object 
 3   label              5376 non-null    object 
 4   role               5376 non-null    object 
 5   harmfulness_score  5376 non-null    float64
 6   oth_language       8754 non-null    float64
 7   file_index         123548 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 7.5+ MB


In [5]:
# Delete the unwanted leftover index column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError once the
# column is already gone.
bully_data = bully_data.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Rebuild a fresh positional row index, discarding the previous index values
bully_data=bully_data.reset_index(drop=True)

In [7]:
# Re-inspect column dtypes and non-null counts after the clean-up cells above
bully_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123548 entries, 0 to 123547
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   tag                123548 non-null  object 
 1   text               123548 non-null  object 
 2   label              5376 non-null    object 
 3   role               5376 non-null    object 
 4   harmfulness_score  5376 non-null    float64
 5   oth_language       8754 non-null    float64
 6   file_index         123548 non-null  object 
dtypes: float64(2), object(5)
memory usage: 6.6+ MB


## Check for missing data

In [8]:
# Calculate the proportion of missing data

def checkMissing(data,perc=0):
    """
    Print a table of columns and their percentage of missing values,
    ordered from the most to the least incomplete column.

    data: DataFrame to inspect.
    perc: threshold in percent; only columns whose missing percentage is
          strictly greater than this value are listed (default 0).

    Nothing is returned; the table is written with print().
    """
    rows = []
    for column in data:
        missing_share = data[column].isna().mean()*100
        rows.append((column, missing_share))

    report = pd.DataFrame(rows, columns=["column_name", "percentage"])
    above_threshold = report[report.percentage > perc]
    ranked = above_threshold.sort_values("percentage", ascending=False)
    print(ranked.reset_index(drop=True))

# perc is left at its default of 0, so only columns with at least one missing value are listed
print("Proportion of missing data in columns")
checkMissing(bully_data)

Proportion of missing data in columns
         column_name  percentage
0              label   95.648655
1               role   95.648655
2  harmfulness_score   95.648655
3       oth_language   92.914495


## Text Preprocessing

In [9]:
import preprocess_text as pt
import language_tool_python
import gensim.downloader as api
#from pycontractions.contractions import Contractions
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate
# LanguageTool instance for US English; the spelling-check step of the pipeline calls tool.correct()
tool = language_tool_python.LanguageTool('en-US')
#cont = Contractions(api_key="glove-twitter-100")
# "glove-twitter-100" embeddings loaded through gensim's downloader; passed to
# expand_contractions_with_embeddings() by the contraction step of the pipeline
glove_model = api.load("glove-twitter-100")

# Functions
def expand_contractions_with_embeddings(text, embeddings_model):
    words = text.split()
    expanded_words = []

    for word in words:
        if "'" in word and word.lower() != "n't":
            contraction = re.sub(r"[^\w\s]", "", word)
            
            # Check if the contraction is present in the vocabulary
            if contraction in embeddings_model:
                expansion_candidates = embeddings_model.most_similar(positive=[contraction], topn=5)
                best_expansion = expansion_candidates[0][0]
                expanded_words.append(best_expansion)
            else:
                # Handle common variations of contractions
                if contraction == "i":
                    expanded_words.append("I")
                elif contraction == "youre":
                    expanded_words.append("you're")
                else:
                    expanded_words.append(word)
        else:
            expanded_words.append(word)

    expanded_text = ' '.join(expanded_words)
    return expanded_text


def get_term_list(path):
    '''
    Read a term list file and return its lines as a list of strings.

    Each line has its trailing newline and surrounding whitespace removed;
    blank lines are kept as empty strings.
    '''
    with open(path,"r") as handle:
        return [raw_line.strip() for raw_line in handle]

def get_vocab(corpus):
    '''
    Collect the set of distinct words found in a corpus of documents.

    Every document is split on single spaces and only pieces longer than
    two characters are kept. Progress is reported through tqdm.
    '''
    return {
        token
        for document in tqdm(corpus)
        for token in document.split(" ")
        if len(token) > 2
    }

def create_profane_mapping(profane_words,vocabulary,threshold=90):
    '''
    Function creates a mapping between commonly found profane words and words in
    document corpus

    profane_words: iterable of reference profane terms (become the dict keys)
    vocabulary: iterable of words observed in the corpus
    threshold: a vocabulary word is mapped to a profane term only when
               fuzz.ratio(profane, word) is strictly greater than this value
               (default 90, the value previously hard-coded)

    Returns a dict {profane term: set of vocabulary words mapped to it}; a term
    with no match maps to an empty set.
    '''

    # mapping dictionary
    mapping_dict = dict()

    # looping through each profane word
    for profane in tqdm(profane_words):
        mapped_words = set()

        # looping through each word in vocab
        for word in vocabulary:
            # mapping only if ratio > threshold
            try:
                if fuzz.ratio(profane,word) > threshold:
                    mapped_words.add(word)
            except Exception:
                # Best effort: skip entries the scorer cannot compare.
                # `Exception` (rather than a bare except) lets KeyboardInterrupt
                # through, so this long-running loop can still be interrupted.
                continue

        # set of all vocab words for given profane word
        mapping_dict[profane] = mapped_words

    return mapping_dict

def replace_words(corpus,mapping_dict):
    '''
    Normalise obfuscated profane words in every document of a corpus.

    corpus: iterable of document strings
    mapping_dict: {canonical word: collection of variants to replace with it}

    Mappings are applied one after another in dict order, so a word produced
    by an earlier mapping can be rewritten again by a later one. Returns a
    list with one whitespace-normalised string per input document.
    '''
    cleaned_documents = []

    for document in tqdm(corpus):
        tokens = document.split()

        # apply every (canonical, variants) pair in turn to the running token list
        for canonical, variants in mapping_dict.items():
            for variant in variants:
                tokens = [canonical if token == variant else token for token in tokens]

        cleaned_documents.append(" ".join(tokens).strip())

    return cleaned_documents

# Counts of term by category
countvec = CountVectorizer(ngram_range=(1,3))
def get_term_counts(x,category):
    '''
    Count how many n-grams (unigram, bigram, trigram - the keywords may span
    up to 3 words) extracted from the input appear in a term list.

    x: a single text (str) or an iterable of texts
    category: collection of terms to look for

    Returns the number of extracted n-grams that are members of `category`.
    The module-level `countvec` is re-fitted on every call.
    '''
    # CountVectorizer expects an iterable of documents. Handed a bare string it
    # raises ValueError, which the fallback below would silently turn into a
    # count of 0 - so wrap a single string into a one-document list.
    documents = [x] if isinstance(x, str) else x

    try:
        countvec.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back only on older releases.
        if hasattr(countvec, "get_feature_names_out"):
            ngrams = countvec.get_feature_names_out()
        else:
            ngrams = countvec.get_feature_names()
    except ValueError:
        ngrams = [' '] # to handle scenario where text input are all stop words only

    # set membership instead of scanning the term list once per n-gram
    lookup = set(category) if isinstance(category, (list, tuple)) else category

    # return the number of occurence
    return sum(1 for term in ngrams if term in lookup)


# Load every external term file into its own module-level list
(
    term_absolute_list,
    term_allness_list,
    term_badword_list,
    term_negation_list,
    term_diminisher_list,
    term_intensifier_list,
) = map(get_term_list, (
    "term_list/compiled_absolute.txt",
    "term_list/compiled_allness.txt",
    "term_list/compiled_badword.txt",
    "term_list/compiled_negation.txt",
    "term_list/compiled_diminisher.txt",
    "term_list/compiled_intensifier.txt",
))

In [10]:
# Text Preprocessing

def text_preprocessing_pipeline(df=None,
                                textual_statistics=False,
                                remove_url=False,
                                remove_email=False,
                                remove_user_mention=False,
                                remove_html=False,
                                remove_space_single_char=False,
                                normalize_elongated_char=False,
                                normalize_emoji=False,
                                normalize_emoticon=False,
                                normalize_accented=False,
                                lower_case=False,
                                normalize_slang=False,
                                normalize_badterm=False,
                                spelling_check=False,
                                normalize_contraction=False,
                                term_list=False,
                                remove_numeric=False,
                                remove_stopword=False,
                                keep_pronoun=False,
                                remove_punctuation=False,
                                pos=False,
                                ner=False,
                                lemmatise=False
                               ):
    '''
    -------------
     Description
    -------------
    Function that compile all preprocessing steps in one go.

    Statistics, POS and NER features are computed from the original `text`
    column; every cleaning step reads and rewrites the `text_check` column.
    New columns are added to the frame that is passed in; the returned frame
    is a filtered, re-indexed result with empty `text_check` rows removed.
    Regardless of the flags, `text_check` always has repeated whitespace
    collapsed and is lower-cased before the final filtering.

    -----------
     Parameter
    -----------
    df: Data Frame with a `text` column (defaults to the global `bully_data`,
        looked up at call time)
    textual_statistics: Boolean, add count/length statistic columns
    remove_url: Boolean, (re)build `text_check` from `text` with URLs removed;
                when False an existing `text_check` is reused, or it is seeded
                with a copy of `text` if the column does not exist yet
    remove_email: Boolean
    remove_user_mention: Boolean
    remove_html: Boolean
    remove_space_single_char: Boolean
    normalize_elongated_char: Boolean
    normalize_emoji: Boolean, also adds `emoji_counts`
    normalize_emoticon: Boolean, also adds `emoticon_counts`
    normalize_accented: Boolean
    lower_case: Boolean
    normalize_slang: Boolean
    normalize_badterm: Boolean, replace fuzzy matches of `term_badword_list`
    spelling_check: Boolean, correct text with the global `tool` and close it
    normalize_contraction: Boolean
    term_list: Boolean, accepted but not read anywhere in this function
    remove_numeric: Boolean
    remove_stopword: Boolean
    keep_pronoun: Boolean, only consulted when remove_stopword is True
    remove_punctuation: Boolean
    pos: Boolean, add the `pos` column and one count column per POS tag
    ner: Boolean, add the `ner` column and one count column per NER label
    lemmatise: Boolean

    --------
     Return
    --------
    Data Frame with non-empty `text_check` rows and a fresh 0..n-1 index

    '''

    # Resolve the default at call time rather than binding whatever object
    # `bully_data` referred to when the function was defined.
    if df is None:
        df = bully_data

    if textual_statistics:
        print('Developing textual statistics from original text')
        # (output column, extractor) pairs, in the order the columns are created
        text_statistics = [
            ('word_count', pt.get_wordcounts),
            ('char_count', pt.get_char_counts),
            ('avg_word_len', pt.get_avg_wordlength),
            ('stopword_count', pt.get_stopwords_counts),
            ('hashtag_count', pt.get_hashtag_counts),
            ('mention_count', pt.get_mention_counts),
            ('digit_counts', pt.get_digit_counts),
            ('uppercase_count', pt.get_uppercase_counts),
            ('emails_count', pt.get_emails),
            ('urls_count', pt.get_urls),
            ('punc_count', pt.get_punc_counts),
            ('exclaimation_count', lambda x: x.count("!")),
            ('questionmark_count', lambda x: x.count("?")),
        ]
        for column, extractor in text_statistics:
            df[column] = df['text'].progress_apply(lambda x, extractor=extractor: extractor(x))

    if pos:
        print('Text Preprocessing: Developing POS tag count')
        df["pos"] = df["text"].progress_apply(lambda x: pt.get_pos_tag(x))
        # (output column, POS tag) pairs, in the order the columns are created
        pos_count_columns = [
            ("pos_ADJ_counts", "ADJ"),       #adjective
            ("pos_ADP_counts", "ADP"),       #adposition
            ("pos_ADV_counts", "ADV"),       #adverb
            ("pos_AUX_counts", "AUX"),       #auxiliary
            ("pos_CCONJ_counts", "CCONJ"),   #coordinating conjunction
            ("pos_DET_counts", "DET"),       #determiner
            ("pos_NOUN_counts", "NOUN"),     #noun
            ("pos_INTJ_counts", "INTJ"),     #interjection
            ("pos_NUM_counts", "NUM"),       #numeral
            ("pos_PART_counts", "PART"),     #particle
            ("pos_PRON_counts", "PRON"),     #pronoun
            ("pos_PROPN_counts", "PROPN"),   #proper noun
            ("pos_PUNCT_counts", "PUNCT"),   #punctuation
            ("pos_SCONJ_counts", "SCONJ"),   #subordinating conjunction
            ("pos_SYM_counts", "SYM"),       #symbol
            ("pos_VERB_counts", "VERB"),     #verb
            ("pos_other_counts", "X"),       #other
        ]
        for column, tag in pos_count_columns:
            df[column] = df["pos"].progress_apply(
                lambda x, tag=tag: pt.get_pos_tag_counts(x,pos_tag=tag))

    if ner:
        print('Text Preprocessing: Developing NER tag count')
        df["ner"] = df["text"].progress_apply(lambda x: pt.get_ner(x))
        # loop variable deliberately not called `ner`, so the flag is not shadowed
        for ner_label in nlp.pipe_labels['ner']:
            df["ner_"+ ner_label +"_counts"] = df["ner"].apply(
                lambda x, ner_label=ner_label: pt.get_ner_counts(x,ner_label))

    if remove_url:
        print('Text Preprocessing: Remove URL')
        df['text_check'] = df['text'].progress_apply(lambda x: pt.remove_urls(x))
    elif 'text_check' not in df.columns:
        # The URL step is the only one that creates text_check; without it a
        # fresh frame would hit a KeyError in the first step that reads it.
        df['text_check'] = df['text']

    if remove_email:
        print('Text Preprocessing: Remove email')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_emails(x))

    if remove_user_mention:
        print('Text Preprocessing: Remove user mention')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_mention(x))

    if remove_html:
        print('Text Preprocessing: Remove html element')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_html_tags(x))

    if remove_space_single_char:
        print('Text Preprocessing: Remove single spcae between single characters e.g F U C K')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_space_single_chars(x))

    if normalize_elongated_char:
        print('Text Preprocessing: Reduction of elongated characters')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_elongated_chars(x))

    if normalize_emoji:
        print('Text Preprocessing: Normalize and count emoji')
        # count first: the conversion below rewrites the emoji
        df['emoji_counts'] = df['text_check'].progress_apply(lambda x: pt.get_emoji_counts(x))
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.convert_emojis(x))

    if normalize_emoticon:
        print('Text Preprocessing: Normalize and count emoticon')
        # count first: the conversion below rewrites the emoticons
        df['emoticon_counts'] = df['text_check'].progress_apply(lambda x: pt.get_emoticon_counts(x))
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.convert_emoticons(x))

    if normalize_accented:
        print('Text Preprocessing: Normalize accented character')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_accented_chars(x))

    if lower_case:
        print('Text Preprocessing: Convert to lower case')
        df['text_check'] = df['text_check'].progress_apply(lambda x: str(x).lower())

    if normalize_slang:
        print('Text Preprocessing: Normalize slang')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.slang_resolution(x))

    if normalize_badterm:
        print('Text Preprocessing: Replace obfuscated bad term')
        # unique words in vocab
        unique_words = get_vocab(corpus= df['text_check'])

        # creating mapping dict
        mapping_dict = create_profane_mapping(profane_words=term_badword_list,vocabulary=unique_words)

        df['text_check'] = replace_words(corpus=df['text_check'],
                                                 mapping_dict=mapping_dict)

    if spelling_check:
        print('Text Preprocessing: Spelling Check')
        df['text_check'] = df['text_check'].progress_apply(lambda x: tool.correct(x))
        # NOTE(review): this closes the shared module-level `tool`; a later call
        # with spelling_check=True presumably needs a new instance - confirm.
        tool.close()

    if normalize_contraction:
        print('Text Preprocessing: Contraction to Expansion')

        # Special handling to prevent code from taking forever to run:
        # two row positions of the original dataset are pre-expanded by hand.
        # Replacements are applied in the listed order.
        slow_row_fixes = {
            50702: (("'d"," would"), ("wasn't","was not"), ("wouldn't","would not"), ("'s"," is"), ("'m"," am")),
            107720: (("'d"," would"), ("wasn't","was not"), ("wouldn't","would not")),
        }
        text_check_position = df.columns.get_loc('text_check')
        for row_position, replacements in slow_row_fixes.items():
            # Frames shorter than the hard-coded position have no such row
            if row_position >= len(df):
                continue
            patched_text = df.iat[row_position, text_check_position]
            for old, new in replacements:
                patched_text = patched_text.replace(old, new)
            # direct positional write instead of chained `df[col].iloc[i] = ...`
            df.iat[row_position, text_check_position] = patched_text

        df['text_check'] = df['text_check'].progress_apply(lambda x: expand_contractions_with_embeddings(x, glove_model))

    if remove_numeric:
        print('Text Preprocessing: Remove numeric')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_numeric(x))

    if remove_punctuation:
        print('Text Preprocessing: Remove punctuations')
        df['text_check'] = df['text_check'].progress_apply(lambda x: pt.remove_special_chars(x))

    if remove_stopword:
        print('Text Preprocessing: Remove stopword')
        if keep_pronoun:
            print('Text Preprocessing: and, keep Pronoun')
        df["text_check"] = df["text_check"].progress_apply(lambda x: pt.remove_stopwords(x,keep_pronoun=keep_pronoun))

    # Remove multiple spaces
    print('Text Preprocessing: Remove multiple spaces')
    df['text_check'] = df['text_check'].progress_apply(lambda x: ' '.join(x.split()))

    if lemmatise:
        print('Text Preprocessing: Lemmatization')
        df["text_check"] = df["text_check"].progress_apply(lambda x: pt.make_base(x))

    # Make sure lower case for all again
    df['text_check'] = df['text_check'].progress_apply(lambda x: str(x).lower())

    # Remove empty text after cleaning
    print('Last Step: Remove empty text after preprocessing. Done')
    df = df[~df['text_check'].isna()]
    df = df[df['text_check'] != '']
    df = df.reset_index(drop=True)

    return df

## Output the preprocessed and cleaned data

In [11]:
# Full cleaning pass that keeps stopwords. The pipeline assigns its new columns
# on the frame it is given, so `bully_data` itself gains them as well.
bully_data_clean_with_stopword = text_preprocessing_pipeline(
                                    df=bully_data,
                                    textual_statistics=True,
                                    remove_url=True,
                                    remove_email=True,
                                    remove_user_mention=True,
                                    remove_html=True,
                                    remove_space_single_char=True,
                                    normalize_elongated_char=True,
                                    normalize_emoji=True,
                                    normalize_emoticon=True,
                                    normalize_accented=True,
                                    lower_case=True,
                                    normalize_slang=True,
                                    normalize_badterm=True,
                                    spelling_check=True,
                                    normalize_contraction=True,
                                    remove_numeric=True,
                                    remove_stopword=False, # Keep stopwords
                                    keep_pronoun=False,  # No effect here: only read when remove_stopword=True
                                    remove_punctuation=True,
                                    pos=True,
                                    ner=True,
                                    lemmatise=True)


Developing textual statistics from original text


  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Developing POS tag count


  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Developing NER tag count


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove URL


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove email


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove user mention


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove html element


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove single spcae between single characters e.g F U C K


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Reduction of elongated characters


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Normalize and count emoji


  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Normalize and count emoticon


  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Normalize accented character


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Convert to lower case


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Normalize slang


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Replace obfuscated bad term


100%|██████████| 123548/123548 [00:00<00:00, 236306.14it/s]
100%|██████████| 1921/1921 [1:02:39<00:00,  1.96s/it]
100%|██████████| 123548/123548 [02:52<00:00, 718.10it/s]

Text Preprocessing: Spelling Check





  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Contraction to Expansion


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove numeric


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove punctuations


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Remove multiple spaces


  0%|          | 0/123548 [00:00<?, ?it/s]

Text Preprocessing: Lemmatization


  0%|          | 0/123548 [00:00<?, ?it/s]

  0%|          | 0/123548 [00:00<?, ?it/s]

Last Step: Remove empty text after preprocessing. Done


In [12]:
# Independent copies: the pipeline rewrites `text_check` on the frame it receives,
# so each stopword variant below gets its own copy of the cleaned data.
bully_data_clean_with_stopword_base1 =  bully_data_clean_with_stopword.copy()
bully_data_clean_with_stopword_base2 =  bully_data_clean_with_stopword.copy()

In [13]:
# Variant 2: start from the already cleaned `text_check` (all other steps stay off)
# and remove stopwords while keeping pronouns.
bully_data_clean_no_stopword_pronoun = text_preprocessing_pipeline(
                                            df=bully_data_clean_with_stopword_base1, 
                                            remove_stopword=True, # Remove stopwords
                                            keep_pronoun=True) # But keep pronoun

Text Preprocessing: Remove stopword
Text Preprocessing: and, keep Pronoun


  0%|          | 0/120938 [00:00<?, ?it/s]

Text Preprocessing: Remove multiple spaces


  0%|          | 0/120938 [00:00<?, ?it/s]

  0%|          | 0/120938 [00:00<?, ?it/s]

Last Step: Remove empty text after preprocessing. Done


In [14]:
# Variant 3: start from the already cleaned `text_check` (all other steps stay off)
# and remove every stopword, pronouns included.
bully_data_clean_no_stopword_all = text_preprocessing_pipeline(
                                        df=bully_data_clean_with_stopword_base2,
                                        remove_stopword=True, # Remove all stopwords
                                        keep_pronoun=False)

Text Preprocessing: Remove stopword


  0%|          | 0/120938 [00:00<?, ?it/s]

Text Preprocessing: Remove multiple spaces


  0%|          | 0/120938 [00:00<?, ?it/s]

  0%|          | 0/120938 [00:00<?, ?it/s]

Last Step: Remove empty text after preprocessing. Done


In [15]:
# Persist the three cleaned variants as CSV files in the working directory.
# NOTE(review): to_csv is called without index=False, so the row index is written
# as an unnamed first column - presumably how the `Unnamed: 0` column dropped
# earlier in this notebook came about; confirm whether downstream readers need it.
bully_data_clean_with_stopword.to_csv('bully_data_clean_with_stopword.csv')
bully_data_clean_no_stopword_pronoun.to_csv('bully_data_clean_no_stopword_pronoun.csv')
bully_data_clean_no_stopword_all.to_csv('bully_data_clean_no_stopword_all.csv')