# Comments Cleaning

### Contents

1. Python Libraries
2. Functions containing the Text Preprocessing Techniques used
> * Case Folding
> * Removal of Non-English words
> * Removal of Punctuations
> * Removal of Stopwords
> * Removal of Emojis
> * Word Stemming
3. Word Cloud

## Python Libraries

In [None]:

import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np
import string
import seaborn as sns
import demoji

from collections import Counter
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords, words
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

## Functions containing the Text Preprocessing Techniques used

In [None]:
def fxn_case_folding(var_input):
    """
    Preprocessing step: case folding.

    Returns the value produced by calling ``.lower()`` on ``var_input``.
    """
    lowered = var_input.lower()
    return lowered

# To remove vernacular words, acronyms and wrong spellings.
# The vocabulary is built once and cached as a frozenset: evaluating
# words.words() for every token and scanning the resulting list linearly
# makes the filter needlessly slow.
_ENGLISH_VOCABULARY = None


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a frozenset, built on first use."""
    global _ENGLISH_VOCABULARY
    if _ENGLISH_VOCABULARY is None:
        _ENGLISH_VOCABULARY = frozenset(words.words())
    return _ENGLISH_VOCABULARY


def fxn_remove_non_english(input_text):
    """
    Preprocessing: Removing non-english words

    Splits ``input_text`` on whitespace and keeps only the tokens that occur
    (exact, case-sensitive match) in the NLTK ``words`` corpus.

    Returns the kept tokens joined by single spaces; the result is an empty
    string when no token is kept.
    """
    vocabulary = _english_vocabulary()
    return " ".join(w for w in input_text.split() if w in vocabulary)

def fxn_punctuation(var_input_text):
    """
    Preprocessing: Punctuation Removal

    Replaces every character of ``string.punctuation`` with a single space,
    then deletes every run of word characters that contains at least one
    digit. Whitespace is not collapsed, so the result may contain consecutive
    spaces.

    Returns the cleaned text.
    """
    var_punctuation_pattern = "[%s]" % re.escape(string.punctuation)
    # One pass is enough: the substitution leaves no punctuation behind.
    var_output_text = re.sub(var_punctuation_pattern, " ", var_input_text)
    # Raw string so that \w and \d reach the regex engine instead of being
    # treated as (invalid) string escape sequences.
    var_output_text = re.sub(r'\w*\d\w*', '', var_output_text)
    return var_output_text

# Cached set of English stopwords; filled on first use so the NLTK list is
# not rebuilt and scanned linearly for every single token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, built on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def fxn_stopwords(var_input_text):
    """
    Preprocessing: Stopwords Removal

    Splits ``var_input_text`` on whitespace and drops every token that is an
    exact match for an entry of NLTK's English stopword list.

    Returns the remaining tokens joined by single spaces.
    """
    var_stop_set = _english_stopwords()
    var_etd_stop = " ".join(
        var_etd_word for var_etd_word in var_input_text.split()
        if var_etd_word not in var_stop_set
    )
    return var_etd_stop

def fxn_demoji(text):
    """
    Preprocessing: Emoji Removal

    Deletes every run of characters whose code points fall in the ranges
    listed below (nothing is inserted in their place). The U+24C2..U+1F251 and
    U+10000..U+10FFFF ranges are broad, so non-emoji characters inside them
    are deleted as well.

    Returns the text with those characters removed.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U00010000-\U0010ffff"
    )
    emoji_regex = re.compile("[" + code_point_ranges + "]+", flags=re.UNICODE)
    cleaned = emoji_regex.sub(r'', text)
    return cleaned

def fxn_stem(var_input_text):
    """
    Preprocessing: Stemming

    Splits the text on whitespace, passes each token through a
    ``PorterStemmer`` and joins the stems back together with single spaces.
    """
    porter = PorterStemmer()
    stemmed_tokens = []
    for token in var_input_text.split():
        stemmed_tokens.append(porter.stem(token))
    return " ".join(stemmed_tokens)

In [None]:
!cat -n mwebantu_scraped_comments.txt| head

In [None]:
# Duplicate the scraped comments file so cleaning works on a copy
!cat mwebantu_scraped_comments.txt > mwebantu_scraped_comments_cleaned.txt

In [None]:
!wc -l mwebantu_scraped_comments_cleaned.txt

In [None]:
# Load the duplicated comments file into a DataFrame. header=None makes pandas
# treat the first line as data, and '|' is used as the field separator.
# NOTE(review): presumably '|' never occurs inside a comment, so every line
# becomes a single-column row — confirm against the scraped data.
facebook_post_comments = pd.read_csv('mwebantu_scraped_comments_cleaned.txt', sep = '|', header = None)

In [None]:
facebook_post_comments.columns = ['UserComments']
facebook_post_comments.head(10)

In [None]:
# Treat empty comments as missing values. The result is assigned back to the
# column: replace(..., inplace=True) on a column selected from the DataFrame is
# chained assignment, which pandas warns about and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].replace('', np.nan)

In [None]:
# Checking if there are any null entries in the Series
facebook_post_comments['UserComments'].isna().values.any()

In [None]:
# Checking the current length of the pandas Series
len(facebook_post_comments)

In [None]:
# Changing every character to lowercase.
# fxn_case_folding calls .lower() on each value, so every entry must be a
# string at this point (a NaN entry would raise AttributeError).
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_case_folding)

In [None]:
facebook_post_comments.head(10)

In [None]:
# Removing stopwords from the comments.
# NOTE: this runs before punctuation removal, and fxn_stopwords compares whole
# whitespace-separated tokens, so a stopword with punctuation attached
# (e.g. "the,") is not removed at this step.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_stopwords)

In [None]:
facebook_post_comments.head(10)

In [None]:
# Removing all the emojis from the comments.
# Characters matched by fxn_demoji are deleted outright, not replaced by a space.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_demoji)

In [None]:
facebook_post_comments.head(10)

In [None]:
# Removing every punctuation mark from the comments (each one is replaced by a
# space); fxn_punctuation also deletes any token that contains a digit.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_punctuation)

In [None]:
facebook_post_comments.head(10)

In [None]:
# Removing non-English words: only tokens found in the NLTK `words` corpus are
# kept, so a comment with no such token becomes an empty string.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_remove_non_english)

In [None]:
facebook_post_comments.head(10)

In [None]:
len(facebook_post_comments)

In [None]:
facebook_post_comments.tail(10)

In [None]:
# Checking if there are any null entries in the Series
facebook_post_comments['UserComments'].isna().values.any()

In [None]:
# Replacing empty entries with NaN so that the following dropna() can remove
# them. The result is assigned back to the column: replace(..., inplace=True)
# on a column selected from the DataFrame is chained assignment, which pandas
# warns about and which leaves the DataFrame unchanged under Copy-on-Write.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].replace('', np.nan)
facebook_post_comments['UserComments'].tail(10)

In [None]:
# dropping all rows that have NAN
facebook_post_comments.dropna(subset = ['UserComments'], inplace = True)

In [None]:
len(facebook_post_comments)

In [None]:
facebook_post_comments['UserComments'].tail(10)

In [None]:
# Stemming all the comments: fxn_stem replaces each token with its Porter stem.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].apply(fxn_stem)

In [None]:
facebook_post_comments.tail(10)

In [None]:
len(facebook_post_comments)

In [None]:
# Mark comments that are empty after stemming as missing values. Assigned back
# to the column because replace(..., inplace=True) on a selected column is
# chained assignment and does not update the DataFrame under Copy-on-Write.
facebook_post_comments['UserComments'] = facebook_post_comments['UserComments'].replace('', np.nan)

In [None]:
facebook_post_comments['UserComments'].isna().values.any()

In [None]:
# Creating a word cloud of the most typed words; it will be used to
# identify key words when classifying the comments.
most_typed_words = WordCloud(stopwords = stopwords.words('english'), background_color = 'black', colormap = 'Dark2', max_font_size = 100, random_state = 42)
# All cleaned comments are joined into one space-separated string for the cloud.
most_typed_words.generate(' '.join(facebook_post_comments['UserComments']))

# Draw the generated cloud as an image with the axes hidden.
plt.figure(figsize = (15,10))
plt.imshow(most_typed_words)
plt.axis("off")

In [None]:
# Persist the cleaned comments to disk with joblib.
# NOTE(review): the object written is the whole DataFrame, although the file
# name says "series" — confirm what the code loading this file expects.
import joblib
joblib.dump(facebook_post_comments, 'facebook_post_comments_series.pkl')