# Function to perform pre-processing:
* clean noise
* remove stopwords
* stemming 
* lemmatization 

In [1]:
# Import the packages and functions required by the cleaning functions
import re
import spacy
import nltk
import pandas as pd 
import numpy as np 

In [3]:
# Import the stop-word corpora from nltk, spacy and stop_words
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS
from stop_words import get_stop_words

In [4]:
# Import stemming packages
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.arlstem import ARLSTem
from nltk.stem.arlstem2 import ARLSTem2
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Contraction expansions, applied in order. "what's" and "can't" must come
# before the generic "'s" and "n't" rules so they are expanded as whole words.
_CONTRACTION_RULES = (
    (re.compile(r"what's"), "what is "),
    (re.compile(r"'s"), " "),
    (re.compile(r"'ve"), " have "),
    (re.compile(r"can't"), "cannot "),
    (re.compile(r"n't"), " not "),
    (re.compile(r"i'm"), "i am "),
    (re.compile(r"'re"), " are "),
    (re.compile(r"'d"), " would "),
    (re.compile(r"'ll"), " will "),
)
# Commas, full stops and leftover apostrophes act as word separators.
_PUNCT_TO_SPACE_RE = re.compile(r"[,.']")
# Runs of digits are dropped entirely.
_NUMBERS_RE = re.compile(r'\d+', re.IGNORECASE)
# Everything that is not a letter, digit or space is dropped.
_SPECIAL_CHARS_RE = re.compile(r'[^a-z\d ]', re.IGNORECASE)
_WHITESPACE_RE = re.compile(r"\s+")
# Stop-word sets keyed by corpus name, filled lazily on first use.
_STOP_WORD_CACHE = {}


def _stop_word_corpus(stop_words_param):
    """Return the cached stop-word set selected by ``stop_words_param``."""
    # Anything other than 'nltk' / 'spacy' selects the stop_words package.
    key = stop_words_param if stop_words_param in ('nltk', 'spacy') else 'py'
    if key not in _STOP_WORD_CACHE:
        if key == 'nltk':
            words = stopwords.words('english')
        elif key == 'spacy':
            words = STOP_WORDS
        else:
            words = get_stop_words('english')
        _STOP_WORD_CACHE[key] = frozenset(words)
    return _STOP_WORD_CACHE[key]


def clean_text(text, stop_words, stop_words_param):
    """Clean noise from a text, optionally removing English stop words.

    The text is lower-cased, common contractions are expanded, digits and
    all characters other than letters and spaces are removed, and whitespace
    is normalised to single spaces with no leading or trailing space.

    Parameters:
    text - text to clean; ``None`` and float NaN are treated as missing and
        yield an empty string
    stop_words - when truthy, stop words are removed from the result
    stop_words_param - corpus used for stop-word removal: 'nltk', 'spacy',
        or any other value for the ``stop_words`` package; the corpus is
        only loaded when ``stop_words`` is truthy

    Returns the cleaned text as a single string.
    """
    # Missing values (None / NaN) have no text to clean.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Normalise the typographic apostrophe so that every contraction rule
    # (not only "i'm") also applies to text typed with curly quotes.
    text = str(text).lower().replace("\u2019", "'")
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    text = _PUNCT_TO_SPACE_RE.sub(" ", text)
    text = _NUMBERS_RE.sub("", text)
    # Turn tabs, newlines etc. into spaces BEFORE stripping special
    # characters; otherwise they would be deleted and glue words together.
    text = _WHITESPACE_RE.sub(" ", text)
    text = _SPECIAL_CHARS_RE.sub("", text)
    words = text.split()
    if stop_words:
        corpus = _stop_word_corpus(stop_words_param)
        words = [word for word in words if word not in corpus]
    return " ".join(words)

In [None]:
def clean_dataframe(df, stop_words=True, lemmatization=True, stem_words=True, write_to_file_param=True, stop_words_param='nltk', lem_param='wn', stem_words_param='lancaster', path='data/'):
    """Clean the 'comment_text' column of a dataframe and add derived columns.

    The dataframe is modified in place and also returned. Depending on the
    flags it gains the following columns:
    'clean_comments_without_stop_w' (stop_words truthy) or 'clean_comments'
    (stop_words falsy) - output of clean_text
    'lem_comments' - lemmatized cleaned text (only when lemmatization is truthy)
    'stem_comments' - stemmed cleaned text (only when stem_words is truthy)

    Parameters:
    df - dataframe with a 'comment_text' column
    stop_words - boolean (default True); whether stop words are removed during cleaning
    lemmatization - boolean (default True); whether to add the lemmatized column
    stem_words - boolean (default True); whether to add the stemmed column
    write_to_file_param - boolean (default True); whether to save the resulting
        dataframe as 'df_cleaned.csv' inside `path`
    stop_words_param - stop-word corpus selector passed to clean_text (default 'nltk')
    lem_param - lemmatizer selector passed to lem_sentence (default 'wn')
    stem_words_param - stemmer selector passed to stem_sentence (default 'lancaster')
    path - directory the CSV file is written to (default 'data/'); it is
        created if it does not exist, and a trailing separator is optional
    """
    # Local import: keeps this function self-contained without touching the
    # file's import cells.
    from pathlib import Path

    col_name = 'clean_comments_without_stop_w' if stop_words else 'clean_comments'
    df[col_name] = df['comment_text'].map(lambda text: clean_text(text, stop_words, stop_words_param))
    if lemmatization:
        df['lem_comments'] = df[col_name].map(lambda text: lem_sentence(text, lem_param))
    if stem_words:
        df['stem_comments'] = df[col_name].map(lambda text: stem_sentence(text, stem_words_param))
    if write_to_file_param:
        out_dir = Path(path)
        # Create the target directory up front so that a missing folder does
        # not make the write fail after all the processing has been done.
        out_dir.mkdir(parents=True, exist_ok=True)
        df.to_csv(out_dir / 'df_cleaned.csv', index=False)
    return df

In [None]:
def stem_sentence(sentence, stem_words_param):
    """Stem every token of a sentence and return the stems joined by spaces.

    Parameters:
    sentence - text to stem; it is split into tokens with nltk's word_tokenize
    stem_words_param - stemmer to use: 'snow' (SnowballStemmer for English),
        'porter' (PorterStemmer), 'arl' (ARLSTem2); any other value selects
        the LancasterStemmer

    Returns the stemmed tokens separated by single spaces, without a
    trailing space (an empty string when the sentence has no tokens).
    """
    if stem_words_param == 'snow':
        stemmer = SnowballStemmer("english")
    elif stem_words_param == 'porter':
        stemmer = PorterStemmer()
    elif stem_words_param == 'arl':
        # NOTE(review): NLTK documents ARLSTem2 as an Arabic light stemmer;
        # confirm it is really intended for this (English) pipeline.
        stemmer = ARLSTem2()
    else:
        stemmer = LancasterStemmer()
    return " ".join(stemmer.stem(word) for word in word_tokenize(sentence))

In [None]:
def lem_sentence(sentence, lem_param):
    """Lemmatize every token of a sentence and return them joined by spaces.

    Parameters:
    sentence - text to lemmatize; it is split into tokens with nltk's word_tokenize
    lem_param - lemmatizer to use; only 'wn' (WordNetLemmatizer) is supported

    Returns the lemmatized tokens separated by single spaces, without a
    trailing space (an empty string when the sentence has no tokens).

    Raises ValueError when lem_param names an unsupported lemmatizer, so an
    unknown value cannot silently turn every text into an empty string.
    """
    # Validate first, so a bad selector fails before any tokenizing is done.
    if lem_param != 'wn':
        raise ValueError(
            "Unsupported lem_param {!r}; expected 'wn'".format(lem_param)
        )
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(sentence))