# Common Data Cleansing Functions

**Below I have compiled a few common functions that I use to clean up each text, step by step, and to assign sentiment labels.**

In [None]:
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from textblob import TextBlob

# Download the NLTK data packages below if they are required
for _package in ('punkt', 'stopwords'):
    nltk.download(_package)

In [None]:
# Return a boolean telling whether a word (pattern) is found in the text.
# Use this to drop records that may not be relevant to your analysis prior
# to any cleaning.
def word_in_text(word, text):
    """Report whether ``word`` occurs anywhere in ``text``, ignoring case.

    ``word`` is handed to ``re.search`` and is therefore interpreted as a
    regular expression, so alternations such as ``"python|java"`` work.
    Wrap it in ``re.escape`` first when it should be matched literally and
    may contain regex metacharacters.

    Args:
        word: Pattern (regular-expression string) to look for.
        text: String to search.

    Returns:
        True if the pattern matches somewhere in ``text``, otherwise False.
    """
    # Match with IGNORECASE rather than lower-casing both arguments:
    # lower-casing the pattern silently flips escapes such as \S, \D, \W
    # and \B into their opposites (\s, \d, \w, \b).
    return re.search(word, text, flags=re.IGNORECASE) is not None

In [None]:
# Use function if required to clean text
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Each run of consecutive matching characters is replaced by the empty
    string; all other characters, including surrounding whitespace, are
    left untouched.

    Args:
        string: Text to clean.

    Returns:
        The text without the characters listed in the pattern below.
    """
    # A single span from U+24C2 to U+1F251 would also swallow CJK
    # ideographs, kana, Hangul and many other scripts, so the emoji-bearing
    # blocks inside that span are listed individually instead.
    # NOTE: blocks above U+1F6FF (e.g. U+1F900-U+1F9FF) are not covered.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled M
        "\U000025A0-\U000025FF"  # geometric shapes
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002934-\U00002935"  # curved arrows
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U00003030\U0000303D"   # wavy dash, part alternation mark
        "\U00003297\U00003299"   # circled ideographs used as emoji
        "\U0000FE0F"             # variation selector-16 (emoji presentation)
        "\U0001F000-\U0001F0FF"  # mahjong tiles, dominoes, playing cards
        "\U0001F100-\U0001F1DF"  # enclosed alphanumerics
        "\U0001F200-\U0001F251"  # enclosed ideographs
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub("", string)

In [None]:
# Function to remove mentions, URLs, punctuation (including the '#' of
# hashtags), standalone RT/cc markers and surplus whitespace
def clean_tweet(tweet):
    """Return ``tweet`` reduced to space-separated ASCII alphanumeric words.

    Every match of the noise pattern is replaced by a space, then the
    result is split on whitespace and re-joined with single spaces, which
    also trims the ends.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text; an empty string if nothing is left.
    """
    noise = (
        r"(@[A-Za-z0-9_]+)"     # @mentions; handles may contain underscores
        r"|([^0-9A-Za-z \t])"   # anything but ASCII alphanumerics, space, tab
        r"|(\w+://\S+)"         # URLs that carry an explicit scheme
        # Word boundaries keep "success" or "account" intact; only the
        # standalone retweet / carbon-copy markers are dropped.
        r"|(\b(?:RT|cc)\b)"
    )
    return ' '.join(re.sub(noise, " ", tweet).split())

In [None]:
# Wrapper that assigns a sentiment label to a tweet. The text is first passed
# through clean_tweet (hashtags, URLs, mentions, punctuations, RTs, whitespace)
# and then scored with textblob's sentiment method.

def get_tweet_sentiment(tweet):
    """Classify the sentiment of ``tweet`` from its TextBlob polarity.

    Returns:
        4 when the polarity is positive, 2 when it is exactly zero
        (neutral), and 0 otherwise (negative).
    """
    # Build the TextBlob from the cleaned text and read its polarity score
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity

    if polarity > 0:
        return 4  # positive
    if polarity == 0:
        return 2  # neutral
    return 0  # negative

In [None]:
# Function to perform stemming on texts
def stem_words(words):
    """Return the stem of every word in ``words``.

    Args:
        words: Iterable of word strings, e.g. the tokens of one text.

    Returns:
        A new list holding ``PorterStemmer().stem(word)`` for each input
        word, in the same order.
    """
    stemmer = PorterStemmer()  # not as aggressive
    return [stemmer.stem(word) for word in words]