## Data Pre-Processing

The following code removes emoticons, hyperlinks, and whitespace such as new lines and indentation.
It also tokenizes the text into words, removes slang words, and visualizes the pre-processed text data to show patterns.

In [None]:
def removeUnicode(text):
    r"""Strip literal ``\uXXXX`` escape sequences and all non-ASCII characters.

    Args:
        text: Input string.

    Returns:
        ``text`` without textual escapes made of a backslash, ``u`` and four
        hex digits (e.g. the six characters ``\u002c``), and without any
        character outside the ASCII range ``\x00``-``\x7f``.
    """
    # Exactly four hex digits: an open-ended ``+`` would also swallow
    # hex-looking letters that follow the escape (``I\u2019d`` -> ``I``).
    text = re.sub(r'\\u[0-9A-Fa-f]{4}', '', text)
    return re.sub(r'[^\x00-\x7f]', '', text)

def replaceURL(text):
    """Replace URLs with the token "url" and strip the '#' from hashtags.

    Args:
        text: Input string.

    Returns:
        ``text`` where every run of non-whitespace starting with ``www.``,
        ``http://`` or ``https://`` is replaced by ``url``, and every
        ``#word`` is replaced by ``word``.
    """
    # Raw strings: '\.' and '\s' are invalid escapes in a normal literal and
    # trigger a SyntaxWarning on recent Python versions.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'url', text)
    # Keep the hashtag's word, drop only the leading '#'.
    text = re.sub(r'#([^\s]+)', r'\1', text)
    return text

In [None]:
# Literal emoticons recognised by removeEmoticons() and countEmoticons().
_EMOTICONS = (
    ":)", ";)", ":-)", "(-:", ":-D", "=D", ":P", "xD", "X-p", "^^", ":-*",
    "^.^", "^-^", "^_^", ",-)", ")-:", ":'(", ":(", ":-(", ":S", "T.T",
    "._.", ":<", ":-S", ":-<", "*-*", ":O", "=O", "=-O", "O.o", "XO", "O_O",
    ":-@", "=/", ":/", "X-(", ">.<", ">=(", "D:",
)

# re.escape() keeps every emoticon literal. Written by hand, ':-*' was read
# as "':' followed by any number of '-'" and therefore matched every bare
# colon, and ':\S' matched a colon plus ANY non-space character instead of
# the emoticon ':S'. Longest alternatives go first so a short emoticon never
# pre-empts a longer one starting at the same position.
_EMOTICON_RE = re.compile(
    "|".join(re.escape(e) for e in sorted(_EMOTICONS, key=len, reverse=True))
)


def removeEmoticons(text):
    """Remove every known emoticon from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each occurrence of an entry of ``_EMOTICONS`` deleted.
        Matching is purely textual (no word boundaries), so an emoticon that
        appears inside a longer token is removed as well.
    """
    return _EMOTICON_RE.sub('', text)


def countEmoticons(text):
    """Count the known emoticons in ``text``.

    Args:
        text: Input string.

    Returns:
        Number of non-overlapping occurrences of entries of ``_EMOTICONS``;
        uses the same pattern as removeEmoticons().
    """
    return len(_EMOTICON_RE.findall(text))


In [None]:
nltk.download('stopwords')

# Shared state read and updated by tokenize().
finalTokens = []  # every kept token, accumulated across all tweets

# NLTK's English stop words plus extra project-specific ones (this includes
# the "url" placeholder that replaceURL() writes into the text).
my_stopwords = "multiexclamation multiquestion multistop url atuser st rd nd th am pm"
stoplist = stopwords.words('english') + my_stopwords.split()

# First letter of the POS tags to keep:
# J = adjective, R = adverb, V = verb, N = noun.
allowedWordTypes = ["J", "R", "V", "N"]

# One row per processed tweet: its ID and its preprocessed text.
preprocessed_data = pd.DataFrame(columns=['tweet_id', 'text'])

In [None]:
def tokenize(text, wordCountBefore, tweet_id):
    """Tokenize one tweet, keep its content words and record the result.

    Punctuation is stripped, the text is split with ``nltk.word_tokenize``
    and POS-tagged. A token is kept when the first letter of its tag is in
    ``allowedWordTypes`` and the token itself is not in ``stoplist``
    (the comparison is case-sensitive).

    Side effects:
        * Kept tokens are appended to the module-level ``finalTokens`` list.
        * A row ``[tweet_id, kept tokens joined by spaces]`` is appended to
          the module-level ``preprocessed_data`` DataFrame.

    Args:
        text: Text of a single tweet.
        wordCountBefore: Unused; kept so existing callers keep working.
        tweet_id: Identifier stored alongside the preprocessed text.

    Returns:
        The module-level ``finalTokens`` list, i.e. the tokens accumulated
        over ALL calls so far, not only those of this tweet.
    """
    # Technique 7: remove punctuation before tokenizing.
    stripped = text.translate(str.maketrans('', '', string.punctuation))

    # Technique 13: part-of-speech tagging of the punctuation-free tokens.
    tagged = nltk.pos_tag(nltk.word_tokenize(stripped))

    keptTokens = [
        word
        for word, tag in tagged
        if tag[0] in allowedWordTypes and word not in stoplist
    ]
    finalTokens.extend(keptTokens)

    # Store the preprocessed text and tweet ID in the DataFrame.
    preprocessed_data.loc[len(preprocessed_data)] = [tweet_id, " ".join(keptTokens)]

    return finalTokens

In [None]:
from time import time
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Retrieve all tweets from the database
query = "SELECT DISTINCT * FROM tweets"
df_tweets = pd.read_sql_query(query, conn)
#Tweet_id is column 0, text is column 3


t0 = time()
totalSentences = 0
totalEmoticons = 0
# The counters below are never updated in this cell; they are only reported
# (as zero / empty) by the statistics that follow.
totalSlangs = 0
totalSlangsFound = []
totalElongated = 0
totalMultiExclamationMarks = 0
totalMultiQuestionMarks = 0
totalMultiStopMarks = 0
totalAllCaps = 0

# Defined up front so the statistics after the loop still work when the
# query returns no rows (the loop body would never assign it).
tokens = []

# Iterate over each row in df_tweets
for index, row in df_tweets.iterrows():
    totalSentences += 1

    tweet_id = row['tweet_id']
    text = removeUnicode(row['text'])  # Technique 0
    # Chain on the cleaned text: passing row['text'] again would throw away
    # the result of removeUnicode().
    text = replaceURL(text)

    wordCountBefore = len(re.findall(r'\w+', text))  # word count of one sentence before preprocess

    emoticons = countEmoticons(text)  # how many emoticons in this sentence
    totalEmoticons += emoticons

    text = removeEmoticons(text)  # removes emoticons from text
    # tokenize() returns the cumulative token list across all tweets so far.
    tokens = tokenize(text, wordCountBefore, tweet_id)

    # print("Processed tweet:", tweet_id)  # Print the tweet ID after processing

# View the resulting preprocessed data
print(preprocessed_data.head())

# Split the raw corpus into words once instead of once per statistic.
wordsBeforePreprocess = re.findall(r'\w+', ' '.join(df_tweets['text']))
# Avoid a ZeroDivisionError in the averages when no tweet was processed.
sentenceDivisor = totalSentences or 1

print("Total sentences: ",                          totalSentences,"\n")
print("Total Words before preprocess: ",            len(wordsBeforePreprocess))
print("Total Distinct Tokens before preprocess: ",  len(set(wordsBeforePreprocess)))
print("Average word/sentence before preprocess: ",  len(wordsBeforePreprocess) / sentenceDivisor, "\n")
# `tokens` holds the tokens kept across all tweets (see tokenize()).
print("Total Words after preprocess: ",             len(tokens))
print("Total Distinct Tokens after preprocess: ",   len(set(tokens)))
print("Average word/sentence after preprocess: ",   len(tokens) / sentenceDivisor,"\n")


print("Total run time: ",                           time() - t0," seconds\n")

print("Total emoticons: ",                          totalEmoticons,"\n")
print("Total slangs: ",                             totalSlangs,"\n")
# commonSlangs = nltk.FreqDist(totalSlangsFound)
# for (word, count) in commonSlangs.most_common(20): # most common slangs across all texts
#     print(word,"\t",count)
# commonSlangs.plot(20, cumulative=False) # plot most common slangs

print("Total elongated words: ",                    totalElongated,"\n")
print("Total multi exclamation marks: ",            totalMultiExclamationMarks)
print("Total multi question marks: ",               totalMultiQuestionMarks)
print("Total multi stop marks: ",                   totalMultiStopMarks,"\n")
print("Total all capitalized words: ",              totalAllCaps,"\n")

#print(tokens)
# Frequency distribution over every token kept by the preprocessing step.
commonWords = nltk.FreqDist(tokens)
print("Most common words ")
print("Word\tCount")
for word, count in commonWords.most_common(100):  # 100 most frequent tokens
    print(word,"\t",count)

# Wide figure so the 100 labels on the frequency plot stay readable.
plt.figure(figsize=(20, 8))
commonWords.plot(100, cumulative=False)

# --- Bigram collocations: ranked by PMI, seen at least 5 times ---
bgm = nltk.collocations.BigramAssocMeasures()
bgm_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
bgm_finder.apply_freq_filter(5)
print("Most common collocations (bigrams)")
print(bgm_finder.nbest(bgm.pmi, 50))  # top 50

# --- Trigram collocations: ranked by PMI, seen at least 5 times ---
tgm = nltk.collocations.TrigramAssocMeasures()
tgm_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
tgm_finder.apply_freq_filter(5)
print("Most common collocations (trigrams)")
print(tgm_finder.nbest(tgm.pmi, 20))  # top 20

In [None]:
# Attach each tweet's preprocessed text to its original row (left join on
# tweet_id). Both frames have a 'text' column, so the suffixes produce
# 'text_original' and 'text_preprocessed'. Rows that repeat the same
# (tweet_id, text_preprocessed) pair are then dropped.
df_tweets_preprocessed = (
    df_tweets
    .merge(preprocessed_data, on='tweet_id', how='left', suffixes=('_original', '_preprocessed'))
    .drop_duplicates(subset=['tweet_id', 'text_preprocessed'])
)

In [None]:
# Print a summary of the merged frame (columns, dtypes, non-null counts).
df_tweets_preprocessed.info()
# Preview the first rows; as the last expression of the notebook cell its
# value is what the notebook displays.
df_tweets_preprocessed.head()