In [None]:
import pandas as pd
import datetime as dt

# Load the reddit posts that are going to be preprocessed; the first CSV column is used as the index
df = pd.read_csv(
    'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates_personnal_corrected.csv',
    index_col=0,
)
# Print a summary of the loaded dataframe
df.info()

### Data Preprocessing

In [None]:
# Importing and downloading the necessary text processing tools
import re, string, unicodedata
import nltk
import contractions # expanding contractions
import inflect # natural language related tasks of generating plurals, singular nouns, etc.
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
import string


#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('omw-1.4')

In [None]:
import sys

# Adding the utils preprocess folder to the import path to be able to easily use the defined functions.
# The path is relative to the notebook's working directory. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path every time.
PREPROCESS_DIR = "../utils/preprocess"
if PREPROCESS_DIR not in sys.path:
    sys.path.append(PREPROCESS_DIR)

In [None]:
from preprocess import Preprocess

# Creating the preprocessor object: one shared Preprocess instance (class imported from the
# local `preprocess` module) whose methods preprocess_post calls step by step
prep = Preprocess()

In [None]:
# Defining functions to replace frequent text expressions in the text

def multiple_replacer(*key_values):
    """Build a function that performs several literal replacements in a single pass.

    Args:
        *key_values: ``(expression, replacement)`` pairs. Expressions are matched
            literally (they are regex-escaped). If the same expression is given
            more than once, the last replacement wins.

    Returns:
        A one-argument callable taking a string and returning it with every
        occurrence of an expression substituted by its replacement. When no
        pairs are given, the callable returns its input unchanged.
    """
    replace_dict = dict(key_values)
    if not replace_dict:
        # Joining zero alternatives yields an empty pattern, which matches the
        # empty string at every position and then fails the dictionary lookup.
        return lambda string: string

    # Regex alternation takes the first alternative that matches, so a key listed
    # after one of its own prefixes would never be chosen. Longest-first ordering
    # lets the most specific expression win.
    ordered_keys = sorted(replace_dict, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in ordered_keys))

    def replace(string):
        return pattern.sub(lambda match: replace_dict[match.group(0)], string)

    return replace

def multiple_replace(string, *key_values):
    """Apply a set of ``(expression, replacement)`` pairs to ``string`` in one call.

    Convenience wrapper: a replacer is built from ``key_values`` with
    ``multiple_replacer`` and applied to ``string`` straight away.

    Args:
        string: Text in which the expressions are replaced.
        *key_values: ``(expression, replacement)`` pairs forwarded to
            ``multiple_replacer``.

    Returns:
        The text produced by the replacer.
    """
    apply_replacements = multiple_replacer(*key_values)
    return apply_replacements(string)

# Usage example: the single (expression, replacement) pair below swaps
# 'value to replace' for 'replacement' in the given text
multiple_replace(
    'example: value to replace',
    ('value to replace', 'replacement'),
)

In [None]:
# Function to run the different preprocessing steps on a reddit post using the preprocessor object
def preprocess_post(post, replacements, stemming=True, numbers_processing='replace'):
    """Run a single reddit post through the ``prep`` pipeline.

    The post is passed, in order, through ``prep.replace_contractions``,
    ``replace_special_words``, ``replace_hashtags_URL_USER`` (URLs and mentions in
    "delete" mode, hashtags in "replace" mode), ``tokenize``, ``remove_punctuation``,
    ``preprocess_emojis``, ``preprocess_emoticons``, ``remove_non_ascii``,
    ``to_lowercase``, ``replace_numbers``, ``lemmatize_verbs`` and
    ``remove_stopwords``; then the optional expression replacements and the
    optional stemming are applied.

    Args:
        post: Text of the post (presumably a str; the ``prep`` methods are
            defined outside this notebook).
        replacements: Sequence of replacement *groups*. Each group is itself an
            iterable of ``(expression, replacement)`` pairs and is unpacked into
            ``multiple_replace``; groups are applied one after the other. A falsy
            value (``None``, empty sequence) disables this step, and empty groups
            are skipped.
        stemming: When true, the tokens are finally passed through
            ``prep.stem_words``.
        numbers_processing: Forwarded as ``mode`` to ``prep.replace_numbers``.

    Returns:
        The processed tokens: the output of the last ``prep`` step applied, or the
        list from ``str.split()`` when replacements ran and stemming is off.
    """
    post = prep.replace_contractions(post)
    post = prep.replace_special_words(post)
    post = prep.replace_hashtags_URL_USER(post, mode_URL="delete", mode_Mentions="delete", mode_Hashtag="replace")
    post = prep.tokenize(post)
    post = prep.remove_punctuation(post)
    post = prep.preprocess_emojis(post)
    post = prep.preprocess_emoticons(post)
    post = prep.remove_non_ascii(post)
    post = prep.to_lowercase(post)
    post = prep.replace_numbers(post, mode=numbers_processing)
    post = prep.lemmatize_verbs(post)
    post = prep.remove_stopwords(post)
    if replacements:
        # Expressions may span several tokens (e.g. 'view poll'), so the tokens are
        # joined into one text for the replacement and split again afterwards.
        text = " ".join(post)
        for group in replacements:
            # An empty group has nothing to replace; handing it to multiple_replace
            # would build an empty regex and fail on the replacement lookup.
            if group:
                text = multiple_replace(text, *group)
        post = text.split()
    if stemming:
        post = prep.stem_words(post)
    return post

# Function to run the different preprocessing steps on all of the posts
def preprocess_posts(posts, *replacements, stemming=True, numbers_processing='replace'):
    """Preprocess every non-empty post and return the cleaned texts.

    Args:
        posts: Iterable of posts. Falsy entries (e.g. ``""`` or ``None``) are
            skipped, so the result can be shorter than the input.
        *replacements: Replacement groups, each an iterable of
            ``(expression, replacement)`` pairs; passed on to ``preprocess_post``
            as a tuple of groups.
        stemming: Forwarded to ``preprocess_post``.
        numbers_processing: Forwarded to ``preprocess_post``.

    Returns:
        A list with one string per kept post: the tokens returned by
        ``preprocess_post`` joined with single spaces.
    """
    cleaned_posts = []
    for post in posts:
        if not post:
            continue
        tokens = preprocess_post(post, replacements, stemming, numbers_processing)
        cleaned_posts.append(" ".join(tokens))
    return cleaned_posts

In [None]:
# Executing the preprocessing steps and saving the results in a new column
replacements = [(u"view poll", u""), (u"httpurl", u"")]


def _clean_or_missing(post):
    """Preprocess one post (no stemming, numbers deleted); return None when it has no text."""
    if not isinstance(post, str) or not post:
        return None
    return preprocess_posts([post], replacements, stemming=False, numbers_processing='delete')[0]


# The posts are cleaned row by row so the result always lines up with the dataframe.
# preprocess_posts skips empty posts, so feeding it the whole column could return fewer
# items than df has rows and the column assignment would fail on the length mismatch.
# Missing values (NaN) are truthy and would otherwise be handed to the text pipeline
# as floats. Rows without text get None here, which the following dropna step removes.
df['long_covid_related_text_unstemmed_without_numbers'] = df['concatenated_sentences'].map(_clean_or_missing)

In [None]:
# Dropping NA values and nearly empty text after the preprocessing.
# Only `subset` is passed to dropna: axis=0 and how='any' are the defaults, and recent
# pandas versions refuse `how` and `thresh` being given together.
df.dropna(subset=['long_covid_related_text_unstemmed_without_numbers'], inplace=True)
# Rows whose cleaned text is shorter than 4 characters are treated as empty and removed
indices = df.index[df['long_covid_related_text_unstemmed_without_numbers'].str.len() < 4]
df.drop(indices, inplace=True)

In [None]:
# Write the preprocessed posts, together with the rest of their columns, to a new CSV file
df.to_csv(
    'reddit_posts_2022_07_21-10_16_58_AM_no_duplicates_personnal_concatenated_clean.csv',
)