In [39]:
# Notebook switches.
# reset_variables: when True, the last cell runs `%reset -f` to clear the namespace.
reset_variables = False
# generate_wordcloud: when True, the per-class training frames are kept and
# preprocessed alongside the joined ones, and the vectorising step renders
# word-cloud images instead of saving the vectorised matrices to disk.
generate_wordcloud = True

In [40]:
import pandas as pd

# Paths of the raw datasets to be preprocessed (JSON Lines: one record per line).
# pd.read_json(..., lines=True) already returns a DataFrame, so the result is
# used directly instead of being wrapped in another pd.DataFrame(...) call.

print("Loading training datasets...")
df_train_depression = pd.read_json("../datasets/training/raw_dep_authors_training_cut.jsonl", lines=True)
df_train_control = pd.read_json("../datasets/training/raw_ctrl_authors_training_cut.jsonl", lines=True)

print("Loading testing datasets...")
df_test_depression = pd.read_json("../datasets/testing/raw_dep_authors_testing_cut.jsonl", lines=True)
df_test_control = pd.read_json("../datasets/testing/raw_ctrl_authors_testing_cut.jsonl", lines=True)

print("Datasets succesfully loaded")

Loading training datasets...
Loading testing datasets...
Datasets succesfully loaded


In [41]:
# Join the depression and control frames of each split into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat with ignore_index=True produces the same freshly numbered result.
df_train = pd.concat([df_train_depression, df_train_control], ignore_index=True)
df_test = pd.concat([df_test_depression, df_test_control], ignore_index=True)

if not generate_wordcloud:
    # The per-class frames are only needed for the word clouds; drop them to free memory
    del df_train_depression, df_train_control, df_test_depression, df_test_control

In [42]:
import numpy as np

def _clean_posts(posts):
    """
    Blank out reddit's '[removed]' placeholder, drop incomplete rows and build
    the combined ``text`` column.

    :param posts: pd.DataFrame - frame with ``title`` and ``selftext`` columns
    :return: pd.DataFrame - a new frame (the input is left untouched) holding only
             the rows whose title AND selftext are both non-empty, plus a
             ``text`` column equal to ``title + " " + selftext``
    """
    # Substitute reddit's keyword '[removed]' with an empty string
    blanked = posts.assign(
        title=np.where(posts["title"] == "[removed]", '', posts["title"]),
        selftext=np.where(posts["selftext"] == "[removed]", '', posts["selftext"]),
    )

    # Keep only rows where both fields contain text.
    # NOTE(review): the previous comment read "remove rows with no text in title
    # neither selftext", which would be `.any(axis=1)`; the original `.all`
    # behaviour is preserved here - confirm which one is intended.
    has_both = blanked[["title", "selftext"]].ne('').all(axis=1)

    # .assign returns a fresh frame, so adding the column does not write into a
    # filtered slice (which is what raised SettingWithCopyWarning before)
    return blanked.loc[has_both].assign(
        text=lambda kept: kept["title"] + " " + kept["selftext"]
    )


df_train = _clean_posts(df_train)
df_test = _clean_posts(df_test)

if generate_wordcloud:
    # The per-class training frames get the same treatment for the word clouds
    df_train_depression = _clean_posts(df_train_depression)
    df_train_control = _clean_posts(df_train_control)

In [43]:
import datetime
import cleantext
import re
import swifter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Stemmer instance shared by every pre_process call
pst = PorterStemmer()

# Load nltk's English stop-word list, downloading the corpus if it is missing
try:
    stopwords_set = set(stopwords.words("english"))
except LookupError:
    import nltk
    nltk.download("stopwords")
    stopwords_set = set(stopwords.words("english"))

# One regex that matches any stop-word (plus trailing whitespace).
# The words are escaped so they are always treated literally, and ordered
# longest-first (ties alphabetical) so the alternation is identical on every
# run and a short entry can never win over a longer one sharing its prefix.
_ordered_stopwords = sorted(stopwords_set, key=lambda word: (-len(word), word))
pattern = re.compile(r'\b(' + r'|'.join(re.escape(word) for word in _ordered_stopwords) + r')\b\s*')

def pre_process(text: str):
    """
    Clean a raw post and return it as a string of space-separated stems.

    Steps, in order:

    1) cleantext.clean with lower=True, fix_unicode=True, no_punct=True and
       no_urls=True (lowercasing, unicode fixing, punctuation and URL handling)
    2) Removal of every match of the module-level stop-word ``pattern``
    3) Tokenisation with nltk's word_tokenize
    4) Stemming of each token with the shared Porter stemmer ``pst``

    :param text: str - the string to be processed
    :return: str - the stemmed tokens joined by single spaces
    """

    def _stem_tokens(sentence):
        # Tokenise, then stem token by token
        return [pst.stem(token) for token in word_tokenize(sentence)]

    # Normalisation delegated to cleantext
    cleaned = cleantext.clean(text, lower=True, fix_unicode=True, no_punct=True, no_urls=True)

    # Strip stop-words
    without_stopwords = pattern.sub('', cleaned)

    # Stem; if nltk reports a missing resource, download "punkt" and retry once
    try:
        stems = _stem_tokens(without_stopwords)
    except LookupError:
        import nltk
        nltk.download("punkt")
        stems = _stem_tokens(without_stopwords)

    return " ".join(stems)

# Frames whose "text" column has to be cleaned; the per-class training frames
# are only included when word clouds are requested
frames_to_process = [df_train, df_test]
if generate_wordcloud:
    frames_to_process.extend([df_train_depression, df_train_control])

# Run pre_process over every text through swifter, one frame at a time
for frame in frames_to_process:
    frame["text"] = frame["text"].swifter.apply(pre_process)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=13838.0, style=ProgressStyle(descripti…







HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2968.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=7512.0, style=ProgressStyle(descriptio…

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=6326.0, style=ProgressStyle(descriptio…

In [44]:
# Keep only the processed text and its label
kept_columns = ["text", "depression_related"]
df_train = df_train[kept_columns]
df_test = df_test[kept_columns]

if generate_wordcloud:
    df_train_depression = df_train_depression[kept_columns]
    df_train_control = df_train_control[kept_columns]

# Persist the processed splits as JSON Lines (one ASCII-escaped record per line)
export_targets = (
    (df_train, "../datasets/training/proc_training.jsonl"),
    (df_test, "../datasets/testing/proc_testing.jsonl"),
)
for frame, destination in export_targets:
    frame.to_json(path_or_buf=destination, orient="records", lines=True, force_ascii=True)

In [45]:

import joblib
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from wordcloud import WordCloud

def _column_totals(matrix):
    """Sum a sparse document-term matrix over its rows without densifying it."""
    return np.asarray(matrix.sum(axis=0)).ravel()


def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary in column order, whatever the scikit-learn version."""
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name only when needed
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


def vectorize(ngram, step_name, train, test):
    """
    Build bag-of-words and TF-IDF representations of ``train.text``.

    What happens with the result depends on the module-level
    ``generate_wordcloud`` flag:

    * True  - a 100-word cloud of the summed term weights is written to
              ``./img/`` for each representation; ``test`` is not used.
    * False - ``test.text`` is transformed with the fitted objects, and the
              train/test matrices plus the fitted vectorizer/transformer are
              saved under ``./vectorizers/``.

    :param ngram: tuple - n-gram range handed to CountVectorizer
    :param step_name: str - suffix used in every output file name
    :param train: frame with a ``text`` column, used to fit the vectorizers
    :param test: frame with a ``text`` column, only transformed
    :return: None
    """
    # Bag-of-words
    begin_time = datetime.datetime.now()
    cv = CountVectorizer(ngram_range=ngram, max_features=10000)
    train_bow = cv.fit_transform(train.text.to_list())

    if generate_wordcloud:
        freqs = dict(zip(_vocabulary_terms(cv), _column_totals(train_bow)))
        wordcloud = WordCloud(background_color="white", max_words=100).generate_from_frequencies(freqs)
        wordcloud.to_file("./img/depression_bow_{}.png".format(step_name))
    else:
        test_bow = cv.transform(test.text.tolist())
        # Save both the train and test arrays generated and the vectorizer used
        sparse.save_npz("./vectorizers/train_bow_{}.npz".format(step_name), train_bow)
        sparse.save_npz("./vectorizers/test_bow_{}.npz".format(step_name), test_bow)
        joblib.dump(cv, "./vectorizers/vect_bow_{}".format(step_name))

    print("Shape of BOW vector: {}".format(train_bow.shape))
    # Report the range this call received, not whatever a global loop variable holds
    print("\tTotal elapsed time (BOW): {} for ngram_range {}".format(datetime.datetime.now() - begin_time, ngram))

    # Term frequency - inverse document frequency
    begin_time = datetime.datetime.now()
    tf_trans = TfidfTransformer()
    train_tfidf = tf_trans.fit_transform(train_bow)

    if generate_wordcloud:
        freqs = dict(zip(_vocabulary_terms(cv), _column_totals(train_tfidf)))
        wordcloud = WordCloud(background_color="white", max_words=100).generate_from_frequencies(freqs)
        wordcloud.to_file("./img/reference_tfidf_{}.png".format(step_name))
    else:
        test_tfidf = tf_trans.transform(test_bow)
        # Save both the train and test arrays generated and the transformer used
        sparse.save_npz("./vectorizers/train_tfidf_{}.npz".format(step_name), train_tfidf)
        sparse.save_npz("./vectorizers/test_tfidf_{}.npz".format(step_name), test_tfidf)
        joblib.dump(tf_trans, "./vectorizers/vect_tfidf_{}".format(step_name))

    print("Shape of TFIDF vector: {}".format(train_tfidf.shape))
    print("\tTotal elapsed time (TFIDF): {} for ngram_range {}\n".format(datetime.datetime.now() - begin_time, ngram))

In [46]:
import file_manager

# Clear both output folders before writing anything new
for output_dir in ("./vectorizers/", "./img/"):
    file_manager.clear_path(output_dir)

# n_grams[0] holds the n-gram ranges to use; n_grams[1] receives the matching
# file-name suffix for each of them, e.g. (1, 2) -> "1_2"
n_grams = [[(1, 1), (1, 2), (2, 2)], []]
for ngram_range in n_grams[0]:
    formatted_range = "_".join(str(bound) for bound in ngram_range)
    n_grams[1].append(formatted_range)

# The loop variable keeps the name ngram_range on purpose: it is module-level
# state that code outside this cell may read.
# NOTE(review): with generate_wordcloud enabled the second call reuses the same
# step name, so it writes to the same image files as the first - confirm intent.
for ngram_range, step_suffix in zip(n_grams[0], n_grams[1]):
    vectorize(ngram_range, step_suffix, df_train, df_test)
    if generate_wordcloud:
        vectorize(ngram_range, step_suffix, df_train_depression, df_train_depression)

Shape of BOW vector: (13838, 10000)
	Total elapsed time (BOW): 0:00:01.232580 for ngram_range (1, 1)
Shape of TFIDF vector: (13838, 10000)
	Total elapsed time (TFIDF): 0:00:00.600098 for ngram_range (1, 1)

Shape of BOW vector: (7512, 10000)
	Total elapsed time (BOW): 0:00:00.771939 for ngram_range (1, 1)
Shape of TFIDF vector: (7512, 10000)
	Total elapsed time (TFIDF): 0:00:00.419431 for ngram_range (1, 1)

Shape of BOW vector: (13838, 10000)
	Total elapsed time (BOW): 0:00:03.369066 for ngram_range (1, 2)
Shape of TFIDF vector: (13838, 10000)
	Total elapsed time (TFIDF): 0:00:00.636576 for ngram_range (1, 2)

Shape of BOW vector: (7512, 10000)
	Total elapsed time (BOW): 0:00:01.929534 for ngram_range (1, 2)
Shape of TFIDF vector: (7512, 10000)
	Total elapsed time (TFIDF): 0:00:00.430164 for ngram_range (1, 2)

Shape of BOW vector: (13838, 10000)
	Total elapsed time (BOW): 0:00:02.963905 for ngram_range (2, 2)
Shape of TFIDF vector: (13838, 10000)
	Total elapsed time (TFIDF): 0:00:00.

In [47]:
# Store the label column of each split next to the vectorised data
label_targets = {
    "./vectorizers/labels_train": df_train,
    "./vectorizers/labels_test": df_test,
}
for target_path, frame in label_targets.items():
    np.save(target_path, frame["depression_related"])

In [48]:
# Optionally clear the kernel namespace once everything has been written to disk.
# `%reset -f` is an IPython magic, so this cell only runs inside IPython/Jupyter.
if reset_variables:
    %reset -f