In [1]:
import pandas as pd

print("Loading training datasets...")
df_train_depression = pd.DataFrame(pd.read_json("../datasets/training/raw_dep_training_cut_s.jsonl", lines=True))
df_train_control = pd.DataFrame(pd.read_json("../datasets/training/raw_ctrl_training_cut_s.jsonl", lines=True))

print("Loading testing datasets...")
df_test_depression = pd.DataFrame(pd.read_json("../datasets/testing/raw_dep_testing_cut_s.jsonl", lines=True))
df_test_control = pd.DataFrame(pd.read_json("../datasets/testing/raw_ctrl_testing_cut_s.jsonl", lines=True))

print("Datasets succesfully loaded")

Loading training datasets...
Loading testing datasets...
Datasets succesfully loaded


In [2]:
df_train = df_train_depression.append(df_test_control, ignore_index=True)
df_test = df_test_depression.append(df_test_control, ignore_index=True)

del df_train_depression, df_train_control, df_test_depression, df_test_control

In [3]:
import numpy as np

df_train["title"] = np.where((df_train.title == "[removed]"),'', df_train.title)
df_train["selftext"] = np.where((df_train.selftext == "[removed]"),'', df_train.selftext)
df_test["title"] = np.where((df_test.title == "[removed]"),'', df_test.title)
df_test["selftext"] = np.where((df_test.selftext == "[removed]"),'', df_test.selftext)

df_train = df_train[df_train[["title", "selftext"]].ne('').all(axis=1)]
df_test = df_test[df_test[["title", "selftext"]].ne('').all(axis=1)]

df_train["text"] = df_train["title"] + " "  + df_train["selftext"]
df_test["text"] = df_test["title"] + " " + df_test["selftext"]

In [4]:
import datetime
import cleantext
import re
import swifter
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

pst = PorterStemmer()
stopwords_set = set(stopwords.words("english"))

def pre_process(text: str):
    processed = cleantext.clean(text, lower=True, fix_unicode=True, no_punct=True)

    pattern = re.compile(r'\b(' + r'|'.join(stopwords_set) + r')\b\s*')
    processed = pattern.sub('', processed)

    stemmed_words = [pst.stem(word) for word in word_tokenize(processed)]

    return " ".join(stemmed_words)

df_train["text"] = df_train["text"].swifter.apply(lambda x: pre_process(x))
df_test["text"] = df_test["text"].swifter.apply(lambda x: pre_process(x))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=9771.0, style=ProgressStyle(descriptio…





HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=2942.0, style=ProgressStyle(descriptio…

In [5]:
df_train = df_train[["text", "depression_related"]]
df_test = df_test[["text", "depression_related"]]

df_train.to_json(orient="records", lines=True, force_ascii=True,
                 path_or_buf="../datasets/training/proc_training.jsonl")
df_test.to_json(orient="records", lines=True, force_ascii=True,
                 path_or_buf="../datasets/testing/proc_testing.jsonl")

In [7]:
import numpy as np
import file_manager
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

file_manager.clear_path("./vectorizers")
n_grams = [(1, 1), (1, 2), (2, 2)]
for ngram_range in n_grams:
    formatted_range = str(ngram_range).strip("(").strip(")").replace(" ", "").replace(",", "_")
    file_manager.create_subdir("./vectorizers", formatted_range)

for ngram_range in n_grams:
    formatted_range = str(ngram_range).strip("(").strip(")").replace(" ", "").replace(",", "_")

    begin_time = datetime.datetime.now()
    cv = CountVectorizer(ngram_range=ngram_range)
    bow = cv.fit_transform(df_train.text.to_list())
    np.save("./vectorizers/{}/bow".format(formatted_range), bow)
    joblib.dump(cv, "./vectorizers/{}/bow_vect".format(formatted_range))

    print("Shape of BOW vector: {}".format(bow.shape))
    print("\tTotal elapsed time (BOW): {} for ngram_range {}".format(datetime.datetime.now() - begin_time, ngram_range))

    begin_time = datetime.datetime.now()
    tf_trans = TfidfTransformer()
    tfidf = tf_trans.fit_transform(bow)
    np.save("./vectorizers/{}/tfidf".format(formatted_range), bow)
    joblib.dump(tf_trans, "./vectorizers/{}/bow_vect".format(formatted_range))

    print("Shape of TFIDF vector: {}".format(tfidf.shape))
    print("\tTotal elapsed time (TFIDF): {} for ngram_range {}\n".format(datetime.datetime.now() - begin_time, ngram_range))


Shape of BOW vector: (9771, 22575)
	Total elapsed time (BOW): 0:00:01.213426 for ngram_range (1, 1)
Shape of TFIDF vector: (9771, 22575)
	Total elapsed time (TFIDF): 0:00:00.048870 for ngram_range (1, 1)

Shape of BOW vector: (9771, 404956)
	Total elapsed time (BOW): 0:00:10.318794 for ngram_range (1, 2)
Shape of TFIDF vector: (9771, 404956)
	Total elapsed time (TFIDF): 0:00:00.418879 for ngram_range (1, 2)

Shape of BOW vector: (9771, 382381)
	Total elapsed time (BOW): 0:00:10.703642 for ngram_range (2, 2)
Shape of TFIDF vector: (9771, 382381)
	Total elapsed time (TFIDF): 0:00:00.205451 for ngram_range (2, 2)

