In [12]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, FastText, KeyedVectors
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('eRisk_DP_Dataset.csv')
df

Unnamed: 0,ID,TITLE,DATE,INFO,TEXT,depression
0,subject1257,,2017-05-08 03:44:04,reddit post,Thank you for taking the time to answer with t...,1
1,subject1257,,2017-05-06 17:40:31,reddit post,"Ugh, I'm sorry Jina... We also got bad news re...",1
2,subject1257,,2017-05-06 17:33:43,reddit post,"Thanks, I'm just looking forward to answers, w...",1
3,subject1257,,2017-05-06 16:42:28,reddit post,That's what we're here for!! Enjoy :) I actual...,1
4,subject1257,,2017-05-06 16:02:41,reddit post,"Well, officially on CD 1 of cycle 5, 9 months ...",1
...,...,...,...,...,...,...
1076577,train_subject9974,Favorite SciFi ride?,2014-04-22 17:49:27,reddit post,What's your favorite vehicle of any kind from ...,0
1076578,train_subject9974,Why did Kane go from demon spawn to corporate ...,2014-04-22 17:35:18,reddit post,"Back in the day, Kane could barely speak. He w...",0
1076579,train_subject9974,Favorite comic book easter egg?,2014-04-20 15:21:57,reddit post,"In honor of easter, what's your favorite easte...",0
1076580,train_subject9974,Recommended Dark Horse reading?,2014-04-19 17:40:11,reddit post,"I've never really read many Dark Horse titles,...",0


# Pre-Processing

### Data Cleaning

In [3]:
import re
def clean_text(text):
    # Check if the input is a string
    if not isinstance(text, str):
        return ""  # or return text if you want to keep the original value
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove numbers and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra whitespace
    text = text.strip()
    return text

df['TEXT'] = df['TEXT'].apply(clean_text)
print(' .:. Data Cleaning Done .:.')

 .:. Data Cleaning Done .:.


### Data Transformation, Stemming and Tokenization


In [4]:
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Downloading stopwords and wordnet data
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def advanced_transform_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions (e.g., "don't" -> "do not")
    contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he's": "he is",
        "how'd": "how did",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'll": "i will",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'll": "it will",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "might've": "might have",
        "mightn't": "might not",
        "must've": "must have",
        "mustn't": "must not",
        "needn't": "need not",
        "shan't": "shall not",
        "she'd": "she would",
        "she'll": "she will",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "that'd": "that would",
        "that's": "that is",
        "there's": "there is",
        "they'd": "they would",
        "they'll": "they will",
        "they're": "they are",
        "they've": "they have",
        "wasn't": "was not",
        "we'd": "we would",
        "we'll": "we will",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what're": "what are",
        "what's": "what is",
        "when's": "when is",
        "where'd": "where did",
        "where's": "where is",
        "who'll": "who will",
        "who's": "who is",
        "won't": "will not",
        "wouldn't": "would not",
        "you'd": "you would",
        "you'll": "you will",
        "you're": "you are",
        "you've": "you have"
    }
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    
    # Remove punctuations
    text = re.sub(r'\p{P}+', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenization
    tokens = text.split()
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    
    # Reconstruct the text
    text = ' '.join(tokens)
    
    return text

df['TEXT'] = df['TEXT'].apply(advanced_transform_text)



print(' .:. Data Transformation, Stemming and Tokenization are Done .:.')
#This function includes:
#Lowercasing
#Expanding contractions
#Removing URLs
#Removing punctuations
#Removing numbers
#Tokenization
#Removing stopwords
#Lemmatization
# can further expand or modify this function based on the specific requirements of your dataset and problem.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hossein.glm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hossein.glm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


 .:. Data Transformation, Stemming and Tokenization are Done .:.


# Feature Extraction

In [5]:

# Load the LIWC dataset
DP_df = pd.read_csv('LIWC-Dictionary.csv')
liwc_corpus = DP_df['DicTerm'].dropna().tolist()  # Drop any NaN values and convert to a list

# Initialize and fit the tokenizer
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(liwc_corpus)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(liwc_corpus)

# Convert sequences back to words for Word2Vec and FastText training
sentences = [[tokenizer.index_word[idx] for idx in seq] for seq in sequences]

# Tokenize the text data in the DataFrame
df['tokens'] = [word_tokenize(text) for text in df['TEXT']]

# Load Google's pre-trained Word2Vec model.
# model_w2v_pretrained = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Load Facebook's pre-trained FastText model (assuming you have the .vec file).
model_ft_pretrained = KeyedVectors.load_word2vec_format('Facebook-crawl-300d-2M.vec')

# Train Word2Vec and FastText from scratch using the LIWC text corpus
# model_w2v_trained = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)
model_ft_trained = FastText(sentences, vector_size=300, window=3, min_count=1, workers=4)


# Convert tokens to vectors
def get_vector(tokens, model):
    if isinstance(model, KeyedVectors):  # Pre-trained models
        vector_list = [model[word] for word in tokens if word in model.key_to_index]
    else:  # Trained from scratch models
        vector_list = [model.wv[word] for word in tokens if word in model.wv.key_to_index]
    return np.mean(vector_list, axis=0) if vector_list else np.zeros(300)  # Assuming vector_size=300

# Assuming df is your main dataframe with a 'tokens' column
# Using pre-trained models
# df['w2v_pretrained'] = df['tokens'].apply(lambda x: get_vector(x, model_w2v_pretrained))
df['ft_pretrained'] = df['tokens'].apply(lambda x: get_vector(x, model_ft_pretrained))

# Using models trained from scratch
# df['w2v_trained'] = df['tokens'].apply(lambda x: get_vector(x, model_w2v_trained))
df['ft_trained'] = df['tokens'].apply(lambda x: get_vector(x, model_ft_trained))



print(".:. Feature Extraction is Done .:.")


.:. Feature Extraction is Done .:.


# Handle Imbalanced Data

In [6]:

# Display new class counts
# Convert the vector columns into a list of arrays
# X_w2v = np.array(df['w2v_pretrained'].tolist())
X_ft = np.array(df['ft_pretrained'].tolist())

# Stack the arrays horizontally
# X = np.hstack((X_w2v, X_ft))
X = X_ft
# Now, you can resample using SMOTE
smote = SMOTE(random_state=101)
y = df['depression'].values
X_res, y_res = smote.fit_resample(X, y)

print ('Done....')

Done....


# Model

In [7]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=101)

In [8]:
# Define the model without the embedding layer
model = Sequential()
model.add(Bidirectional(LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1))))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [9]:
# Reshape data for LSTM
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Check for NaN or Infinite values
if np.any(np.isnan(X_train)) or np.any(np.isnan(X_test)):
    print("NaN values found in the dataset!")
if np.all(np.isfinite(X_train)) or np.all(np.isfinite(X_test)):
    print("Infinite values found in the dataset!")

Infinite values found in the dataset!


In [10]:
model.fit(X_train, y_train, epochs=2, batch_size=128, validation_data=(X_test, y_test))


Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x55e09d280>

# Evaluation

In [11]:
y_pred = model.predict(X_test)
y_pred = [1 if p > 0.5 else 0 for p in y_pred]



In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc, precision_recall_curve, roc_curve
import matplotlib.pyplot as plt

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Precision
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision:.4f}")

# Recall
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall:.4f}")

# F1-Score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1:.4f}")

# AUC-ROC
roc_auc = roc_auc_score(y_test, y_pred)
print(f"AUC-ROC: {roc_auc:.4f}")

# AUC-PR
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred)
auc_pr = auc(recall_curve, precision_curve)
print(f"AUC-PR: {auc_pr:.4f}")




Precision: 0.7387954334583888
Recall: 0.6801259955233102
F1 Score: 0.7082477851216425
ROC AUC Score: 0.719593806414802
