# Text-based Depression Detection Model


In [None]:
from collections import Counter
from functools import lru_cache

import joblib
import nltk
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from wordcloud import WordCloud

from preprocessing.loader_results import ResultsLoader
from preprocessing.loader_text import TextLoader

# Notebook-wide constants
RANDOM_SEED = 42  # seed shared by the loaders, the train/test split and the classifier
# Passed as `percentage` to both loaders.
# NOTE(review): the original comment says 0 means "100% of the data" -- confirm against the loaders.
DATA_PERCENTAGE = 0
FIGURE_SIZE = (15, 8)  # figsize used for the EDA plots

# Search space for GridSearchCV; keys follow the `<pipeline step>__<parameter>` form
PARAM_GRID = dict(
    tfidf__max_df=[0.75, 1.0],
    tfidf__min_df=[1, 2],
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
)


## Data preparation

In [None]:
# Build the two dataset loaders
results_loader, text_loader = ResultsLoader(), TextLoader()

# Query both loaders with identical settings: results first, then the text features
df_result, df_text = (
    loader.get_data(percentage=DATA_PERCENTAGE, random_state=RANDOM_SEED)
    for loader in (results_loader, text_loader)
)

# Preview the text frame, then the results frame
display(df_text)
display(df_result)

### Preprocessing

In [None]:

# Fetch the NLTK corpora used by text_preprocessing: stop words and WordNet
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)


@lru_cache(maxsize=None)
def _preprocessing_resources():
    """Load the English stop-word set and the lemmatizer once and reuse them."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def text_preprocessing(text):
    """Remove English stop words from ``text`` and lemmatize the remaining tokens.

    Args:
        text: Transcript string; it is tokenized with ``str.split()``.

    Returns:
        The kept tokens, lemmatized and joined with single spaces.
    """
    # The stop-word corpus and the lemmatizer are identical for every row, so
    # they are built once instead of on each call made through ``.apply``.
    stop_words, lemmatizer = _preprocessing_resources()
    # NLTK's stop-word list is lowercase; compare case-insensitively so that
    # capitalised stop words ("I", "The") are filtered as well.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word.lower() not in stop_words
    )


# Clean every transcript in place, then preview the updated frame
df_text['TRANSCRIPT_text'] = df_text['TRANSCRIPT_text'].map(text_preprocessing)

display(df_text)

### EDA

In [None]:

# Word cloud built from all transcripts concatenated into one string
text = " ".join(df_text['TRANSCRIPT_text'])
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(text)

# Draw the cloud as an image on a single axes with the axis decorations hidden
fig, ax = plt.subplots(figsize=FIGURE_SIZE)
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:

# Distribution of transcript lengths: len() of each transcript entry
# (a character count, assuming the entries are strings)
X_train_lengths = df_text['TRANSCRIPT_text'].map(len)

fig, ax = plt.subplots(figsize=FIGURE_SIZE)
sns.histplot(X_train_lengths, kde=True, ax=ax)
ax.set_title('Distribution of Text Lengths')
ax.set_xlabel('Length of Text')
ax.set_ylabel('Frequency')
plt.show()


In [None]:

# Most common tokens
# word_tokenize needs the Punkt tokenizer data. NLTK 3.9 and later look it up
# as 'punkt_tab', older releases as 'punkt'; request both so the cell runs on
# either.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

# Rebuild the corpus string here so this cell does not depend on the `text`
# variable created in the word-cloud cell having been run first.
corpus_text = " ".join(df_text['TRANSCRIPT_text'])
all_tokens = nltk.word_tokenize(corpus_text)
common_tokens = Counter(all_tokens).most_common(20)

# One row per token, already ordered from most to least frequent
tokens_df = pd.DataFrame(common_tokens, columns=['Token', 'Count'])

plt.figure(figsize=FIGURE_SIZE)
sns.barplot(data=tokens_df, x='Token', y='Count')
plt.title('Most Common Tokens')
plt.xlabel('Token')
plt.ylabel('Count')
plt.xticks(rotation=45)  # tilt the token labels so they do not overlap
plt.show()


### Train/test split

In [None]:

# Attach the labels to the transcripts by joining on the shared "ID" column
df = pd.merge(df_text, df_result, on="ID")

# Hold out 20% for testing. Stratify on the binary label so the train and test
# sets keep the same class ratio; an unstratified split of a small or
# imbalanced dataset can leave the test set with a skewed label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    df['TRANSCRIPT_text'],
    df['PHQ_Binary'],
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df['PHQ_Binary'],
)

### Pipeline & hyperparameter tuning

In [None]:

# Two-step pipeline: TF-IDF over unigrams and bigrams, then a random forest
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
forest_step = RandomForestClassifier(random_state=RANDOM_SEED)
pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', forest_step)])

# Grid search over PARAM_GRID with 5-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=PARAM_GRID,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)



In [None]:
# Once the grid search has been fitted, persist the best estimator it found
best_pipeline = grid_search.best_estimator_
joblib.dump(best_pipeline, 'text_model.joblib')

### Evaluation

In [None]:

y_pred = grid_search.predict(X_test)

# Convert the labels to a NumPy array only while they are still a pandas
# object. An unconditional `y_test.to_numpy()` rebinds `y_test` to an ndarray,
# so running this cell a second time would raise AttributeError.
if hasattr(y_test, 'to_numpy'):
    y_test = y_test.to_numpy()

# Debug: print the predicted and the true labels
print(f'y_pred: {y_pred}')
print(f'y_test: {y_test}')

# Generate and display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and display confusion matrix; rows/columns follow the class order
# exposed by the fitted grid search
cm = confusion_matrix(y_test, y_pred, labels=grid_search.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

