### data prep

In [None]:
from preprocessing.loader import ResultsLoader, TextLoader, AudioLoader, FaceLoader

# One loader per data source, created in the same order as before.
results_loader, text_loader = ResultsLoader(), TextLoader()
audio_loader, face_loader = AudioLoader(), FaceLoader()

# Sampling settings passed to each loader's get_data() call.
# NOTE(review): the earlier comment described this as "100% of total data",
# yet the value is 0 -- confirm how the loaders interpret `percentage`.
random_state = 42
percentage = 0


In [None]:
# Fetch the results table first, then the text features, using the same
# sampling settings for both loaders.
df_result, df_text = (
    loader.get_data(percentage=percentage, random_state=random_state)
    for loader in (results_loader, text_loader)
)

# Show the text features, then the results, in the notebook output.
display(df_text, df_result)

### preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# Lazily filled cache so the stop-word set and lemmatizer are built once,
# not once per row when text_preprocessing is applied to a whole column.
_TEXT_PREP_RESOURCES = {}


def _text_prep_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_PREP_RESOURCES:
        _TEXT_PREP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_PREP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_PREP_RESOURCES['stop_words'], _TEXT_PREP_RESOURCES['lemmatizer']


def text_preprocessing(text):
    """Remove English stop words from *text* and lemmatize the remaining words.

    Args:
        text: String to clean; it is split on whitespace.

    Returns:
        The surviving words, each passed through the WordNet lemmatizer and
        re-joined with single spaces.
    """
    stop_words, lemmatizer = _text_prep_resources()
    # NLTK's English stop-word list is lowercase, so test the lowered word;
    # otherwise capitalised stop words such as "The" or "I" slip through.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word.lower() not in stop_words
    )

# Clean every transcript in place: the column is overwritten with its
# preprocessed version.
df_text['TRANSCRIPT_text'] = df_text['TRANSCRIPT_text'].map(text_preprocessing)

# Show the frame after preprocessing.
display(df_text)

### EDA

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud

# Concatenate all transcripts into one string; `text` is reused by the
# token-frequency cell further down.
text = " ".join(df_text['TRANSCRIPT_text'])

# Build the word cloud from the combined text.
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate(text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.gca().set_axis_off()
plt.show()


In [None]:
import seaborn as sns

# Character count of each transcript. Despite the `X_train` prefix, this is
# computed over every row of df_text, not over a training split.
X_train_lengths = df_text['TRANSCRIPT_text'].map(len)

# Histogram with a KDE overlay of the per-transcript lengths.
plt.figure(figsize=(10, 5))
sns.histplot(data=X_train_lengths, kde=True)
plt.gca().set(
    title='Distribution of Text Lengths',
    xlabel='Length of Text',
    ylabel='Frequency',
)
plt.show()


In [None]:
from collections import Counter
import pandas as pd

# Tokenize the combined transcript text (`text` comes from the word-cloud
# cell) and keep the 20 most frequent tokens.
# NOTE(review): recent NLTK releases load the tokenizer from the 'punkt_tab'
# resource -- confirm 'punkt' is sufficient for the installed version.
nltk.download('punkt')
all_tokens = nltk.word_tokenize(text)
common_tokens = Counter(all_tokens).most_common(20)

# One row per (token, count) pair, in descending count order.
tokens_df = pd.DataFrame.from_records(common_tokens, columns=['Token', 'Count'])

# Bar chart of the token counts, with tilted x labels.
plt.figure(figsize=(10, 5))
sns.barplot(x='Token', y='Count', data=tokens_df)
plt.gca().set(title='Most Common Tokens', xlabel='Token', ylabel='Count')
plt.xticks(rotation=45)
plt.show()


### train test split

In [None]:
from sklearn.model_selection import train_test_split

# Join the transcripts with their labels; rows are matched on the "ID" column.
df = pd.merge(df_text, df_result, on="ID")

# Hold out 20% of the rows for evaluation. The seed is the notebook-wide
# `random_state` set alongside the loaders; the previously used `rand_seed`
# is never defined, so the call raised NameError.
X_train, X_test, y_train, y_test = train_test_split(
    df['TRANSCRIPT_text'],
    df['PHQ_Binary'],
    test_size=0.2,
    random_state=random_state,
)

### pipeline & hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# TF-IDF features (unigrams and bigrams, English stop words removed) feeding
# a random forest. The forest is seeded with the notebook-wide `random_state`;
# the previously used `rand_seed` is never defined and raised NameError.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), stop_words='english')),
    ('clf', RandomForestClassifier(random_state=random_state)),
])

# Hyperparameter grid; keys use the `<step>__<param>` pipeline convention.
param_grid = {
    'tfidf__max_df': [0.75, 1.0],
    'tfidf__min_df': [1, 2],
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by
# accuracy and run on all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)



In [None]:
# Once the grid search has been fitted, write its best estimator to
# 'text_model.joblib' in the current working directory.
import joblib
joblib.dump(value=grid_search.best_estimator_, filename='text_model.joblib')

### evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Predict labels for the held-out transcripts with the fitted grid search.
y_pred = grid_search.predict(X_test)

# Convert the labels to a NumPy array only while y_test is still a pandas
# object. An unconditional .to_numpy() raises AttributeError when this cell is
# re-run, because y_test has by then been rebound to an ndarray.
if hasattr(y_test, 'to_numpy'):
    y_test = y_test.to_numpy()

# Debug: print the predicted and true label arrays side by side.
print(f'y_pred: {y_pred}')
print(f'y_test: {y_test}')

# Generate and display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and display the confusion matrix, with rows/columns ordered by the
# class labels the fitted search exposes.
cm = confusion_matrix(y_test, y_pred, labels=grid_search.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()

