### 3. Implementation Using N-Grams

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv("data/preprocessing/mental_health_preprocessed.csv")

# remove rows that have null values in the "text_lemma" column
df = df.dropna(subset=['text_lemma'])

# Split dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)


# Define TF-IDF vectorizer for n-grams (bigrams and trigrams)
vectorizer = TfidfVectorizer(ngram_range=(2, 3), stop_words="english", max_features=100)
X_train_tfidf = vectorizer.fit_transform(train_df["text_lemma"])
X_test_tfidf = vectorizer.transform(test_df["text_lemma"])

# Extract dictionary from each corpus using TF-IDF n-grams
dictionary_0 = set(vectorizer.get_feature_names_out())  # Full vocabulary of n-grams

# Save the dictionary for future use
with open("data/dictionaries/dictionary_tfidf_ngrams.txt", "w") as f:
    f.write("\n".join(dictionary_0))

print("N-gram Dictionaries extracted and saved.")

# Separate texts based on label
texts_train_0 = train_df[train_df["label"] == 0]["text_lemma"].tolist()
texts_train_1 = train_df[train_df["label"] == 1]["text_lemma"].tolist()

# Apply TF-IDF separately for each group using n-grams
vectorizer_0 = TfidfVectorizer(ngram_range=(2, 3), stop_words="english", max_features=100)
tfidf_0 = vectorizer_0.fit_transform(texts_train_0)
ngrams_0 = vectorizer_0.get_feature_names_out()
scores_0 = np.asarray(tfidf_0.mean(axis=0)).flatten()

vectorizer_1 = TfidfVectorizer(ngram_range=(2, 3), stop_words="english", max_features=100)
tfidf_1 = vectorizer_1.fit_transform(texts_train_1)
ngrams_1 = vectorizer_1.get_feature_names_out()
scores_1 = np.asarray(tfidf_1.mean(axis=0)).flatten()

# Create DataFrames with extracted n-grams and scores
df_tfidf_0 = pd.DataFrame({"ngram": ngrams_0, "score": scores_0}).sort_values(by="score", ascending=False)
df_tfidf_1 = pd.DataFrame({"ngram": ngrams_1, "score": scores_1}).sort_values(by="score", ascending=False)

# Compute relative importance of n-grams (frequency ratio between classes)
ngram_weights = {}
for ngram in set(ngrams_1):
    freq_1 = df_tfidf_1[df_tfidf_1["ngram"] == ngram]["score"].values[0] if ngram in df_tfidf_1["ngram"].values else 0
    freq_0 = df_tfidf_0[df_tfidf_0["ngram"] == ngram]["score"].values[0] if ngram in df_tfidf_0["ngram"].values else 0
    ngram_weights[ngram] = freq_1 / (freq_0 + 1e-6)  # Avoid division by zero

# Sort n-grams by importance (higher values = more relevant to mentally ill class)
filtered_ngrams = sorted(ngram_weights.items(), key=lambda x: x[1], reverse=True)[:100]  # Keep top 100 n-grams

# Save filtered n-gram dictionary
with open("data/dictionaries/filtered_dictionary_ngrams.txt", "w") as f:
    f.write("\n".join([ngram for ngram, score in filtered_ngrams]))

print("Filtered n-gram dictionary generated.")

df_filtered_ngrams = pd.DataFrame(filtered_ngrams, columns=["Word", "Importance Score"])
df_filtered_ngrams

N-gram Dictionaries extracted and saved.
Filtered n-gram dictionary generated.


Unnamed: 0,Word,Importance Score
0,want die,43103.206047
1,want kill,24208.852731
2,suicidal thought,21820.077927
3,ve try,20665.049534
4,want end,19782.049870
...,...,...
95,people think,1.642011
96,like know,1.545719
97,people like,1.344910
98,look like,0.819315


In [8]:
# Extract dictionary from each corpus using TF-IDF n-grams
dictionary_0 = set(df_tfidf_0["ngram"])  # Healthy n-grams
dictionary_1 = set(df_tfidf_1["ngram"])  # Mentally ill n-grams

# Save the dictionaries for future use
with open("data/dictionaries/dictionary_healthy_ngrams.txt", "w") as f:
    f.write("\n".join(dictionary_0))

with open("data/dictionaries/dictionary_mentally_ill_ngrams.txt", "w") as f:
    f.write("\n".join(dictionary_1))

print("N-gram dictionaries extracted and saved.")

# Validate using Logistic Regression
vectorizer_filtered = TfidfVectorizer(vocabulary=[ngram for ngram, _ in filtered_ngrams])
X_train_filtered = vectorizer_filtered.fit_transform(train_df["text_lemma"])
X_test_filtered = vectorizer_filtered.transform(test_df["text_lemma"])

# Train logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_filtered, train_df["label"])

# Make predictions
y_pred = model.predict(X_test_filtered)

# Evaluate performance
accuracy = accuracy_score(test_df["label"], y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

# Display confirmation
print("Training and test sets created successfully.")

N-gram dictionaries extracted and saved.
Logistic Regression Accuracy: 0.5055
Training and test sets created successfully.
