In [13]:
import umap.umap_ as umap
from sklearn.mixture import GaussianMixture

import numpy as np 
import pandas as pd 

In [14]:
def data_prep(dataset):
    
    features = pd.read_csv(f"bert-training-features-{dataset}.csv", delimiter=",", header=None)
    labels = pd.read_csv(f"data/{dataset}/train.csv", delimiter=",")
    if dataset == "reddit":
        labels = labels.dropna()
        labels = labels["is_stress"]
        labels = labels.iloc[:5952]
    elif dataset == "dreaddit":
        labels = labels["label"]
        labels = labels.iloc[:2816]
    return (features, labels)
    


In [15]:
def reduce(features):
    reducer = umap.UMAP(
        n_neighbors=45, 
        min_dist=0.7, 
        n_components=2, 
        metric='manhattan'
    )

    low_dim_features = reducer.fit_transform(features)
    return low_dim_features

In [16]:
def GMM(low_dim_features_train, low_dim_features_test):
    gmm = GaussianMixture(n_components=2, covariance_type="full").fit(low_dim_features_train)
    predictions = gmm.predict(low_dim_features_test)
    probs = gmm.predict_proba(low_dim_features_test)
    return (predictions, probs)

In [17]:
def accuracy(predictions, labels):
    assert predictions.shape == labels.shape, f"shape is not equal predictions: {predictions.shape}, labels: {labels.shape}"
    return (predictions==labels).sum() / len(predictions)

In [18]:
features_reddit, labels_reddit = data_prep("reddit")
features_dreaddit, labels_dreaddit = data_prep("dreaddit")

In [19]:
low_dim_features_reddit = reduce(features_reddit)
low_dim_features_dreaddit = reduce(features_dreaddit)

In [20]:
# Experiment # 01
# Train on reddit and test on reddit 
pred_reddit, prob_reddit = GMM(low_dim_features_reddit, low_dim_features_reddit)
acc_reddit = accuracy(pred_reddit, labels_reddit)
acc_reddit

0.5581317204301075

In [21]:
# Experiment # 02 
# Train on reddit and test on dreaddit 
pred_red_dr, prob_red_dr = GMM(low_dim_features_reddit, low_dim_features_dreaddit)
acc_red_dr = accuracy(pred_red_dr, labels_dreaddit)
acc_red_dr

0.4758522727272727

In [22]:
# Experiment # 03 
# Train on dreaddit and test on dreaddit 
pred_dr, prob_dr = GMM(low_dim_features_dreaddit, low_dim_features_dreaddit)
acc_dr = accuracy(pred_dr, labels_dreaddit)
acc_dr

0.5944602272727273

In [23]:
# Experiment # 04 
# train on dreaddit and test on reddit 
pred_dr_rd, prob_dr_rd = GMM(low_dim_features_dreaddit, low_dim_features_reddit)
acc_dr_rd = accuracy(pred_dr_rd, labels_reddit)
acc_dr_rd

0.582997311827957

In [26]:
# the models trained on reddit data seem to perform the best
# save the probabilities and prediction in a csv file 
np.savetxt("reddit-trained-pred.csv", pred_reddit, delimiter=',')
np.savetxt("reddit-trained-prob.csv", prob_reddit, delimiter=',')
np.savetxt("dreaddit-trained-pred.csv", pred_dr_rd, delimiter=',')
np.savetxt("dreaddit-trained-prob.csv", prob_dr_rd, delimiter=',')
