In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re,os, glob, traceback, nltk
from collections import defaultdict
import torch.optim as optim
import torch.nn as nn
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score
import logging, sys
from datetime import datetime
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
logging.basicConfig(filename=f'log/jp_log_{timestamp}.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [5]:
my_personality_file = 'data/mypersonality.csv'
main_df = pd.read_csv(my_personality_file, encoding='Windows-1252')
# main_df.drop(columns=['#AUTHID', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'DATE', 'NETWORKSIZE', 'BETWEENNESS', 'NBETWEENNESS', 'DENSITY', 'BROKERAGE', 'NBROKERAGE','TRANSITIVITY'], inplace=True)
main_df[['cOPN', 'cEXT', 'cNEU', 'cAGR', 'cCON']] = main_df[['cOPN', 'cEXT', 'cNEU', 'cAGR', 'cCON']].replace({'y': 1, 'n': 0})
logging.info(f'File={my_personality_file} shape={main_df.shape}')
main_df.head(1)

Unnamed: 0,#AUTHID,STATUS,sEXT,sNEU,sAGR,sCON,sOPN,cEXT,cNEU,cAGR,cCON,cOPN,DATE,NETWORKSIZE,BETWEENNESS,NBETWEENNESS,DENSITY,BROKERAGE,NBROKERAGE,TRANSITIVITY
0,b7b7764cfa1c523e4e93ab2a79a946c4,likes the sound of thunder.,2.65,3.0,3.15,3.25,4.4,0,1,0,0,1,06/19/09 03:21 PM,180.0,14861.6,93.29,0.03,15661.0,0.49,0.1


In [8]:
liwc_file = 'data/LIWC_mypersonality_oct_2.csv'
liwc_df = pd.read_csv(liwc_file)
liwc_df = liwc_df.drop(['Unnamed: 0', 'ColumnID', 'Text'], axis=1)
# liwc_df.fillna(value=np.nan, inplace=True)
# numerical_columns = liwc_df.select_dtypes(include=[np.number]).columns
# liwc_df[numerical_columns] = liwc_df[numerical_columns].fillna(liwc_df[numerical_columns].mean())
# liwc_df[numerical_columns] = liwc_df[numerical_columns].fillna(0)
# liwc_df.fillna(value=0, inplace=True)
logging.info(f'File={liwc_file} shape={liwc_df.shape}')
liwc_df.shape

(9917, 120)

In [10]:
# df = pd.merge(main_df, liwc_df, on="#AUTHID", how="left")
df = pd.concat([main_df, liwc_df], axis=1)
df = df.drop(['#AUTHID'], axis=1)
str(list(df.columns))
# df.dropna(inplace=True)
# df.head(1)

Unnamed: 0,STATUS,cEXT,cNEU,cAGR,cCON,cOPN,Segment,WC,Analytic,Clout,...,nonflu,filler,AllPunc,Period,Comma,QMark,Exclam,Apostro,OtherP,Emoji
0,likes the sound of thunder.,0,1,0,0,1,1,5,99.0,,...,0.0,0.0,20.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
def read_data(main_file, liwc_file, is_mypersonality=True):
    if is_mypersonality:
        main_df = pd.read_csv(main_file, encoding='Windows-1252')
        main_df.drop(columns=['#AUTHID', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'DATE', 'NETWORKSIZE', 'BETWEENNESS', 'NBETWEENNESS', 'DENSITY', 'BROKERAGE', 'NBROKERAGE','TRANSITIVITY'], inplace=True)
        main_df[['cOPN', 'cEXT', 'cNEU', 'cAGR', 'cCON']] = main_df[['cOPN', 'cEXT', 'cNEU', 'cAGR', 'cCON']].replace({'y': 1, 'n': 0})
    else:
        main_df = pd.read_csv(main_file)
        cols = ['cOPN', 'cEXT', 'cNEU', 'cAGR', 'cCON']
        for col in cols:
            mean_value = main_df[col].mean()
            main_df[f'{col}'] = main_df[col] > mean_value
        main_df[cols] = main_df[cols].replace({True: 1, False: 0})  
    logging.info(f'File={main_file} shape={main_df.shape}')

    liwc_df = pd.read_csv(liwc_file)
    liwc_df = liwc_df.drop(['Unnamed: 0', 'ColumnID', 'STATUS'] if is_mypersonality else ['Unnamed: 0', 'STATUS'], axis=1)
    logging.info(f'File={liwc_file} shape={liwc_df.shape}')

    df = pd.concat([main_df, liwc_df], axis=1)
    df = df.drop(['#AUTHID'], axis=1)
    logging.info(f'Final shape after concat={df.shape}')
    return df

# fb_df = read_data('data/mypersonality.csv', 'data/LIWC_mypersonality_oct_2.csv')
df = read_data('data/pandora_to_big5.csv', 'data/LIWC_pandora_to_big5_oct_24.csv', False)
# df = pd.concat([fb_df, rd_df], ignore_index=True)

logging.info(f'Combined shape ={df.shape}')
logging.info(40*'-')

In [3]:
nrc_vad = pd.read_csv('data/NRC-VAD-Lexicon/NRC-VAD-Lexicon.csv', sep="\t")  
nrc_vad_dict = nrc_vad.set_index('Word').to_dict(orient='index')
def get_vad_scores(text):
    words = text.split()
    valence_scores, arousal_scores, dominance_scores = [], [], []
    for word in words:
        word = word.lower()  # Lowercase to match the lexicon
        if word in nrc_vad_dict:
            vad_values = nrc_vad_dict[word]
            valence_scores.append(vad_values['Valence'])
            arousal_scores.append(vad_values['Arousal'])
            dominance_scores.append(vad_values['Dominance'])
    if not valence_scores:
        return {'Valence': 0, 'Arousal': 0, 'Dominance': 0}

    valence_avg = sum(valence_scores) / len(valence_scores)
    arousal_avg = sum(arousal_scores) / len(arousal_scores)
    dominance_avg = sum(dominance_scores) / len(dominance_scores)
    return {'Valence': valence_avg, 'Arousal': arousal_avg, 'Dominance': dominance_avg}

df['VAD_Scores'] = df['STATUS'].apply( lambda x: get_vad_scores(x))
df[['Valence', 'Arousal', 'Dominance']] = pd.DataFrame(df['VAD_Scores'].tolist(), index=df.index)
df.drop(columns=['VAD_Scores'], inplace=True)
logging.info(f'NRC-VAD shape={df.shape}')

In [4]:
nrc_lexicon = pd.read_csv('data/NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt', names=["word", "emotion", "association"],sep="\t", header=None)
# Filter out words that have no association with emotions (association == 0)
nrc_lexicon = nrc_lexicon[nrc_lexicon['association'] == 1]
# nrc_lexicon.drop(columns=['association'], inplace=True)
nrc_pivot = nrc_lexicon.pivot(index="word", columns="emotion", values="association").fillna(0).astype(int)
# nrc_pivot.head(2)
nltk.download('punkt')
def get_emotion_counts(text, lexicon):
    # print(text)
    words = nltk.word_tokenize(text.lower())
    emotion_count = defaultdict(int)
    for word in words:
        if word in lexicon.index:
            for emotion in lexicon.columns:
                emotion_count[emotion] += lexicon.loc[word, emotion]
    return emotion_count
emotion_counts_list = df['STATUS'].apply(lambda x: get_emotion_counts(x, nrc_pivot))
emotion_counts_df = pd.DataFrame(emotion_counts_list.tolist())
emotion_counts_df.fillna(0, inplace=True)
emotion_counts_df = emotion_counts_df.astype(int)
df = pd.concat([df, emotion_counts_df], axis=1)
logging.info(f'NRC-Emotion shape={df.shape}')

[nltk_data] Downloading package punkt to /home/jmaharja/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
def find_sentiment(text):
    # print(text)
    vs = analyzer.polarity_scores(text)
    sc = vs['compound']
    # emo = 'pos' if sc >= 0.05 else 'neu' if -0.05 < sc < 0.05 else 'neg'
    return sc
df[['sent_score']] = df['STATUS'].apply(lambda x: pd.Series(find_sentiment(x)))
logging.info(f'VADER shape={df.shape}')

In [7]:
df = df[:100]

In [11]:
# #TFIDF
# from sklearn.feature_extraction.text import TfidfVectorizer
# # Step 1: Initialize the TfidfVectorizer
# # You can specify parameters like max_features, ngram_range, etc., based on your needs
# tfidf = TfidfVectorizer(max_features=100, stop_words='english')  # Adjust max_features as necessary

# # Step 2: Fit and transform the 'STATUS' column
# # This step converts the text in 'STATUS' to TF-IDF features
# tfidf_matrix = tfidf.fit_transform(df['STATUS'].astype(str))  # Ensure 'STATUS' column is in string format

# # Step 3: Convert the TF-IDF matrix into a DataFrame
# # The resulting matrix is sparse, so we'll convert it to a DataFrame with feature names
# tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

# # Step 4: Optionally, merge the TF-IDF features back with your original DataFrame
# # This will add the new TF-IDF feature columns to your existing DataFrame
# df = pd.concat([df, tfidf_df], axis=1)
# df.head(1)

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def get_embeddings(df, model_name, batch_size=8):
    logging.info(f'Embeding : {model_name}')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to(device)
    model.eval()  
    embeddings_list = []
    
    for i in range(0, len(df), batch_size):
        batch_texts = df['STATUS'][i:i + batch_size].tolist()
        inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings_list.append(cls_embeddings.cpu().numpy())
    return np.vstack(embeddings_list)
    
# bert_embeddings = get_embeddings(df, 'bert-base-uncased', batch_size=2)
roberta_embeddings = get_embeddings(df, 'roberta-base', batch_size=2)
# berttweet_embeddings = get_embeddings(df, 'vinai/bertweet-base', batch_size=2)
# xlnet_embeddings = get_embeddings(df, 'xlnet-base-cased', batch_size=2)
# df['bert_embeddings'] = list(bert_embeddings)
df['roberta_embeddings'] = list(roberta_embeddings)
# df['berttweet_embeddings'] = list(berttweet_embeddings)
# df['xlnet_embeddings'] = list(xlnet_embeddings)
# df[['STATUS', 'bert_embeddings', 'roberta_embeddings', 'berttweet_embeddings', 'xlnet_embeddings']].head(2)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['roberta_embeddings'] = list(roberta_embeddings)


In [11]:
all_cols = df.columns
remove_cols = ['STATUS', 'cEXT','cNEU', 'cAGR', 'cCON', 'cOPN']
emb_cols = ['bert_embeddings', 'berttweet_embeddings', 'xlnet_embeddings', 'roberta_embeddings']
stat_cols = list (set(all_cols) - set(remove_cols) - set(emb_cols))
scaler = StandardScaler() 
stat_features = df[stat_cols]
stat_features_scaled = scaler.fit_transform(stat_features)
# stat_features_scaled

In [12]:
# roberta_embeddings
# np.array(roberta_embeddings.tolist())
X = np.concatenate([stat_features_scaled, roberta_embeddings], axis=1)
X


array([[-0.19006328, -0.12640104,  1.24846165, ..., -0.10689668,
        -0.03087542, -0.05612235],
       [-0.19006328, -0.12640104, -0.37582433, ..., -0.0694667 ,
        -0.03203625, -0.02442368],
       [-0.19006328, -0.12640104, -0.75957689, ..., -0.09310248,
        -0.05732121, -0.07783641],
       ...,
       [-0.19006328, -0.02101195, -0.88349698, ..., -0.09929476,
        -0.03069092, -0.02944294],
       [-0.19006328, -0.12640104, -1.08603305, ..., -0.04976954,
        -0.02862299, -0.02082449],
       [-0.19006328, -0.12640104, -0.83719351, ..., -0.09859692,
        -0.0528679 , -0.08392557]])

In [15]:
# filename = 'data/my_personality_all_embs.csv'
# df.to_csv(filename)
# logging.info(f'saving to {filename}')
# # df = pd.read_csv("data/my_personality_all_embs.csv")

In [13]:
dff = df.copy()
# dff.fillna(value=np.nan, inplace=True)
# numerical_columns = dff.select_dtypes(include=[np.number]).columns
# dff[numerical_columns] = dff[numerical_columns].fillna(dff[numerical_columns].mean())
dff.fillna(value=0, inplace=True)
logging.info(f'Total shape={dff.shape}')

In [14]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_rate) 
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

class DotProductAttention(nn.Module):
    def __init__(self, hidden_dim):
        super(DotProductAttention, self).__init__()
    def forward(self, x):
        query = x[:, -1:, :]  # Shape: (batch_size, 1, hidden_dim * 2)
        scores = torch.bmm(query, x.transpose(1, 2))  # Shape: (batch_size, 1, seq_len)
        attention_weights = torch.softmax(scores, dim=-1)  # Shape: (batch_size, seq_len, 1)
        context_vector = torch.bmm(attention_weights, x)  # Shape: (batch_size, 1, hidden_dim * 2)
        return context_vector, attention_weights

class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, bidirectional=True, do_attention=True, dropout_rate=0.5):
        super(BiLSTMClassifier, self).__init__()
        self.do_attention = do_attention
        self.attention = DotProductAttention(hidden_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.layer_norm1 = nn.LayerNorm(input_dim) 
        self.layer_norm2 = nn.LayerNorm(hidden_dim * 2)  
        self.dropout = nn.Dropout(dropout_rate)  
    def forward(self, x):
        if len(x.size()) == 2:
            x = x.unsqueeze(1)  
        if self.do_attention:
            context_vector, attention_weights = self.attention(x)
            context_vector = self.layer_norm1(context_vector)
        else:
            context_vector = x
        lstm_output, _ = self.lstm(context_vector)     
        lstm_output = self.layer_norm2(lstm_output)
        last_hidden_state = lstm_output[:, -1, :]  # Shape: (batch_size, hidden_dim * 2)
        last_hidden_state = self.dropout(last_hidden_state)
        output = self.fc(last_hidden_state)  
        return output

# # Test model Initialize model
# model = BiLSTMClassifier(input_dim=768, hidden_dim=128, output_dim=5, num_layers=2)
# input_data = torch.randn(2, 5, 768)  # Example input (batch_size=32, seq_len=50, input_dim=768)
# output= model(input_data)
# print(output.shape)  # Expected output: (batch_size, output_dim)

def train_val_dl_models(model, train_loader, val_loader, max_grad_norm=1.0, epochs=16, lr=0.001):
    logging.info(f'{model.__class__.__name__}')
    criterion = nn.BCEWithLogitsLoss()  
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for t, labels in train_loader:
            optimizer.zero_grad()  
            outputs= model(t)
            loss = criterion(outputs, labels)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        model.eval()  
        val_preds, val_labels, val_loss = [], [], 0
        with torch.no_grad():  
            for inputs, labels in val_loader:
                outputs= model(inputs)
                val_loss += criterion(outputs, labels).item()  
                val_preds.append(torch.sigmoid(outputs))  
                val_labels.append(labels) 
        val_preds = torch.cat(val_preds)
        val_labels = torch.cat(val_labels)
        val_preds = (val_preds > 0.5).float() 
        val_accuracy = accuracy_score(val_labels.numpy(), val_preds.numpy())
        if epoch % 4 == 0:
            logging.info(f'Epoch [{epoch + 1}/{epochs}], Train Loss: {total_loss / len(train_loader):.4f}, 'f'Val Loss: {val_loss / len(val_loader):.4f}, Val Accuracy: {val_accuracy:.4f}')
    return val_accuracy


In [19]:
class My_training_class:
    def __init__(self, dff):
        self.dff = dff
        # self.output_df = pd.DataFrame()
     
    def prepare_dataset(self, embedding_type, target_col):
        all_cols = self.dff.columns
        remove_cols = ['STATUS', 'cEXT','cNEU', 'cAGR', 'cCON', 'cOPN']
        emb_cols = ['bert_embeddings', 'berttweet_embeddings', 'xlnet_embeddings', 'roberta_embeddings']
        stat_cols = list (set(all_cols) - set(remove_cols) - set(emb_cols))
        if embedding_type:
            contextual_embeddings = np.array(self.dff[embedding_type].tolist())
        scaler = StandardScaler() 
        stat_features = self.dff[stat_cols]
        stat_features_scaled = scaler.fit_transform(stat_features)
        X = np.concatenate([stat_features_scaled, contextual_embeddings] if embedding_type else [stat_features_scaled], axis=1)
        y = np.array(self.dff[[target_col]]) 
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(X, y, test_size=0.1, random_state=42)
        print(self.X_train.shape, self.y_train.shape)
        self.input_dim = self.X_train.shape[1]
        # X_test, X_val, y_test, y_val = train_test_split(X_val, y_val, test_size=0.5, random_state=42)
        # logging.info(f'Train  size: {X_train.shape}, Val size: {X_val.shape}, Test  size: {X_test.shape}')
        X_train_tensor = torch.tensor(self.X_train, dtype=torch.float32)
        y_train_tensor = torch.tensor(self.y_train, dtype=torch.float32)
        X_val_tensor = torch.tensor(self.X_val, dtype=torch.float32)
        y_val_tensor = torch.tensor(self.y_val, dtype=torch.float32)
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
        self.train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        self.val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

    def init_models(self):
        self.svm_model = SVC(kernel='linear')
        self.lr_model = LogisticRegression(solver='lbfgs', max_iter=1000)
        self.rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
        self.bilstm_model = BiLSTMClassifier(input_dim=self.input_dim, hidden_dim=128, output_dim=1, num_layers=2, bidirectional=True, do_attention=True, dropout_rate=0.5)
        self.mlp_model = MLP(input_size=self.input_dim, hidden_size=128, output_size=1, dropout_rate=0.3)
    
    def fit_validate_and_generate_acc_scr(self):
        self.svm_model.fit(self.X_train, self.y_train)
        self.lr_model.fit(self.X_train, self.y_train)
        self.rf_model.fit(self.X_train, self.y_train)
        self.xgb_model.fit(self.X_train, self.y_train)
        mlp_acc = train_val_dl_models(self.mlp_model, self.train_loader, self.val_loader)
        bilstm_acc = train_val_dl_models(self.bilstm_model, self.train_loader, self.val_loader)

        svm_y_pred = self.svm_model.predict(self.X_val)
        lr_y_pred = self.lr_model.predict(self.X_val)
        rf_y_pred = self.rf_model.predict(self.X_val)
        xgb_y_pred = self.xgb_model.predict(self.X_val)
        svm_accuracy = accuracy_score(self.y_val, svm_y_pred)
        lr_accuracy = accuracy_score(self.y_val, lr_y_pred)
        rf_accuracy = accuracy_score(self.y_val, rf_y_pred)
        xgb_accuracy = accuracy_score(self.y_val, xgb_y_pred)

        logging.info(f'SVM Val Acc: {svm_accuracy:.2f}')
        logging.info(f'LR Val Acc: {lr_accuracy:.2f}')
        logging.info(f'RF Val Acc: {rf_accuracy:.2f}')
        logging.info(f'SGBoost Val Acc: {xgb_accuracy:.2f}')
        logging.info(f'MLP Val Acc: {mlp_acc:.2f}')
        logging.info(f'BiLSTM Val Acc: {bilstm_acc:.2f}')

    # def test_model():
    #     model.eval()
    #     X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    #     with torch.no_grad():  
    #         outputs = model(X_test_tensor)
    #     outputs = torch.sigmoid(outputs)
    #     preds = (outputs > 0.5).float()
    #     preds_np = preds.numpy()
    #     y_test_np = y_test  # If y_test is already in numpy format, otherwise y_test.numpy()
    #     accuracy = accuracy_score(y_test_np, preds_np)
    #     precision = precision_score(y_test_np, preds_np)
    #     recall = recall_score(y_test_np, preds_np)
    #     f1 = f1_score(y_test_np, preds_np)
            # Print metrics
            # print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    def train_all_models(self,embedding_type):
        logging.info(f'Training started ::')
        for target_cols in ['cOPN', 'cCON', 'cEXT', 'cAGR', 'cNEU']:
            logging.info(f'Trait: {target_cols}')
            self.prepare_dataset(embedding_type, target_cols)
            self.init_models()
            self.fit_validate_and_generate_acc_scr()

In [20]:
my_train = My_training_class(dff)
my_train.train_all_models('roberta_embeddings')

ValueError: y should be a 1d array, got an array of shape (90, 2) instead.

In [104]:
import torch.nn as nn
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.5):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_rate) 
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc3(x)
        return x

In [None]:
# class BiLSTMClsAttn(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim, num_layers, bidirectional=True, do_attention=True, dropout_rate=0.5):
#         super(BiLSTMClsAttn, self).__init__()
#         self.do_attention = do_attention
#         self.attention = DotProductAttention(hidden_dim)
#         self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, bidirectional=bidirectional, batch_first=True, dropout=dropout_rate)
#         self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
#         self.layer_norm1 = nn.LayerNorm(input_dim) 
#         self.layer_norm2 = nn.LayerNorm(hidden_dim * 2)  
#         self.dropout = nn.Dropout(dropout_rate)  

#     def forward(self, x):
#         if len(x.size()) == 2:
#             x = x.unsqueeze(1)  
#         lstm_output, _ = self.lstm(x)     
#         lstm_output = self.layer_norm2(lstm_output)
#         last_hidden_state = lstm_output[:, -1, :]  # Shape: (batch_size, hidden_dim * 2)
#         last_hidden_state = self.dropout(last_hidden_state)
#         context_vector, attention_weights = self.attention(last_hidden_state)
#         context_vector = self.layer_norm1(context_vector)
#         output = self.fc(context_vector)  
#         return output

# model = BiLSTMClsAttn(input_dim=768, hidden_dim=128, output_dim=1, num_layers=2)
# input_data = torch.randn(2, 5, 768)  # Example input (batch_size=32, seq_len=50, input_dim=768)
# output= model(input_data)
# print(output.shape)  # Expected output: (batch_size, output_dim)

In [None]:
# model = BiLSTMClassifier(input_dim=X_train.shape[1], hidden_dim=128, output_dim=1, num_layers=2, bidirectional=True, do_attention=True, dropout_rate=0.5)
# # model = MLP(input_size=X_train.shape[1], hidden_size=128, output_size=1, dropout_rate=0.3)
# criterion = nn.BCEWithLogitsLoss()  # For multi-label classification
# optimizer = optim.Adam(model.parameters(), lr=0.003)
# num_epochs = 16
# max_grad_norm=1.0
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0.0
#     for t, labels in train_loader:
#         optimizer.zero_grad()  
#         outputs= model(t)
#         # print(outputs.shape, labels.shape, t.shape)
#         loss = criterion(outputs, labels)
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#         loss.backward()
#         optimizer.step()
#         total_loss += loss.item()
#     # Validation phase
#     model.eval()  
#     val_preds = []
#     val_labels = []
#     val_loss = 0
#     with torch.no_grad():  
#         for inputs, labels in val_loader:
#             outputs= model(inputs)
#             val_loss += criterion(outputs, labels).item()  
#             val_preds.append(torch.sigmoid(outputs))  
#             val_labels.append(labels) 
#     val_preds = torch.cat(val_preds)
#     val_labels = torch.cat(val_labels)
#     val_preds = (val_preds > 0.5).float() 
#     val_accuracy = accuracy_score(val_labels.numpy(), val_preds.numpy())
#     print(f'Epoch [{epoch + 1}/{num_epochs}], '
#           f'Train Loss: {total_loss / len(train_loader):.4f}, '
#           f'Validation Loss: {val_loss / len(val_loader):.4f}, '
#           f'Validation Accuracy: {val_accuracy:.4f}')

In [284]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
model.eval()
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
with torch.no_grad():  
    outputs = model(X_test_tensor)
outputs = torch.sigmoid(outputs)
preds = (outputs > 0.5).float()
preds_np = preds.numpy()
y_test_np = y_test  # If y_test is already in numpy format, otherwise y_test.numpy()
accuracy = accuracy_score(y_test_np, preds_np)
precision = precision_score(y_test_np, preds_np)
recall = recall_score(y_test_np, preds_np)
f1 = f1_score(y_test_np, preds_np)

# Print metrics
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Accuracy: 0.7248, Precision: 0.6162, Recall: 0.5351, F1 Score: 0.5728


## Ensemble Training

In [32]:
train_df, test_df = train_test_split(dff, test_size=0.1, random_state=42)
print(f'Train set size: {train_df.shape}')
print(f'Test set size: {test_df.shape}')all_cols = dff.columns

# label_cols = ['cEXT','cNEU', 'cAGR', 'cCON', 'cOPN']
label_cols = ["cCON"]
remove_cols = ['#AUTHID', 'STATUS', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'cEXT','cNEU', 'cAGR', 'cCON', 'cOPN', 'DATE']
emb_cols = ['bert_embeddings', 'berttweet_embeddings', 'xlnet_embeddings', 'roberta_embeddings']
stat_cols = list (set(all_cols) - set(remove_cols) - set(emb_cols))

stat_features = train_df[stat_cols]
bert_embeddings = np.array(train_df["bert_embeddings"].tolist())
roberta_embeddings = np.array(train_df["roberta_embeddings"].tolist())

scaler = StandardScaler()  
stat_features_scaled = scaler.fit_transform(stat_features)
X1 = np.concatenate([stat_features_scaled, bert_embeddings], axis=1)
X2 = np.concatenate([stat_features_scaled, roberta_embeddings], axis=1)
y = np.array(train_df[label_cols]) 

# Split data into train+val and test sets (80% train+val, 20% test)
X1_train, X1_val, y1_train, y1_val = train_test_split(X1, y, test_size=0.1, random_state=42)
X2_train, X2_val, y2_train, y2_val = train_test_split(X2, y, test_size=0.1, random_state=42)

train_dataset1 = TensorDataset(torch.tensor(X1_train, dtype=torch.float32), torch.tensor(y1_train, dtype=torch.float32))
val_dataset1 = TensorDataset(torch.tensor(X1_val, dtype=torch.float32), torch.tensor(y1_val, dtype=torch.float32))
train_dataset2 = TensorDataset(torch.tensor(X2_train, dtype=torch.float32), torch.tensor(y2_train, dtype=torch.float32))
val_dataset2 = TensorDataset(torch.tensor(X2_val, dtype=torch.float32), torch.tensor(y2_val, dtype=torch.float32))
train_loader1 = DataLoader(train_dataset1, batch_size=32, shuffle=True)
val_loader1 = DataLoader(val_dataset1, batch_size=32, shuffle=False)
train_loader2 = DataLoader(train_dataset2, batch_size=32, shuffle=True)
val_loader2 = DataLoader(val_dataset2, batch_size=32, shuffle=False)

("['new', 'Authentic', 'right', 'joy', 'det', 'excited', 'illness', 'adj', 'car', 'auditory', 'feel', 'hours', 'old', 'watching', 'Emoji', 'Period', 'they', 'want', 'auxverb', 'socbehav', 'power', 'emo_neg', 'sadness', 'socrefs', 'tomorrow', 'negative', 'family', 'tech', 'Conversation', 'don', 'Social', 'substances', 'filler', 'day', 'emotion', 'night', 'going', 'trust', 'Drives', 'days', 'oh', 'Cognition', 'think', 'need', 'mental', 'ipron', 'memory', 'finally', 'trying', 'Tone', 'gonna', 'WC', 'NETWORKSIZE', 'Physical', 'surprise', 'Arousal', 'god', 'propname', 'sleep', 'focuspresent', 'ppron', 'prep', 'death', 'week', 'yay', 'year', 'Linguistic', 'Apostro', 'did', 'comm', 'politic', 'doing', 'getting', 'friends', 'moral', 'needs', 'come', 'loves', 'fun', 'sexual', 'thank', 'BETWEENNESS', 'better', 'prosocial', 'working', 'sent_score', 'BigWords', 'female', 'bed', 'weekend', 'life', 'motion', 'happy', 'article', 'certitude', 'ready', 'sick', 'best', 'food', 'christmas', 'love', 'nonf

In [49]:
#Model Aveaging
model1 = BiLSTMClassifier(input_dim=X2_train.shape[1], hidden_dim=128, output_dim=1, num_layers=2, bidirectional=True, do_attention=True, dropout_rate=0.3)
model2 = BiLSTMClassifier(input_dim=X2_train.shape[1], hidden_dim=128, output_dim=1, num_layers=2, bidirectional=True, do_attention=True, dropout_rate=0.3)

# model1 = MLP(input_size=X1_train.shape[1], hidden_size=128, output_size=1)
# model2 = MLP(input_size=X2_train.shape[1], hidden_size=128, output_size=1)
criterion = nn.BCEWithLogitsLoss()  
optimizer1 = optim.Adam(model1.parameters(), lr=0.001)
optimizer2 = optim.Adam(model2.parameters(), lr=0.001)
num_epochs = 12
max_grad_norm = 1.0

for epoch in range(num_epochs):
    model1.train()
    model2.train()
    total_loss = 0.0

    for (t1, labels1), (t2, labels2) in zip(train_loader1, train_loader2):
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        outputs1 = model1(t1)
        outputs2 = model2(t2)
        
        avg_outputs = (outputs1 + outputs2) / 2.0
        loss = criterion(avg_outputs, labels1)  
        torch.nn.utils.clip_grad_norm_(model1.parameters(), max_grad_norm)
        torch.nn.utils.clip_grad_norm_(model2.parameters(), max_grad_norm)
        loss.backward()
        optimizer1.step()
        optimizer2.step()
        total_loss += loss.item()
    model1.eval()
    model2.eval()
    val_preds = []
    val_labels = []
    val_loss1 = 0
    val_loss2 = 0
    
    with torch.no_grad():
        for (inputs1, labels1), (inputs2, labels2) in zip(val_loader1, val_loader2):
            outputs1 = model1(inputs1)
            outputs2 = model2(inputs2)
            avg_outputs = (outputs1 + outputs2) / 2.0
            val_loss1 += criterion(outputs1, labels1).item()
            val_loss2 += criterion(outputs2, labels2).item()
            val_preds.append(torch.sigmoid(avg_outputs))
            val_labels.append(labels1)
    
    val_preds = torch.cat(val_preds)
    val_labels = torch.cat(val_labels)
    val_preds = (val_preds > 0.5).float()
    val_accuracy = accuracy_score(val_labels.numpy(), val_preds.numpy())
    print(f'Epoch [{epoch + 1}/{num_epochs}], '
          f'Train Loss Model1: {total_loss / len(train_loader1):.4f}, '
          f'Validation Loss Model1: {val_loss1 / len(val_loader1):.4f}, '
          f'Validation Loss Model2: {val_loss2 / len(val_loader2):.4f}, '
          f'Validation Accuracy: {val_accuracy:.4f}')


Epoch [1/12], Train Loss Model1: 0.6946, Validation Loss Model1: 0.7014, Validation Loss Model2: 0.7130, Validation Accuracy: 0.5566
Epoch [2/12], Train Loss Model1: 0.6414, Validation Loss Model1: 0.7564, Validation Loss Model2: 0.7003, Validation Accuracy: 0.5924
Epoch [3/12], Train Loss Model1: 0.5967, Validation Loss Model1: 0.8415, Validation Loss Model2: 0.6924, Validation Accuracy: 0.5969
Epoch [4/12], Train Loss Model1: 0.5288, Validation Loss Model1: 0.9173, Validation Loss Model2: 0.6923, Validation Accuracy: 0.6036
Epoch [5/12], Train Loss Model1: 0.4329, Validation Loss Model1: 1.1190, Validation Loss Model2: 0.6883, Validation Accuracy: 0.6047
Epoch [6/12], Train Loss Model1: 0.3463, Validation Loss Model1: 1.3680, Validation Loss Model2: 0.6925, Validation Accuracy: 0.6036
Epoch [7/12], Train Loss Model1: 0.2609, Validation Loss Model1: 1.5066, Validation Loss Model2: 0.6901, Validation Accuracy: 0.6260
Epoch [8/12], Train Loss Model1: 0.2033, Validation Loss Model1: 1.98