# Task 6
## Create and split training/testing datasets

### Import frameworks

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.layers import Input
from sklearn.multioutput import MultiOutputRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


### Load Dataset and split labels

In [3]:
df = pd.read_csv('Dyadic_PELD.tsv', sep='\t', header=0)


labels = df['Personality'].to_numpy()
labels = [eval(x) for x in labels]
df_labels = pd.DataFrame(labels, columns=['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism'])

### Function for creating training datasets with different variables

In [4]:
def get_training_labels(max_features=None, n_gram=1, emotion=True, sentiment=True, utterance_2=True, utterance_3=True):
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(n_gram,n_gram))


    # Utterance 1 is always True
    utterance_tfidf = tfidf.fit_transform(df['Utterance_1'])
    df_features = pd.DataFrame(utterance_tfidf.toarray(), columns=[f"Utterance1_{word}" for word in tfidf.get_feature_names_out()])
    if emotion:
        df_emotions = pd.get_dummies(df[['Emotion_1']])
        df_features = pd.concat([df_features, df_emotions], axis=1)
    if sentiment:
        df_sentiments = pd.get_dummies(df[['Sentiment_1']])
        df_features = pd.concat([df_features, df_sentiments], axis=1)



    if utterance_2:
        utterance_tfidf = tfidf.fit_transform(df['Utterance_2'])
        df_utterance_tfidf = pd.DataFrame(utterance_tfidf.toarray(), columns=[f"Utterance2_{word}" for word in tfidf.get_feature_names_out()])
        df_features = pd.concat([df_features, df_utterance_tfidf], axis=1)
        if emotion:
            df_emotions = pd.get_dummies(df[['Emotion_2']])
            df_features = pd.concat([df_features, df_emotions], axis=1)
        if sentiment:
            df_sentiments = pd.get_dummies(df[['Sentiment_2']])
            df_features = pd.concat([df_features, df_sentiments], axis=1)


    if utterance_3:
        utterance_tfidf = tfidf.fit_transform(df['Utterance_3'])
        df_utterance_tfidf = pd.DataFrame(utterance_tfidf.toarray(), columns=[f"Utterance3_{word}" for word in tfidf.get_feature_names_out()])
        df_features = pd.concat([df_features, df_utterance_tfidf], axis=1)
        if emotion:
            df_emotions = pd.get_dummies(df[['Emotion_3']])
            df_features = pd.concat([df_features, df_emotions], axis=1)
        if sentiment:
            df_sentiments = pd.get_dummies(df[['Sentiment_3']])
            df_features = pd.concat([df_features, df_sentiments], axis=1)

    # Train-test split
    features_train, features_test, labels_train, labels_test = train_test_split(df_features, df_labels, test_size=0.2, random_state=42, stratify=df['Speaker_1'])

    return features_train, features_test, labels_train, labels_test

### Setup 6 different ai models to do parameter hypertuning with gridsearch on
This is so we can check which models performs best on our dataset, with several different parameters.

Note, for now we only use deep learning as this takes a long time to search, it is also unnecesary to hyper optimize for this assignment as the course is about Processing Natural Language


In [5]:

# Deep learning model function
def create_deep_learning_model(input_dim, dense_units=512, dropout_rate=0.5):
    model = Sequential([
        Input(shape=(input_dim,)),  # Define the input layer with the shape
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(dense_units // 2, activation='relu'),
        Dense(5, activation='linear')  # Output layer for regression
    ])
    model.compile(optimizer='adam', loss='mse')
    return model


def grid_search_deep_learning(features_train, labels_train):
    model = KerasRegressor(
        model=create_deep_learning_model,
        input_dim=features_train.shape[1],
        verbose=0
    )
    
    # Define the param_grid with parameter names directly available in KerasRegressor
    param_grid = {
        'model__dense_units': [512, 256],
        'model__dropout_rate': [0.3, 0.5],
        'epochs': [10],
        'batch_size': [16, 32]
    }
    
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(features_train, labels_train)
    print("Best Deep Learning Params:", grid_search.best_params_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_



# Linear regression model function
def grid_search_linear_regression(features_train, labels_train):
    model = LinearRegression()
    multi_target_lr = MultiOutputRegressor(model)  # Wrap in MultiOutputRegressor
    param_grid = {}
    
    grid_search = GridSearchCV(estimator=multi_target_lr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(features_train, labels_train)
    print("Best Linear Regression Params:", grid_search.best_params_)
    
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_



# Polynomial regression model function
def grid_search_polynomial_regression(features_train, labels_train):
    model = Pipeline([
        ('poly', PolynomialFeatures(degree=2)),
        ('linear', LinearRegression())
    ])
    
    multi_target_poly = MultiOutputRegressor(model)  # Wrap in MultiOutputRegressor
    param_grid = {
        'estimator__poly__degree': [2, 3]  # Adjust the parameter for the pipeline
    }
    
    grid_search = GridSearchCV(estimator=multi_target_poly, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(features_train, labels_train)
    print("Best Polynomial Regression Params:", grid_search.best_params_)
    
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_



# SVR model function
def grid_search_svr(features_train, labels_train):
    # Initialize SVR model
    model = SVR()
    
    # Define parameter grid
    param_grid = {
        'estimator__kernel': ['linear', 'rbf', 'poly'],
        'estimator__C': [0.1, 1, 10],
        'estimator__epsilon': [0.01, 0.1, 1]
    }
    
    # Use MultiOutputRegressor with GridSearchCV
    multi_target_svr = MultiOutputRegressor(model)
    grid_search = GridSearchCV(estimator=multi_target_svr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    
    # Fit the model
    grid_search.fit(features_train, labels_train)
    
    print("Best SVR Params:", grid_search.best_params_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_



# Decision tree model function
def grid_search_decision_tree(features_train, labels_train):
    model = DecisionTreeRegressor()
    param_grid = {
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10, 16]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(features_train, labels_train)
    print("Best Decision Tree Params:", grid_search.best_params_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_



# Random forest model function
def grid_search_random_forest(features_train, labels_train):
    model = RandomForestRegressor()
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(features_train, labels_train)
    print("Best Random Forest Params:", grid_search.best_params_)
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_params_, grid_search.best_score_

### Run grid search on all models
Note, as specified earlier, there is not enough time for this task that doesn't contribute to the field of NLP.

Therefore, we comment it out for now.

In [6]:
#print(features_train.shape)

# # Linear regression model
# best_linear_model, best_params_lr, best_params_lr_repeat, best_score_lr = grid_search_linear_regression(features_train, labels_train)
# print(f"Best Linear Model: {best_linear_model}")
# print(f"Best Parameters (Linear Regression): {best_params_lr}")
# print(f"Repeated Best Parameters (Linear Regression): {best_params_lr_repeat}")
# print(f"Best Score (Linear Regression): {best_score_lr}")

# # Polynomial regression model
# best_polynomial_model, best_params_poly, best_params_poly_repeat, best_score_poly = grid_search_polynomial_regression(features_train, labels_train)
# print(f"Best Polynomial Model: {best_polynomial_model}")
# print(f"Best Parameters (Polynomial Regression): {best_params_poly}")
# print(f"Repeated Best Parameters (Polynomial Regression): {best_params_poly_repeat}")
# print(f"Best Score (Polynomial Regression): {best_score_poly}")

# # for dataframe in df_features_array:

# # SVR model
# # Train-test split
# features_train, features_test, labels_train, labels_test = train_test_split(dataframe, df_labels, test_size=0.2, random_state=42, stratify=df['Speaker_1'])
# best_svr_model, best_params_svr, best_params_svr_repeat, best_score_svr = grid_search_svr(features_train, labels_train)
# print(f"Best SVR Model: {best_svr_model}")
# print(f"Best Parameters (SVR): {best_params_svr}")
# print(f"Repeated Best Parameters (SVR): {best_params_svr_repeat}")
# print(f"Best Score (SVR): {best_score_svr}")

# # Decision tree model
# best_decision_tree_model, best_params_dt, best_params_dt_repeat, best_score_dt = grid_search_decision_tree(features_train, labels_train)
# print(f"Best Decision Tree Model: {best_decision_tree_model}")
# print(f"Best Parameters (Decision Tree): {best_params_dt}")
# print(f"Repeated Best Parameters (Decision Tree): {best_params_dt_repeat}")
# print(f"Best Score (Decision Tree): {best_score_dt}")

# # Random forest model
# best_random_forest_model, best_params_rf, best_params_rf_repeat, best_score_rf = grid_search_random_forest(features_train, labels_train)
# print(f"Best Random Forest Model: {best_random_forest_model}")
# print(f"Best Parameters (Random Forest): {best_params_rf}")
# print(f"Repeated Best Parameters (Random Forest): {best_params_rf_repeat}")
# print(f"Best Score (Random Forest): {best_score_rf}")

# # Deep learning model
# best_deep_learning_model, best_params_dl, best_params_dl_repeat, best_score_dl = grid_search_deep_learning(features_train, labels_train)
# print(f"Best Deep Learning Model: {best_deep_learning_model}")
# print(f"Best Parameters (Deep Learning): {best_params_dl}")
# print(f"Repeated Best Parameters (Deep Learning): {best_params_dl_repeat}")
# print(f"Best Score (Deep Learning): {best_score_dl}")

### Run grid search only one a deep learning model
Therefore we only run grid search on deep learning, and not all models

In [7]:
features_train, features_test, labels_train, labels_test = get_training_labels()


# Deep learning model
best_deep_learning_model, best_params_dl, best_params_dl_repeat, best_score_dl = grid_search_deep_learning(features_train, labels_train)
print(f"Best Deep Learning Model: {best_deep_learning_model}")
print(f"Best Parameters (Deep Learning): {best_params_dl}")
print(f"Repeated Best Parameters (Deep Learning): {best_params_dl_repeat}")
print(f"Best Score (Deep Learning): {best_score_dl}")

Best Deep Learning Params: {'batch_size': 32, 'epochs': 10, 'model__dense_units': 256, 'model__dropout_rate': 0.5}
Best Deep Learning Model: KerasRegressor(
	model=<function create_deep_learning_model at 0x000001DB6D5D9620>
	build_fn=None
	warm_start=False
	random_state=None
	optimizer=rmsprop
	loss=None
	metrics=None
	batch_size=32
	validation_batch_size=None
	verbose=0
	callbacks=None
	validation_split=0.0
	shuffle=True
	run_eagerly=False
	epochs=10
	input_dim=12262
	model__dense_units=256
	model__dropout_rate=0.5
)
Best Parameters (Deep Learning): {'batch_size': 32, 'epochs': 10, 'model__dense_units': 256, 'model__dropout_rate': 0.5}
Repeated Best Parameters (Deep Learning): {'batch_size': 32, 'epochs': 10, 'model__dense_units': 256, 'model__dropout_rate': 0.5}
Best Score (Deep Learning): -0.005872741745173176


The best deep learning parameters for this problem are these:

batch_size: 32

epochs: 10

model__dense_units: 256

model__dropout_rate: 0.5

In [8]:
best_params = {'batch_size': 32, 'epochs': 10, 'model__dense_units': 256, 'model__dropout_rate': 0.5}

### Create a list of datasets that differs in simple terms
This is the meat of this task. We want to determine how different features affect the model.

Therefore we start by making only one or two changes in each dataset in order to be able to measure the impact of every single feature

In [9]:
training_datasets = {
    "all_data": list(get_training_labels(max_features=None, n_gram=1, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),

    "5000features": list(get_training_labels(max_features=5000, n_gram=1, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),
    "1000features": list(get_training_labels(max_features=1000, n_gram=1, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),

    "bigram": list(get_training_labels(max_features=None, n_gram=2, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),
    "trigram": list(get_training_labels(max_features=None, n_gram=3, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),
    "quadgram": list(get_training_labels(max_features=None, n_gram=4, emotion=True, sentiment=True, utterance_2=True, utterance_3=True)),

    "no_sentiment": list(get_training_labels(max_features=None, n_gram=1, emotion=True, sentiment=False, utterance_2=True, utterance_3=True)),
    "no_emotion": list(get_training_labels(max_features=None, n_gram=1, emotion=False, sentiment=True, utterance_2=True, utterance_3=True)),
    "no_emotion_or_sentiment": list(get_training_labels(max_features=None, n_gram=1, emotion=False, sentiment=False, utterance_2=True, utterance_3=True)),

    "no_utterance2": list(get_training_labels(max_features=None, n_gram=1, emotion=True, sentiment=True, utterance_2=False, utterance_3=True)),
    "only_utterance1": list(get_training_labels(max_features=None, n_gram=1, emotion=True, sentiment=True, utterance_2=False, utterance_3=False))
}

### Create a function for running the model on all datasets
We split the 'train' into 'train/val', so that we have training, validation and testing

In [19]:
def run_deep_learning_model_with_params(features_train, features_test, labels_train, labels_test, params):
    # Extract parameters from the dictionary
    dense_units = params.get('model__dense_units', 512)
    dropout_rate = params.get('model__dropout_rate', 0.5)
    epochs = params.get('epochs', 10)
    batch_size = params.get('batch_size', 32)
    
    # Train-val split
    df_full_train, df_full_test, labels_train, labels_test = train_test_split(df, df_labels, test_size=0.2, random_state=42, stratify=df['Speaker_1'])
    features_inner_train, features_val, labels_inner_train, labels_val = train_test_split(features_train, labels_train, test_size=0.2, random_state=42, stratify=df_full_train['Speaker_1'])

    # Define the model architecture based on chosen parameters
    model = Sequential([
        Input(shape=(features_train.shape[1],)),
        Dense(dense_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(dense_units // 2, activation='relu'),
        Dense(5, activation='linear')  # Output layer for regression
    ])
    
    # Compile the model
    model.compile(optimizer=Adam(), loss='mse', metrics=['mae', 'mape'])
    
    # Train the model
    history = model.fit(features_inner_train, labels_inner_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(features_val, labels_val),
                        verbose=1)
    
    # Evaluate the model on the test set
    test_mse, test_mae, test_mape = model.evaluate(features_test, labels_test)
    
    return test_mse, test_mae, test_mape

### Run the model on every dataset to compare test loss

In [20]:
test_losses = {}

# Loop through each dataset in training_datasets
for dataset_name, (features_train, labels_train, features_test, labels_test) in training_datasets.items():
    print(f"Model: {dataset_name}")
    # Run the model with the specified parameters and get the test loss
    test_mse, test_mae, test_mape = run_deep_learning_model_with_params(features_train, labels_train, features_test, labels_test, best_params)
    # Save the test loss in the test_losses dictionary with the dataset name as the key
    test_losses[dataset_name] = [test_mse, test_mae, test_mape]
    print(f"{dataset_name}:")
    print(f"Mean Squared Error: {test_mse}")
    print(f"Mean Absolute Error: {test_mae}")
    print(f"Mean Absolute Percentage Error: {test_mape}")


Model: all_data
Epoch 1/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - loss: 0.0554 - mae: 0.1728 - mape: 34.4814 - val_loss: 0.0071 - val_mae: 0.0679 - val_mape: 14.1835
Epoch 2/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - loss: 0.0088 - mae: 0.0752 - mape: 15.5987 - val_loss: 0.0076 - val_mae: 0.0714 - val_mape: 14.4054
Epoch 3/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 0.0069 - mae: 0.0661 - mape: 13.6445 - val_loss: 0.0064 - val_mae: 0.0634 - val_mape: 13.2364
Epoch 4/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - loss: 0.0049 - mae: 0.0558 - mape: 11.5159 - val_loss: 0.0066 - val_mae: 0.0634 - val_mape: 12.8835
Epoch 5/10
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 26ms/step - loss: 0.0033 - mae: 0.0458 - mape: 9.4348 - val_loss: 0.0063 - val_mae: 0.0622 - val_mape: 12.8447
Epoch 6/10
[1m131/131[0m [32m━━━━━━━━

In [21]:
# Print test losses sorted by each metric from least to most loss
metrics = ["Mean Squared Error", "Mean Absolute Error", "Mean Absolute Percentage Error"]

for i in range(3):
    print(f"\nTest Losses sorted by {metrics[i]} (sorted):")
    for dataset, loss in sorted(test_losses.items(), key=lambda item: item[1][i]):
        print(f"{dataset}: {loss[i]}")


Test Losses sorted by Mean Squared Error (sorted):
5000features: 0.00571276992559433
trigram: 0.005729729309678078
quadgram: 0.005730041302740574
no_emotion: 0.005732996389269829
no_sentiment: 0.005785708781331778
bigram: 0.005821918603032827
all_data: 0.00585175072774291
1000features: 0.0059440056793391705
no_utterance2: 0.005988821387290955
no_emotion_or_sentiment: 0.00603902991861105
only_utterance1: 0.006119111552834511

Test Losses sorted by Mean Absolute Error (sorted):
trigram: 0.05766170471906662
5000features: 0.05788521096110344
no_emotion: 0.05790345370769501
quadgram: 0.05791456997394562
no_sentiment: 0.05831199511885643
bigram: 0.05843406170606613
all_data: 0.05871236324310303
no_emotion_or_sentiment: 0.0591922253370285
1000features: 0.059513580054044724
no_utterance2: 0.05992359668016434
only_utterance1: 0.06029250845313072

Test Losses sorted by Mean Absolute Percentage Error (sorted):
trigram: 12.133415222167969
5000features: 12.178739547729492
no_sentiment: 12.22558212

### Datasets comparison

The test losses, ordered from least loss to most loss with MSE, are as follows:

- trigram:                  0.005757459439337254
- 5000features:             0.005766735412180424
- no_emotion:               0.005812237039208412
- quadgram:                 0.0058181206695735455
- no_emotion_or_sentiment:  0.005820064339786768
- all_data:                 0.0058225891552865505
- no_utterance2:            0.005976484622806311
- 1000features:             0.006012726575136185
- bigram:                   0.006078173406422138
- no_sentiment:             0.0061368318274617195
- only_utterance1:          0.00617994274944067


Let's break these down

The baseline is 'all_data' as that is just the raw dataset, so we use this to compare the changes with.

#### Limited Vocabulary 
Limiting it to 5000 features is very roughly limiting it to slightly above half. This did make a noticeable improvement.

Limiting the vocabulary to 1000 seemed to make the model worse.

This seems to indicate that the least frequent words yields too little information about personality, relative to more frequent words. But you might only want to cut off a certain bottom percentile.

We should test with a broader vocabulary and also somewhere between 1000-5000

Let's make a test for both 3000 and 7000


#### N-Gram
Trigram made a noticeable improvement over bigram, quadgram, and unigram.

This might indicate that trigram is optimal in capturing the personality of a speaker, as it hits a middleground of splitting up tokens.


#### Emotion
The emotion tags seems to not make a noticeable difference.


#### Sentiment
Removing sentiment seems to be detrimental to the model, however, if you remove both emotions and sentiment, then the performance goes back to baseline.

We can speculate on why this is the case.

There might be information in sentiment that only makes sense to the model together with emotion.


#### Utterances
It makes intuitive sense that removing utterance 3 (Speaker 1's second utterance) makes the model worse.

However, it is interesting that utterance 2 (Speaker 2's utterance) contributes to the models performance when combined with utterance 3