In [None]:
# General packages
import os
import pandas as pd
from tqdm import tqdm
import numpy as np
import multiprocessing

# Sci-kit learn packages
from sklearn.metrics import root_mean_squared_error
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor

# Packages for results/plotting
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib import ticker
from shap import summary_plot, TreeExplainer

import gensim
from gensim.models import Word2Vec

# Reset matplotlib's rcParams to the library defaults, then switch the default
# font family to monospace. The reset has to run first: doing it afterwards
# would overwrite the font setting again.
plt.rcParams.update(plt.rcParamsDefault)
plt.rcParams["font.family"] = "monospace"

In [None]:
# Which dataset to run on: 'usmle', 'bio' or 'cmcqrd'
DATASET = 'cmcqrd'
# Folder the train/test CSVs (with linguistic features) are read from
RESULTS_DATASET = f'../data/{DATASET}/with_ling_features/'
# Column to predict: "Correct_Answer_Rate", "Difficulty" (cmcqrd only)
# or "Response_Time" (usmle only)
TARGET_LABEL_COL_NAME = 'Correct_Answer_Rate'
# How many times each experiment is repeated; reported metrics are averaged
REPETITIONS = 10

# Cores handed to sklearn's n_jobs parameter wherever it is supported
NUM_OF_CORES_TO_USE = multiprocessing.cpu_count()
print("Using ", NUM_OF_CORES_TO_USE, " cores.")

# How many of the most important random-forest features get reported
NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT = 10

## Load the data and splits

In [None]:
# Read the pre-computed train and test splits from RESULTS_DATASET.
# NOTE(review): only the test CSV is read with index_col=0; the train CSV keeps
# the default RangeIndex — confirm both files were written the same way.
train_csv_path = RESULTS_DATASET + 'train.csv'
test_csv_path = RESULTS_DATASET + 'test.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path, index_col=0)

# Report the number of rows in each split
for split_label, split_frame in (("Train size: ", train), ("Test size: ", test)):
    print(split_label, len(split_frame))
train.head()

___
# Experiments

In [None]:
GLOBAL_ALL_RESULTS = {}  # Maps experiment description -> (RMSE summary dict, top-features list)


def test_random_forest(features_train, target_train, features_test, target_test, description, random_state=None):
    """Train a Random Forest REPETITIONS times and report averaged results.

    For every repetition a fresh RandomForestRegressor is fitted on the training
    data, evaluated with RMSE on the test data, and explained with SHAP on the
    training features. RMSE, impurity-based feature importances and SHAP values
    are averaged over all repetitions, and a SHAP summary plot is shown.

    Args:
        features_train (pd.DataFrame): Training features; column names are used
            to label the reported features.
        target_train: Training target values.
        features_test (pd.DataFrame): Test features with the same columns.
        target_test: Test target values.
        description (str): Experiment label shown in the plot title.
        random_state (int | None): Optional base seed. When given, repetition
            ``i`` uses ``random_state + i`` so runs are reproducible while the
            repetitions still differ. ``None`` (default) leaves the forests
            unseeded, which is the previous behaviour.

    Returns:
        tuple: ``(rmse_results_summary, top_features_and_importances)`` where the
        first item is a dict with the rounded keys 'rmse', 'std_dev' and
        'std_error', and the second is a list of ``(feature_name, importance)``
        pairs for the NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT most important
        features, most important first.

    Raises:
        ValueError: If REPETITIONS is smaller than 1.
    """
    if REPETITIONS < 1:
        raise ValueError("REPETITIONS must be at least 1.")

    all_rmses_for_model = []
    # Running sum of the feature importances (one slot per feature column)
    feature_importances_sum = np.zeros(features_train.shape[1])
    # Raw SHAP value arrays, one per repetition (only the arrays are kept,
    # not the full Explanation objects)
    shap_values_per_repetition = []

    for repetition in tqdm(range(REPETITIONS)):
        seed = None if random_state is None else random_state + repetition
        # A new model per repetition, so nothing carries over between fits
        model = RandomForestRegressor(n_jobs=NUM_OF_CORES_TO_USE, random_state=seed)
        model.fit(features_train, target_train)

        predictions = model.predict(features_test)
        # Documented argument order is (y_true, y_pred)
        all_rmses_for_model.append(root_mean_squared_error(target_test, predictions))

        feature_importances_sum += model.feature_importances_

        # SHAP values of this repetition, computed on the training features.
        # NOTE(review): newer SHAP releases may expect `approximate` on the call
        # rather than on the constructor — confirm against the installed version.
        explainer = TreeExplainer(model, approximate=True)
        shap_values_per_repetition.append(explainer(features_train).values)

    # Summary statistics of the RMSE over all repetitions
    average_rmse_for_model = float(np.mean(all_rmses_for_model))
    if REPETITIONS > 1:
        std_dev_rmse = float(np.std(all_rmses_for_model, ddof=1))  # Sample standard deviation
    else:
        # The sample standard deviation is undefined for a single run; report
        # NaN explicitly instead of triggering numpy's degrees-of-freedom warning.
        std_dev_rmse = float("nan")
    std_error_rmse = float(std_dev_rmse / np.sqrt(REPETITIONS))  # Standard error

    rmse_results_summary = {
        'rmse': round(average_rmse_for_model, 4),
        'std_dev': round(std_dev_rmse, 4),
        'std_error': round(std_error_rmse, 4)
    }

    # Average the feature importances and pick the top ones, most important first
    avg_feature_importances = feature_importances_sum / REPETITIONS
    indices = np.argsort(avg_feature_importances)[-NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT:][::-1]
    top_features_and_importances = [
        (features_train.columns[i], float(round(avg_feature_importances[i], 4)))
        for i in indices
    ]

    # Average the SHAP values element-wise over the repetitions
    shap_values = np.mean(np.array(shap_values_per_repetition), axis=0)

    # Summary plot of the averaged SHAP values
    summary_plot(shap_values, features_train, max_display=NUMBER_OF_IMPORTANT_FEATURES_TO_PRINT, plot_size=[12, 5], show=False)
    plt.suptitle('Effect of Top Features on Predicting Student Success \n(' + description + ')', fontsize=20, x=0.5, y=1.1)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=16)
    plt.xlabel('Impact on Random Forest Prediction', fontsize=18)
    ax = plt.gca()  # summary_plot draws on the current axes
    ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, _: f"{x:.2g}"))
    plt.show()
    plt.close()
    return rmse_results_summary, top_features_and_importances

## Generate Word2Vec embeddings

In [None]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd

def train_word2vec_model(sentences, vector_size=100, window=5, min_count=1, workers=4):
    """
    Fit a Word2Vec model on tokenized sentences and return it.

    Args:
        sentences: Tokenized sentences, forwarded as Word2Vec's `sentences`.
        vector_size, window, min_count, workers: Forwarded unchanged to the
            Word2Vec keyword arguments of the same names.

    Returns:
        Word2Vec: The model built from the given sentences.
    """
    hyperparameters = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    trained_model = Word2Vec(sentences=sentences, **hyperparameters)
    return trained_model

def get_word2vec_emb(column, model):
    """
    Embed every text of a column as the mean of its Word2Vec word vectors.

    Each value is cast to `str` and split on whitespace. Words missing from the
    model are skipped; a text without any known word maps to a zero vector of
    length `model.vector_size`.

    Args:
        column (pd.Series): Column of text data.
        model (Word2Vec): Trained model exposing `wv` and `vector_size`.

    Returns:
        pd.DataFrame: One row per entry of `column` (same index), one column
        per embedding dimension.
    """
    def _mean_vector(words):
        # Keep only the words the model has a vector for
        known_vectors = [model.wv[word] for word in words if word in model.wv]
        if not known_vectors:
            # Nothing recognised: fall back to an all-zero embedding
            return np.zeros(model.vector_size)
        return np.mean(known_vectors, axis=0)

    # Whitespace-tokenize each text and embed it, preserving row order
    row_vectors = [_mean_vector(text.split()) for text in column.astype(str)]

    # Stack the per-row vectors into a frame aligned with the input index
    return pd.DataFrame(row_vectors, index=column.index)

# Collect the whitespace-tokenized question texts, train split first, then test.
# NOTE(review): the embedding model is fitted on train AND test text together —
# confirm that this is intended for the evaluation setup.
all_sentences = []
for split_frame in (train, test):
    all_sentences.extend(split_frame["question_with_options"].astype(str).apply(str.split).tolist())

# A single Word2Vec model is trained and shared by both splits
word2vec_model = train_word2vec_model(all_sentences)

# Averaged word-vector embeddings per question, for each split
train_word2vec = get_word2vec_emb(train["question_with_options"], word2vec_model)
test_word2vec = get_word2vec_emb(test["question_with_options"], word2vec_model)


In [None]:
# Show the size of the embedding matrix and a rich preview of the first rows
# instead of printing the whole frame as plain text.
print("Train embedding shape: ", train_word2vec.shape)
train_word2vec.head()

## Using All Linguistic Features and Word2Vec Embeddings

In [None]:
# Names of the linguistic feature columns used as model inputs
linguistic_features = [
    # Length and count statistics
    "Word_Count",
    "Word_Count_No_stop_words",
    "Avg_Word_Length",
    "Sentence_Count",
    "Avg_Sent_Length_in_Words",
    # Part-of-speech counts
    "Noun_Count",
    "Verb_Count",
    "Adjective_Count",
    "Adverb_Count",
    # Phrase counts
    "Number_of_NPs",
    "Number_of_PPs",
    "Number_of_VPs",
    # Connective counts
    "Temporal_Connectives_Count",
    "Causal_Connectives_Count",
    "Exemplifying_Connectives_Count",
    "Additive_Connectives_Count",
    "Contrastive_Connectives_Count",
]

In [None]:
def _assemble_features(split_frame, embeddings):
    """Join the linguistic feature columns with the embedding columns side by side."""
    combined = pd.concat([split_frame[linguistic_features], embeddings], axis=1)
    # Embedding columns carry integer labels; cast every label to str so all
    # column names share one type
    combined.columns = combined.columns.astype(str)
    return combined


# Model inputs and targets for both splits
features_train = _assemble_features(train, train_word2vec)
target_train = train[TARGET_LABEL_COL_NAME]
features_test = _assemble_features(test, test_word2vec)
target_test = test[TARGET_LABEL_COL_NAME]


# Run the experiment and file the outcome under its description.
# NOTE(review): the label mentions only linguistic features although the
# Word2Vec embeddings are part of the inputs as well — confirm the label.
description = "All Linguistic Features"
GLOBAL_ALL_RESULTS[description] = test_random_forest(features_train, target_train, features_test, target_test, description)


___
# Results RMSE Overview and Top Features

In [None]:
# Now we filter results to only the ones we are interested in
checkboxes = [widgets.Checkbox(value=True, description=label, layout=widgets.Layout(
    width='1000px')) for label in GLOBAL_ALL_RESULTS]
output = widgets.VBox(children=checkboxes)
display(output)

In [None]:
# Keep only the experiments whose checkbox is still ticked
selected_keys = [checkbox.description for checkbox in checkboxes if checkbox.value]

for key in selected_keys:
    results = GLOBAL_ALL_RESULTS[key]
    rmse_summary = results[0]
    print(key)
    print("RMSE: ", rmse_summary['rmse'], "±", rmse_summary['std_error'], " (STDEV: ", rmse_summary['std_dev'], ")")
    print("--------------------")
    if key == 'DummyRegressor':
        # there are no features to plot
        continue

    # Work on a sorted COPY so the stored results keep their original order.
    # Ascending order puts the most important feature at the top of the
    # horizontal bar chart (barh draws the first entry at the bottom).
    features_ascending = sorted(results[1], key=lambda item: item[1])
    if not features_ascending:
        # Nothing to plot (max() below would fail on an empty list)
        continue
    feature_names = [name for name, _ in features_ascending]
    importances = [importance for _, importance in features_ascending]

    ### Plotting ###
    # Set the font
    plt.rcParams["font.family"] = "monospace"

    # Create the main plot
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.barh(feature_names, importances, color='#478058')

    # Set title and labels
    title_text = 'Feature Contributions to Model Performance\n' + '(' + key + ')'
    fig.suptitle(title_text, ha='center', fontsize=20, y=1.01)
    ax.set_xlabel('Feature Importance', fontsize=18)

    # Label every bar with its feature name: long bars get a white label
    # inside the bar, short bars a black label just past the bar's end.
    max_importance = max(importances)
    for i, (feature, importance) in enumerate(features_ascending):
        if importance > max_importance/2:
            # x = 0.01 is a hand-tuned offset in data units from the bar's start
            ax.text(0.01, i, feature, ha='left', va='center', fontsize=10, color='white')
        else:
            # the 0.0001 gap might need adjusting for other importance scales
            ax.text(0.0001+importance, i, feature, ha='left', va='center', fontsize=10, color='black')

    # The in-plot labels replace the y tick labels
    ax.set_yticks([])
    plt.show()
    # Release the figure; one is created per selected experiment
    plt.close(fig)