In [7]:
import pandas as pd 
import pickle

Ok, lets now put together a few machine learning models to have as a benchmark. These will aggregate across node features (leaving edge features out for now) to set a baseline for GNNs. 

Lets start by loading in the data 

In [6]:
node_features_df = pd.read_pickle('../graphs/data/node_features_df.pkl')
node_features_df.head()

Unnamed: 0,Meeting,ParticipantID,Start Time,time_portion,speaking_turns_proportion,rate_of_speech,lexical_richness,positive_sentiment_proportion,negative_sentiment_proportion,average_word_rarity,first_person_pronoun_usage,second_person_pronoun_usage,third_person_pronoun_usage,agreement_words_proportion,disagreement_words_proportion
0,IS1000a,A,1200.0,0.061873,0.0,5.594406,0.25,0.0,0.0,7.4e-05,0.0,0.0,0.0,0.0,0.0
1,IS1000a,B,1200.0,0.306594,0.2,3.302286,0.641026,0.030303,0.0,0.00018,0.0,0.025641,0.0,0.08547,0.0
2,IS1000a,C,1200.0,0.504586,0.6,2.538158,0.52027,0.024194,0.008065,0.000223,0.006757,0.054054,0.0,0.013514,0.0
3,IS1000a,D,1200.0,0.350035,0.2,2.917182,0.533898,0.057143,0.0,0.000228,0.0,0.050847,0.0,0.025424,0.025424
4,IS1000a,A,1320.0,0.167932,0.2,4.951561,0.586957,0.012821,0.0,0.000285,0.0,0.0,0.0,0.021739,0.01087


we an now comprss features with the df below

In [8]:

grouped = node_features_df.groupby(['Meeting', 'Start Time'])
rows = []

for (meeting, start_time), group in grouped:
    if len(group) == 4: 
        # Compute the min, max, mean, and std for each of the 12 features
        stats = {
            'Meeting': meeting,
            'Start Time': start_time
        }
        for feature in group.columns[2:]:  
            stats[f'{feature}_min'] = group[feature].min()
            stats[f'{feature}_max'] = group[feature].max()
            stats[f'{feature}_mean'] = group[feature].mean()
            stats[f'{feature}_std'] = group[feature].std()

        
        rows.append(stats)

# Create a new DataFrame from the list of rows
compressed_node_features_df= pd.DataFrame(rows)
compressed_node_features_df.head()

Unnamed: 0,Meeting,Start Time,Start Time_min,Start Time_max,Start Time_mean,Start Time_std,time_portion_min,time_portion_max,time_portion_mean,time_portion_std,...,third_person_pronoun_usage_mean,third_person_pronoun_usage_std,agreement_words_proportion_min,agreement_words_proportion_max,agreement_words_proportion_mean,agreement_words_proportion_std,disagreement_words_proportion_min,disagreement_words_proportion_max,disagreement_words_proportion_mean,disagreement_words_proportion_std
0,IS1000a,1200.0,1200.0,1200.0,1200.0,0.0,0.061873,0.504586,0.305772,0.183461,...,0.0,0.0,0.0,0.08547,0.031102,0.037704,0.0,0.025424,0.006356,0.012712
1,IS1000a,1320.0,1320.0,1320.0,1320.0,0.0,0.167932,0.389732,0.281295,0.091612,...,0.005288,0.006166,0.021739,0.04878,0.035875,0.01114,0.0,0.01087,0.005098,0.005913
2,IS1000b,300.0,300.0,300.0,300.0,0.0,0.0,0.574545,0.260003,0.280661,...,0.003676,0.007353,0.0,0.09589,0.025811,0.046848,0.0,0.036765,0.010904,0.017541
3,IS1000b,420.0,420.0,420.0,420.0,0.0,0.011573,0.79613,0.258217,0.364977,...,0.005016,0.006176,0.037037,0.111111,0.073717,0.037866,0.0,0.045455,0.013215,0.021775
4,IS1000b,600.0,600.0,600.0,600.0,0.0,0.206471,0.282437,0.244748,0.032408,...,0.0,0.0,0.039474,0.115044,0.074254,0.031857,0.01,0.039474,0.024606,0.013244


lets load in the cohesion annotations 

In [11]:
# Load question-level scores df 
question_level_df = pd.read_pickle("../Cohesion_Annotations/Question_Split_data.pkl")
category_level_df = pd.read_pickle("../Cohesion_Annotations/Cohesion_split_data.pkl")


In [22]:
def filter_and_append_category_level(category_level_df, compressed_node_features_df, column_name, kappa_constraint):
    # Filter category_level_df for rows where 'Meeting' and 'Start' match compressed_node_features_df
    filtered_df = category_level_df[
        category_level_df[['Meeting', 'Start']].apply(tuple, axis=1).isin(
            compressed_node_features_df[['Meeting', 'Start Time']].apply(tuple, axis=1)
        )
    ]
    
    # Further filter rows where the kappa score for the given column is above the kappa_constraint
    filtered_df = filtered_df[filtered_df[column_name + '_Kappa'] >= kappa_constraint]
    
    # Merge the filtered category_level_df with compressed_node_features_df based on 'Meeting' and 'Start'
    merged_df = pd.merge(compressed_node_features_df, filtered_df[['Meeting', 'Start', column_name + '_Average']], 
                        left_on=['Meeting', 'Start Time'], right_on=['Meeting', 'Start'], how='inner')
    
    # Return the resulting dataframe with the appended average score column
    return merged_df.drop(columns='Start')



In [28]:
def binarize_and_filter(df, column_name, lower_threshold, upper_threshold):
    # Create a new binary column based on the thresholds
    df['Binary_Column'] = df[column_name].apply(lambda x: 0 if x < lower_threshold else (1 if x > upper_threshold else None))

    # Drop rows where the Binary_Column is None (values between the thresholds)
    df_filtered = df.dropna(subset=['Binary_Column'])

    return df_filtered

In [44]:
labeled_df = filter_and_append_category_level(category_level_df, compressed_node_features_df, 'Cohesion', 0.2)

In [45]:
len(labeled_df[labeled_df['Cohesion_Average'] < 3.5])

15

In [46]:
binarized_labeled_df = binarize_and_filter(labeled_df, 'Cohesion_Average', 3.5, 4.5)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assuming df is your dataframe and 'Binary_Column' is your target variable
X = binarized_labeled_df.drop(columns=['Binary_Column'])  # Drop the target variable
y = binarized_labeled_df['Binary_Column']  # Target variable

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a naive classifier: predict only the majority class (in this case, '1')
majority_class = y_train.mode()[0]  # Find the most frequent class in the training set

# Create a dummy prediction for the test set (predicting the majority class for all)
y_pred = [majority_class] * len(y_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Output the results
print(f"Naive Classifier Benchmark (Majority Class: {majority_class})")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Naive Classifier Benchmark (Majority Class: 1.0)
Accuracy: 0.8462
Precision: 0.8462
Recall: 1.0000
F1-Score: 0.9167


Lets start wit

okay