In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV files referenced
# under /content/drive/MyDrive in the following cells can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Make sure Google Drive is mounted; `drive` comes from the google.colab
# import in the first cell.
drive.mount('/content/drive')

# Locations of the per-modality prediction files (video, audio, text).
pred_video_file_path, pred_audio_file_path, pred_text_file_path = (
    '/content/drive/MyDrive/project/pred_video.csv',
    '/content/drive/MyDrive/project/pred_audio.csv',
    '/content/drive/MyDrive/project/pred_text.csv',
)

# Read every file into its own DataFrame, keeping the video/audio/text order.
pred_video_df, pred_audio_df, pred_text_df = (
    pd.read_csv(csv_path)
    for csv_path in (pred_video_file_path, pred_audio_file_path, pred_text_file_path)
)



In [None]:
import pandas as pd

# Annotation file that holds the true labels.
true_labels_csv = '/content/drive/MyDrive/Data/annotation_testing.csv'
true_labels_df = pd.read_csv(true_labels_csv)

# Preview the first rows to check the columns that were read.
print(true_labels_df.head())


In [None]:
# Remove the file extension from 'Filename' in every frame. The extension is
# matched as a plain substring (regex disabled), one extension per frame.
for frame, extension in (
    (pred_video_df, '.json'),
    (pred_audio_df, '.wav'),
    (pred_text_df, '.mp4'),
    (true_labels_df, '.mp4'),
):
    frame['Filename'] = frame['Filename'].str.replace(extension, '', regex=False)

# Preview each frame after the clean-up; every heading after the first one
# starts with a blank line.
for heading, frame in (
    ("Video Predictions:", pred_video_df),
    ("\nAudio Predictions:", pred_audio_df),
    ("\nText Predictions:", pred_text_df),
    ("\nAnnotation Testing Labels:", true_labels_df),
):
    print(heading)
    print(frame.head())


# Merge

In [None]:
# Show the first rows of the three prediction frames and the label frame.
frames_to_preview = {
    "Video Predictions:": pred_video_df,
    "\nAudio Predictions:": pred_audio_df,
    "\nText Predictions:": pred_text_df,
    "\nAnnotation Testing Labels:": true_labels_df,
}
for heading, frame in frames_to_preview.items():
    print(heading)
    print(frame.head())

In [None]:
def merge_predictions_and_labels(video_df, audio_df, text_df, labels_df):
    """Join the three prediction frames and the label frame on 'Filename'.

    Columns shared by the video and audio frames receive the suffixes
    '_video' and '_audio'. The text frame's 'leadership_score' column is
    renamed to 'leadership_score_text'. In the last join, columns that exist
    on both sides get '_text' (prediction side) and '_true' (label side).
    Every join uses pandas' default merge mode, so only filenames present in
    all four frames survive.

    Returns the merged DataFrame; the input frames are not modified.
    """
    predictions = (
        video_df
        .merge(audio_df, on='Filename', suffixes=('_video', '_audio'))
        .merge(text_df, on='Filename')
        .rename(columns={'leadership_score': 'leadership_score_text'})
    )

    # Attach the true labels, disambiguating clashing column names.
    return predictions.merge(labels_df, on='Filename', suffixes=('_text', '_true'))

# Combine the three prediction frames with the true labels.
final_df = merge_predictions_and_labels(
    video_df=pred_video_df,
    audio_df=pred_audio_df,
    text_df=pred_text_df,
    labels_df=true_labels_df,
)

# Show the merged frame.
print(final_df)

In [None]:
# Print the first 'Filename' values of every frame (the merge key).
for frame in (pred_video_df, pred_audio_df, pred_text_df, true_labels_df):
    print(frame['Filename'].head())


In [None]:
# Drop the 'layer_5' and 'layer_7' columns. errors='ignore' keeps the cell
# safe to re-run and tolerant of inputs that never had these columns; without
# it a second execution raised KeyError.
final_df = final_df.drop(columns=['layer_5', 'layer_7'], errors='ignore')

In [None]:
print(final_df.columns)
# Remove the layer columns, silently skipping any that are not present.
columns_to_drop = ['layer_5', 'layer_7']
final_df = final_df.drop(columns=columns_to_drop, errors='ignore')


# **Average**

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
# Leadership predictions of each modality as NumPy arrays.
predicted_video = final_df['leadership_score_video'].to_numpy()
predicted_audio = final_df['leadership_score_audio'].to_numpy()
predicted_text = final_df['leadership_score_text'].to_numpy()

# Unweighted mean of the three modalities.
average_predicted_scores = (predicted_video + predicted_audio + predicted_text) / 3

# True leadership score taken from the merged label columns.
y_true = final_df['leadership_score'].to_numpy()

# Regression metrics of the averaged prediction against the true score.
mae = metrics.mean_absolute_error(y_true, average_predicted_scores)
mse = metrics.mean_squared_error(y_true, average_predicted_scores)
r2 = metrics.r2_score(y_true, average_predicted_scores)

# Report the three metrics, one per line.
for label, value in (
    ('Mean Absolute Error (MAE):', mae),
    ('Mean Squared Error (MSE):', mse),
    ('R^2 Score:', r2),
):
    print(label, value)

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics

# final_df must provide '<trait>_video', '<trait>_audio', '<trait>_text' and
# '<trait>_true' columns for each of the five traits used below.
def _average_across_modalities(trait):
    """Return the unweighted mean of the video, audio and text predictions."""
    return (final_df[f'{trait}_video'].values
            + final_df[f'{trait}_audio'].values
            + final_df[f'{trait}_text'].values) / 3


def _regression_report(actual, predicted):
    """Return MAE, MSE and R^2 of `predicted` against `actual`, keyed by name."""
    return {
        'MAE': metrics.mean_absolute_error(actual, predicted),
        'MSE': metrics.mean_squared_error(actual, predicted),
        'R^2': metrics.r2_score(actual, predicted),
    }


# Averaged prediction per trait.
predicted_extraversion = _average_across_modalities('extraversion')
predicted_neuroticism = _average_across_modalities('neuroticism')
predicted_agreeableness = _average_across_modalities('agreeableness')
predicted_conscientiousness = _average_across_modalities('conscientiousness')
predicted_openness = _average_across_modalities('openness')

# True value per trait.
extraversion_true = final_df['extraversion_true'].values
neuroticism_true = final_df['neuroticism_true'].values
agreeableness_true = final_df['agreeableness_true'].values
conscientiousness_true = final_df['conscientiousness_true'].values
openness_true = final_df['openness_true'].values

# Regression metrics per trait.
metrics_extraversion = _regression_report(extraversion_true, predicted_extraversion)
metrics_neuroticism = _regression_report(neuroticism_true, predicted_neuroticism)
metrics_agreeableness = _regression_report(agreeableness_true, predicted_agreeableness)
metrics_conscientiousness = _regression_report(conscientiousness_true, predicted_conscientiousness)
metrics_openness = _regression_report(openness_true, predicted_openness)

# Print one section per trait; every section after the first starts with a
# blank line.
for position, (trait_label, report) in enumerate((
    ('Extraversion', metrics_extraversion),
    ('Neuroticism', metrics_neuroticism),
    ('Agreeableness', metrics_agreeableness),
    ('Conscientiousness', metrics_conscientiousness),
    ('Openness', metrics_openness),
)):
    heading = f'Metrics for {trait_label}:'
    print(heading if position == 0 else '\n' + heading)
    for metric_name, value in report.items():
        print(f'{metric_name}: {value:.4f}')


# Weighted

In [None]:
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics

# Per-modality leadership predictions followed by the true score, each pulled
# out of final_df as an array.
predicted_video, predicted_audio, predicted_text, y_true = (
    final_df[column].values
    for column in (
        'leadership_score_video',
        'leadership_score_audio',
        'leadership_score_text',
        'leadership_score',
    )
)

# Generate all possible weight combinations
def generate_triplets():
    """Return every ordered weight triplet from the candidate grid that sums to 1.

    Each triplet is a tuple of three values drawn (with repetition) from the
    grid 0.15, 0.2, ..., 0.5; callers unpack it as (video, audio, text)
    weights. Triplets are returned in itertools.product order.
    """
    numbers = [0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

    # Compare against 1 with a tolerance instead of `==`: the grid values are
    # not exactly representable in binary floating point, so an exact equality
    # test can reject combinations that mathematically sum to 1. Grid sums
    # differ by multiples of 0.05, so 1e-9 cannot admit a wrong triplet.
    return [
        triplet
        for triplet in product(numbers, repeat=3)
        if abs(sum(triplet) - 1) < 1e-9
    ]

# Function to calculate metrics for each weight set
def calculate_metrics(weight_sets, predicted_video, predicted_audio, predicted_text, y_true):
    """Score the weighted blend of the three predictions for every weight set.

    Each entry of `weight_sets` is unpacked as (video, audio, text) weights.
    The blend is the weighted sum of the three prediction inputs and is scored
    against `y_true`.

    Returns three lists (MAE, MSE, R^2), each ordered like `weight_sets`.
    """
    mae_scores, mse_scores, r2_scores = [], [], []

    for w_video, w_audio, w_text in weight_sets:
        blended = w_video * predicted_video + w_audio * predicted_audio + w_text * predicted_text

        mae_scores.append(metrics.mean_absolute_error(y_true, blended))
        mse_scores.append(metrics.mean_squared_error(y_true, blended))
        r2_scores.append(metrics.r2_score(y_true, blended))

    return mae_scores, mse_scores, r2_scores

# Candidate weight triplets and their scores (the three lists are aligned
# with weight_sets by index).
weight_sets = generate_triplets()
mae_scores, mse_scores, r2_scores = calculate_metrics(
    weight_sets, predicted_video, predicted_audio, predicted_text, y_true
)

# argsort is ascending: the ten lowest errors are the first ten entries and
# the ten highest R2 values are the last ten.
sorted_indices_mae = np.argsort(mae_scores)[:10]
sorted_indices_mse = np.argsort(mse_scores)[:10]
sorted_indices_r2 = np.argsort(r2_scores)[-10:]


def _scores_at(scores, indices):
    """Return the entries of `scores` at `indices`, in that order."""
    return [scores[i] for i in indices]


def _weight_labels(indices):
    """Return the bar label of every weight set at `indices`."""
    return [f'Weights: {weight_sets[i]}' for i in indices]


sorted_mae_scores = _scores_at(mae_scores, sorted_indices_mae)
sorted_mse_scores = _scores_at(mse_scores, sorted_indices_mse)
sorted_r2_scores = _scores_at(r2_scores, sorted_indices_r2)

sorted_weight_labels_mae = _weight_labels(sorted_indices_mae)
sorted_weight_labels_mse = _weight_labels(sorted_indices_mse)
sorted_weight_labels_r2 = _weight_labels(sorted_indices_r2)

# One horizontal bar chart per metric. The MAE and MSE series are reversed
# before plotting; the R2 series is plotted in ascending order as computed.
fig, axs = plt.subplots(3, 1, figsize=(10, 20))

panels = (
    (sorted_weight_labels_mae[::-1], sorted_mae_scores[::-1], 'blue',
     'Top 10 Smallest Mean Absolute Error (MAE)', 'MAE'),
    (sorted_weight_labels_mse[::-1], sorted_mse_scores[::-1], 'green',
     'Top 10 Smallest Mean Squared Error (MSE)', 'MSE'),
    (sorted_weight_labels_r2, sorted_r2_scores, 'orange',
     'Top 10 Largest R-squared (R2)', 'R2'),
)
for ax, (bar_labels, bar_values, colour, title, x_label) in zip(axs, panels):
    ax.barh(bar_labels, bar_values, color=colour, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Weight Sets')

plt.tight_layout()
plt.show()

In [None]:
# Full rankings of the weight sets: ascending error for MAE and MSE,
# descending value for R2.
sorted_indices_mae = np.argsort(mae_scores)
sorted_indices_mse = np.argsort(mse_scores)
sorted_indices_r2 = np.argsort(r2_scores)[::-1]

# Keep the best `top_n` entries of every ranking.
top_n = 5
top_indices_mae, top_indices_mse, top_indices_r2 = (
    ranking[:top_n]
    for ranking in (sorted_indices_mae, sorted_indices_mse, sorted_indices_r2)
)

# One section per ranking; every line lists all three metrics of the weight set.
for heading, top_indices in (
    ("Top 5 Weight Sets based on MAE:", top_indices_mae),
    ("\nTop 5 Weight Sets based on MSE:", top_indices_mse),
    ("\nTop 5 Weight Sets based on R2:", top_indices_r2),
):
    print(heading)
    for idx in top_indices:
        print(f"Weights: {weight_sets[idx]} - MAE: {mae_scores[idx]:.4f}, MSE: {mse_scores[idx]:.4f}, R2: {r2_scores[idx]:.4f}")

# Leadership level

In [None]:
# Show the weight set ranked first by MAE (smallest error).
print(weight_sets[top_indices_mae[0]])

In [None]:
import pandas as pd

# Read the weighted predictions and the true labels.
# NOTE(review): 'weighted_predictions.csv' is written by the export cell at
# the end of this notebook; presumably that cell must have been run (with
# /content as the working directory) before this one - confirm.
weighted_predictions_df = pd.read_csv('/content/weighted_predictions.csv')
true_labels_df = pd.read_csv('/content/drive/MyDrive/Data/annotation_testing.csv')

# Lower-case every header so both frames use the same column naming.
for frame in (weighted_predictions_df, true_labels_df):
    frame.columns = frame.columns.str.lower()

# Preview both frames to confirm what was loaded.
for frame in (weighted_predictions_df, true_labels_df):
    print(frame.head())
from sklearn import metrics

# The Big Five personality traits. Each name is a column of the true-label
# frame and the prefix of the matching '<trait>_weighted' prediction column.
traits = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']

# Calculating MAE and MSE
def calculate_mae_mse(pred_df, true_df, traits):
    """Compute MAE and MSE of the weighted predictions for every trait.

    For each trait, `true_df[trait]` is compared with
    `pred_df['<trait>_weighted']`.

    Returns a dict of the form {trait: {'MAE': value, 'MSE': value}}.

    NOTE(review): the two columns are compared row by row in positional
    order; this assumes both frames list the same samples in the same order -
    confirm against how the prediction file was produced.
    """
    def _errors_for(trait):
        actual = true_df[trait].values
        predicted = pred_df[f'{trait}_weighted'].values
        return {
            'MAE': metrics.mean_absolute_error(actual, predicted),
            'MSE': metrics.mean_squared_error(actual, predicted),
        }

    return {trait: _errors_for(trait) for trait in traits}

# Calculate MAE and MSE for the weighted predictions.
weighted_metrics = calculate_mae_mse(weighted_predictions_df, true_labels_df, traits)

# Print MAE and MSE per trait. The loop variable is deliberately not called
# `metrics`: that name is the sklearn module imported in this notebook, and
# rebinding it to a dict here broke every later `metrics.<function>` call
# (including a second call to calculate_mae_mse).
for trait, trait_metrics in weighted_metrics.items():
    print(f"\nWeighted Metrics for {trait.capitalize()}:")
    for metric_name, value in trait_metrics.items():
        print(f"{metric_name}: {value:.4f}")


In [None]:
def calculate_leadership_score(df):
    """Add a 'leadership_score' column computed from the five trait columns.

    The frame is changed in place: its headers are lower-cased, an
    'extroversion' column is renamed to 'extraversion', and the score column
    is added as a fixed linear combination of the trait columns. The same
    frame object is returned.
    """
    # Normalise the headers before looking up the trait columns.
    df.columns = df.columns.str.lower()
    df.rename(columns={'extroversion': 'extraversion'}, inplace=True)

    # Coefficient applied to each trait column; neuroticism is negative.
    trait_coefficients = {
        'extraversion': 0.3261,
        'neuroticism': -0.087,
        'agreeableness': 0.1957,
        'conscientiousness': 0.2283,
        'openness': 0.163,
    }

    # Accumulate the terms left to right, in the order listed above.
    terms = [df[trait] * coefficient for trait, coefficient in trait_coefficients.items()]
    total = terms[0]
    for term in terms[1:]:
        total = total + term

    df['leadership_score'] = total
    return df

def leadership_level(score):
    """Map a leadership score to one of five labelled bands.

    Scores in [0.8, 1.0] are 'Level 1'; each lower band of width 0.2 is one
    level further down, ending with 'Level 5' for [0, 0.2). Any score that is
    not within [0, 1.0] (NaN included) yields 'Unknown'.
    """
    if not (0 <= score <= 1.0):
        return 'Unknown'

    # Bands from highest to lowest; the first lower bound reached wins.
    bands = (
        (0.8, 'Level 1 (0.8-1)'),
        (0.6, 'Level 2 (0.6-0.8)'),
        (0.4, 'Level 3 (0.4-0.6)'),
        (0.2, 'Level 4 (0.2-0.4)'),
        (0, 'Level 5 (0-0.2)'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Unknown'


In [None]:
# Weight set with the smallest MAE; its three entries are applied to the
# video, audio and text columns in that order.
best_weights = weight_sets[top_indices_mae[0]]

# Trait names and the column-name templates of the three modalities.
traits = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']
trait_columns = ['{}_video', '{}_audio', '{}_text']

# Blend the three modality columns of every trait with the best weights and
# store the result as '<trait>_pred'.
for trait in traits:
    weighted_columns = (
        modality_weight * final_df[column_template.format(trait)]
        for modality_weight, column_template in zip(best_weights, trait_columns)
    )
    final_df[f'{trait}_pred'] = sum(weighted_columns)

# Contribution of each trait to the leadership score.
# NOTE(review): neuroticism is weighted +0.1 here, while the
# calculate_leadership_score cell above uses a negative coefficient (-0.087)
# for the same trait - confirm which sign is intended.
weights = {
    'extraversion': 0.3,
    'neuroticism': 0.1,
    'agreeableness': 0.2,
    'conscientiousness': 0.25,
    'openness': 0.15
}

# Leadership score as the weighted sum of the predicted traits.
# NOTE: this overwrites final_df['leadership_score'], the column earlier
# cells read as the true label (y_true); re-running those cells afterwards
# would score predictions against this computed value.
final_df['leadership_score'] = sum(
    weights[trait] * final_df[f'{trait}_pred'] for trait in traits
)

# Define a function for leadership levels
def leadership_level(score):
    """Return the labelled leadership band for a score.

    Bands are 0.2 wide: [0.8, 1.0] is 'Level 1' down to [0, 0.2) for
    'Level 5'. Scores that are not inside [0, 1.0], NaN included, are
    'Unknown'.
    """
    in_range = 0 <= score <= 1.0
    if not in_range:
        return 'Unknown'
    # From here on the score is within [0, 1.0]; test the lower bounds from
    # the top band downwards.
    if score >= 0.8:
        return 'Level 1 (0.8-1)'
    if score >= 0.6:
        return 'Level 2 (0.6-0.8)'
    if score >= 0.4:
        return 'Level 3 (0.4-0.6)'
    if score >= 0.2:
        return 'Level 4 (0.2-0.4)'
    return 'Level 5 (0-0.2)'

# Apply the function to compute leadership levels.
final_df['leadership_level'] = final_df['leadership_score'].apply(leadership_level)

# Print the results. final_df is merged on 'Filename' (capital F), so
# selecting 'filename' raised KeyError; the lower-case name is only used as a
# fallback in case the headers were lower-cased elsewhere.
filename_col = 'Filename' if 'Filename' in final_df.columns else 'filename'
print(final_df[[filename_col, 'leadership_score', 'leadership_level']])

# Histogram of the predicted leadership scores.
plt.figure(figsize=(10, 6))
plt.hist(final_df['leadership_score'], bins=20, color='blue', alpha=0.7)
plt.title('Distribution of Leadership Scores')
plt.xlabel('Leadership Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Bar plot of how many samples fall into each leadership level, with the
# levels sorted by label.
plt.figure(figsize=(10, 6))
final_df['leadership_level'].value_counts().sort_index().plot(kind='bar', color='green', alpha=0.7)
plt.title('Distribution of Leadership Levels')
plt.xlabel('Leadership Level')
plt.ylabel('Count')
plt.grid(True)
plt.show()


In [None]:
# The Big Five personality traits. Each name is a column of the true-label
# CSV read below and the prefix of the '<trait>_pred' columns this cell adds.
traits = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']

# Function to calculate leadership score
def calculate_leadership_score(df):
    """Add a 'leadership_score' column as a weighted sum of '<trait>_pred' columns.

    `df` must contain a '<trait>_pred' column for each of the five traits in
    the weight table below. The column is added to `df` in place and the same
    frame is returned.

    The traits are taken from the weight table itself instead of the
    notebook-level `traits` list, so the result no longer depends on which
    cell last assigned that global. The table lists the traits in the same
    order as that list, so the summation order is unchanged.

    Note: this definition replaces the `calculate_leadership_score` declared
    in an earlier cell, which reads the raw trait columns and uses different
    coefficients.
    """
    weights = {
        'extraversion': 0.3,
        'neuroticism': 0.1,
        'agreeableness': 0.2,
        'conscientiousness': 0.25,
        'openness': 0.15
    }
    df['leadership_score'] = sum(
        weight * df[f'{trait}_pred'] for trait, weight in weights.items()
    )
    return df

# Function to define leadership levels
def leadership_level(score):
    """Classify a leadership score into its labelled band.

    The bands are checked from the highest ([0.8, 1.0], 'Level 1') to the
    lowest ([0, 0.2), 'Level 5'); a score matching none of them, such as a
    negative value, a value above 1.0 or NaN, is 'Unknown'.
    """
    # Label of every band paired with its membership test.
    labelled_ranges = (
        ('Level 1 (0.8-1)', lambda s: 0.8 <= s <= 1.0),
        ('Level 2 (0.6-0.8)', lambda s: 0.6 <= s < 0.8),
        ('Level 3 (0.4-0.6)', lambda s: 0.4 <= s < 0.6),
        ('Level 4 (0.2-0.4)', lambda s: 0.2 <= s < 0.4),
        ('Level 5 (0-0.2)', lambda s: 0 <= s < 0.2),
    )
    return next(
        (label for label, contains in labelled_ranges if contains(score)),
        'Unknown',
    )

# Read true labels from CSV.
true_labels_df = pd.read_csv('/content/drive/MyDrive/Data/annotation_testing.csv')

# Mirror every true trait into a '<trait>_pred' column so that
# calculate_leadership_score, which reads '<trait>_pred', can be reused on
# the labels.
for trait in traits:
    true_labels_df[f'{trait}_pred'] = true_labels_df[trait]

# Leadership score and level of the true labels.
true_labels_df = calculate_leadership_score(true_labels_df)
true_labels_df['leadership_level'] = true_labels_df['leadership_score'].apply(leadership_level)

# Print results.
print(true_labels_df[['Filename', 'leadership_score', 'leadership_level']])

# Histogram of the true leadership scores.
plt.figure(figsize=(10, 6))
plt.hist(true_labels_df['leadership_score'], bins=20, color='red', alpha=0.7)
plt.title('Distribution of True Leadership Scores')
plt.xlabel('Leadership Score')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

# Bar plot of the true leadership levels, sorted by label.
plt.figure(figsize=(10, 6))
true_labels_df['leadership_level'].value_counts().sort_index().plot(kind='bar', color='purple', alpha=0.7)
plt.title('Distribution of True Leadership Levels')
plt.xlabel('Leadership Level')
plt.ylabel('Count')
plt.grid(True)
plt.show()

# Compare the distributions of predicted and true leadership scores.
plt.figure(figsize=(10, 6))
plt.hist(final_df['leadership_score'], bins=20, color='blue', alpha=0.7, label='Predicted')
plt.hist(true_labels_df['leadership_score'], bins=20, color='red', alpha=0.5, label='True')
plt.title('Comparison of Leadership Scores Distribution')
plt.xlabel('Leadership Score')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.show()

# Compare the distributions of predicted and true leadership levels.
# Bar plots position bars by row order, not by label, so both count series
# are first aligned on the same sorted set of levels (missing levels count as
# 0). Without this, a level present in only one series shifted that series'
# bars under the wrong tick label.
predicted_level_counts = final_df['leadership_level'].value_counts()
true_level_counts = true_labels_df['leadership_level'].value_counts()
all_levels = sorted(set(predicted_level_counts.index) | set(true_level_counts.index))

plt.figure(figsize=(10, 6))
predicted_level_counts.reindex(all_levels, fill_value=0).plot(kind='bar', color='blue', alpha=0.7, position=0, width=0.4, label='Predicted')
true_level_counts.reindex(all_levels, fill_value=0).plot(kind='bar', color='red', alpha=0.5, position=1, width=0.4, label='True')
plt.title('Comparison of Leadership Levels Distribution')
plt.xlabel('Leadership Level')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np

# Weight set with the smallest MAE (requires the ranking cell to have run);
# its three entries apply to the video, audio and text columns in that order.
best_weights = weight_sets[top_indices_mae[0]]

# Trait names and the column-name templates of the three modalities.
traits = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']
trait_columns = ['{}_video', '{}_audio', '{}_text']

# Weighted prediction per trait: best-weight blend of the three modalities.
for trait in traits:
    final_df[f'{trait}_weighted'] = sum(best_weights[i] * final_df[trait_col.format(trait)] for i, trait_col in enumerate(trait_columns))

# Average prediction per trait: row-wise mean of the three modalities.
for trait in traits:
    final_df[f'{trait}_average'] = final_df[[trait_col.format(trait) for trait_col in trait_columns]].mean(axis=1)

# final_df is merged on 'Filename' (capital F), so selecting 'filename' raised
# KeyError; the lower-case name is only used as a fallback in case the headers
# were lower-cased elsewhere.
filename_col = 'Filename' if 'Filename' in final_df.columns else 'filename'

# Weighted predictions: select, rename for clarity, export, preview.
weighted_prediction_df = final_df[[filename_col] + [f'{trait}_weighted' for trait in traits]].copy()
weighted_prediction_df.columns = ['Filename'] + [f'{trait.capitalize()}_Weighted' for trait in traits]
weighted_prediction_df.to_csv('weighted_predictions.csv', index=False)
print(weighted_prediction_df.head())

# Average predictions: select, rename for clarity, export, preview.
average_prediction_df = final_df[[filename_col] + [f'{trait}_average' for trait in traits]].copy()
average_prediction_df.columns = ['Filename'] + [f'{trait.capitalize()}_Average' for trait in traits]
average_prediction_df.to_csv('average_predictions.csv', index=False)
print(average_prediction_df.head())
