In [4]:
import pandas as pd

# Single root directory for every input file, so the notebook can be pointed at a
# different checkout by editing one line instead of four.
DATA_DIR = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data"

emotion_lexicon_path = f"{DATA_DIR}/Emotion_Lexicon.csv"
essays_path = f"{DATA_DIR}/essays_utf8.csv"
mairesse_path = f"{DATA_DIR}/mairesse.csv"
mbti_1_path = f"{DATA_DIR}/mbti_1.csv"

emotion_lexicon_df = pd.read_csv(emotion_lexicon_path)
essays_df = pd.read_csv(essays_path)
# header=None: no line of mairesse.csv is used as column names, so the columns are
# labelled 0..N-1 and every line of the file becomes a data row.
mairesse_df = pd.read_csv(mairesse_path, header=None)
mbti_1_df = pd.read_csv(mbti_1_path)

Current traits used as features: Neuroticism, Extraversion, Openness, Agreeableness, Conscientiousness, Sense of coherence, Balance, Comprehensibility, Manageability, Meaningfulness, Reflection.

essays from https://web.archive.org/web/20160519045708/http://mypersonality.org/wiki/doku.php?id=wcpr13

mairesse from https://github.com/SenticNet/personality-detection

to add resilience traits later - https://www.sciencedirect.com/science/article/abs/pii/S0191886918300576

to add coping strategies and emotional problems traits later - https://link.springer.com/article/10.1007/s12144-022-03944-9

to add MBTI from text - https://www.personalitycafe.com/login/

In [5]:
import pandas as pd

# Give the Mairesse columns positional names (mairesse_0, mairesse_1, ...) and drop
# the first row of the header-less read.
# NOTE(review): presumably row 0 holds the file's own header line - confirm.
# The guard makes the cell safe to re-run: without it every extra execution would
# discard one more data row.
mairesse_column_names = [f"mairesse_{i}" for i in range(mairesse_df.shape[1])]
if list(mairesse_df.columns) != mairesse_column_names:
    # .copy() detaches the slice from the original frame so the column assignment
    # below does not trigger SettingWithCopyWarning.
    mairesse_df = mairesse_df.iloc[1:].copy()
    mairesse_df.columns = mairesse_column_names

# Both join keys are compared as strings (mairesse_0 is the key on the Mairesse side)
essays_df['#AUTHID'] = essays_df['#AUTHID'].astype(str)
mairesse_df['mairesse_0'] = mairesse_df['mairesse_0'].astype(str)

# Sort both DataFrames by their key
essays_df = essays_df.sort_values(by="#AUTHID")
mairesse_df = mairesse_df.sort_values(by="mairesse_0")

# Inner join: only authors present in both frames are kept
essays_mairesse_df = pd.merge(essays_df, mairesse_df, left_on="#AUTHID", right_on="mairesse_0", how='inner')

# Persist the merged frame for the later cells that read merged_data.csv
save_path = '/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged_data.csv'
essays_mairesse_df.to_csv(save_path, index=False)
print(essays_mairesse_df.head())
print("Merged DataFrame saved successfully.")

           #AUTHID                                               TEXT cEXT  \
0  1997_012113.txt  The lights  are all out here in Hardin House. ...    y   
1  1997_012750.txt  I have watching Comedy Central for the past ho...    y   
2  1997_030596.txt  Well, I figured I should write this right now ...    y   
3  1997_033283.txt  I don't like having to write an my couch. I ne...    y   
4  1997_053414.txt  This blank screen is staring at me and my fing...    y   

  cNEU cAGR cCON cOPN       mairesse_0  mairesse_1  mairesse_2  ...  \
0    n    y    y    n  1997_012113.txt   -0.906225   -0.282501  ...   
1    n    n    n    y  1997_012750.txt   -0.493431   -1.352080  ...   
2    y    y    y    y  1997_030596.txt    1.118787   -0.338031  ...   
3    n    y    n    y  1997_033283.txt   -0.527438    0.542460  ...   
4    n    y    y    y  1997_053414.txt    0.122799    0.596609  ...   

   mairesse_75  mairesse_76  mairesse_77  mairesse_78  mairesse_79  \
0    -0.422267     0.744863    -0.

# Correlations between the Big Five traits (Neuroticism, Extraversion, Openness, Agreeableness, Conscientiousness) and Sense of coherence, Balance, Comprehensibility, Manageability, Meaningfulness, Reflection
(https://journals.sagepub.com/doi/10.1177/1359105319884597)

In [None]:
import csv

# Flatten the nested `data` mapping (category -> trait -> metrics) into one CSV row
# per (category, trait) pair.
# The encoding is pinned to UTF-8 because the header contains the non-ASCII name
# 'τ2': the platform default encoding may be unable to write it, and pandas reads
# CSV files back as UTF-8 by default.
# NOTE(review): this file is written relative to the working directory, while a
# later cell reads it from the project data directory - confirm they coincide.
with open('psych_traits_digital_human_dict.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    # Header: two identifying columns followed by eight metric columns
    writer.writerow(['Category', 'Trait', 'r_effect', 'g_effect', 'CI_r', 'CI_g', 'k', 'Q', 'I2', 'τ2'])

    for category, traits in data.items():
        for trait, values in traits.items():
            # Metric values are emitted in the inner dict's own order.
            # NOTE(review): assumes that order matches the header above - confirm.
            writer.writerow([category, trait, *values.values()])

In [17]:
# Equal weighting of the two effect sizes when they are blended into one number
weights = {'r_effect': 0.5, 'g_effect': 0.5}

# The Big Five traits looked up under every psychological construct
researched_traits = ["Neuroticism", "Extraversion", "Openness", "Agreeableness", "Conscientiousness"]

# construct -> trait -> {'composite_feature', 'k', 'Q', 'I2', 'τ2'}
composite_features = {}

for construct_name, trait_table in data.items():
    per_trait = {}
    composite_features[construct_name] = per_trait
    for trait_name in researched_traits:
        stats = trait_table.get(trait_name)
        if stats is None:
            # Trait not reported for this construct: leave it out entirely
            continue

        r, g = stats['r_effect'], stats['g_effect']
        k, Q, I2, τ2 = stats['k'], stats['Q'], stats['I2'], stats['τ2']

        # Weighted blend of r and g; the remaining metrics are carried along unchanged
        per_trait[trait_name] = {
            'composite_feature': weights['r_effect'] * r + weights['g_effect'] * g,
            'k': k,
            'Q': Q,
            'I2': I2,
            'τ2': τ2
        }

# Full report: composite feature plus k, Q, I2, τ2 for every construct/trait pair
for construct_name, per_trait in composite_features.items():
    print(f"Composite features for {construct_name}:")
    for trait_name, stats in per_trait.items():
        print(f"  - {trait_name}:")
        print(f"    - Composite Feature: {stats['composite_feature']}")
        for metric_name in ('k', 'Q', 'I2', 'τ2'):
            print(f"    - {metric_name}: {stats[metric_name]}")
    print()

# Short report: composite feature only
for construct_name, per_trait in composite_features.items():
    print(f"Composite features for {construct_name}:")
    for trait_name, stats in per_trait.items():
        print(f"  - {trait_name}: {stats['composite_feature']}")
    print()

Composite features for Sense of coherence:
  - Neuroticism:
    - Composite Feature: -0.905
    - k: 21
    - Q: 369.75
    - I2: 94.59
    - τ2: 0.03
  - Extraversion:
    - Composite Feature: 0.41000000000000003
    - k: 17
    - Q: 119.87
    - I2: 86.65
    - τ2: 0.01
  - Openness:
    - Composite Feature: 0.2
    - k: 12
    - Q: 45.62
    - I2: 75.89
    - τ2: 0.0
  - Agreeableness:
    - Composite Feature: 0.43
    - k: 11
    - Q: 23.35
    - I2: 57.16
    - τ2: 0.0
  - Conscientiousness:
    - Composite Feature: 0.475
    - k: 12
    - Q: 62.27
    - I2: 82.34
    - τ2: 0.01

Composite features for Balance:
  - Neuroticism:
    - Composite Feature: -0.095
    - k: 2
    - Q: 3.52
    - I2: 70.17
    - τ2: 0.01
  - Extraversion:
    - Composite Feature: 0.07500000000000001
    - k: 2
    - Q: 0.06
    - I2: 0
    - τ2: 0.0
  - Openness:
    - Composite Feature: 0.10500000000000001
    - k: 2
    - Q: 0.06
    - I2: 0
    - τ2: 0.0
  - Agreeableness:
    - Composite Feature: 0.0

In [18]:
import csv
import os

# Destination of the composite-feature export
csv_file_name = "composite_features.csv"
target_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data"
csv_file_path = os.path.join(target_directory, csv_file_name)

# The encoding is pinned to UTF-8: the header holds the non-ASCII name 'τ2', which
# the platform default encoding may be unable to write, and pandas reads this file
# back as UTF-8 by default.
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row
    header = ["Psychological Construct", "Researched Trait", "Composite Feature", "k", "Q", "I2", "τ2"]
    csv_writer.writerow(header)

    # One row per (construct, trait) pair, columns in header order
    for psychological_construct, traits in composite_features.items():
        for researched_trait, metrics in traits.items():
            csv_writer.writerow([
                psychological_construct,
                researched_trait,
                metrics['composite_feature'],
                metrics['k'],
                metrics['Q'],
                metrics['I2'],
                metrics['τ2']
            ])

print(f"CSV file '{csv_file_path}' has been created.")

CSV file '/home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/composite_features.csv' has been created.


# Sub-Attribute Weights

In [11]:
# construct -> trait -> sub-attribute weight
sub_attribute_weights = {}

for construct_name, per_trait in composite_features.items():
    construct_weights = {}
    sub_attribute_weights[construct_name] = construct_weights
    for trait_name, stats in per_trait.items():
        # The weight is the plain mean of k and I2, taken exactly as stored.
        # NOTE(review): no min-max normalisation is applied, and the values are not
        # on a 0-1 scale (the report above shows k = 21 and I2 = 94.59), so the
        # result is dominated by I2. A normalised variant would be
        #   (k - min_k) / (max_k - min_k)  and  (I2 - min_I2) / (max_I2 - min_I2)
        # with min/max taken across all traits - confirm whether that is intended.
        construct_weights[trait_name] = (stats['k'] + stats['I2']) / 2

# Report the weights, grouped by construct
for construct_name, construct_weights in sub_attribute_weights.items():
    print(f"Sub-Attribute Weights for {construct_name}:")
    for trait_name, trait_weight in construct_weights.items():
        print(f"  - {trait_name}: {trait_weight}")

Sub-Attribute Weights for Sense of coherence:
  - Neuroticism: 57.795
  - Extraversion: 51.825
  - Openness: 43.945
  - Agreeableness: 34.08
  - Conscientiousness: 47.17
Sub-Attribute Weights for Balance:
  - Neuroticism: 36.085
  - Extraversion: 1.0
  - Openness: 1.0
  - Agreeableness: 1.0
  - Conscientiousness: 1.0
Sub-Attribute Weights for Comprehensibility:
  - Neuroticism: 29.09
  - Extraversion: 24.95
  - Openness: 1.0
  - Agreeableness: 1.0
  - Conscientiousness: 28.705
Sub-Attribute Weights for Manageability:
  - Neuroticism: 44.62
  - Extraversion: 3.73
  - Openness: 2.0
  - Agreeableness: 39.995
  - Conscientiousness: 27.42
Sub-Attribute Weights for Meaningfulness:
  - Neuroticism: 40.96
  - Extraversion: 1.0
  - Openness: 1.0
  - Agreeableness: 1.0
  - Conscientiousness: 1.0
Sub-Attribute Weights for Reflection:
  - Neuroticism: 37.57
  - Extraversion: 1.5
  - Openness: 1.5
  - Agreeableness: 1.5
  - Conscientiousness: 19.46


In [19]:
import csv
import os

# Destination of the sub-attribute weight export (same directory as the other CSVs)
csv_file_name_sub_attr_weights = "sub_attribute_weights.csv"
target_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data"
csv_file_path_sub_attr_weights = os.path.join(target_directory, csv_file_name_sub_attr_weights)

with open(csv_file_path_sub_attr_weights, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row, then one row per (construct, trait) pair
    header = ["Psychological Construct", "Researched Trait", "Sub-Attribute Weight"]
    csv_writer.writerow(header)
    csv_writer.writerows(
        [construct_name, trait_name, trait_weight]
        for construct_name, construct_weights in sub_attribute_weights.items()
        for trait_name, trait_weight in construct_weights.items()
    )

print(f"CSV file '{csv_file_path_sub_attr_weights}' has been created.")

CSV file '/home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/sub_attribute_weights.csv' has been created.


In [12]:
# construct -> sum over its traits of (composite feature * sub-attribute weight)
composite_trait_scores = {}

for construct_name, per_trait in composite_features.items():
    running_total = 0  # stays the integer 0 if the construct has no traits
    for trait_name, stats in per_trait.items():
        # Plain left-to-right accumulation, one product per researched trait
        running_total += stats['composite_feature'] * sub_attribute_weights[construct_name][trait_name]
    composite_trait_scores[construct_name] = running_total

# Report one score per construct
for construct_name, total in composite_trait_scores.items():
    print(f"Composite-Trait Score for {construct_name}: {total}")

Composite-Trait Score for Sense of coherence: 14.792924999999997
Composite-Trait Score for Balance: -3.113075
Composite-Trait Score for Comprehensibility: 1.2803749999999994
Composite-Trait Score for Manageability: -6.549999999999997
Composite-Trait Score for Meaningfulness: -20.1436
Composite-Trait Score for Reflection: -20.240400000000005


In [20]:
import csv
import os

# Destination of the composite-trait score export (same directory as the other CSVs)
csv_file_name_comp_trait_scores = "composite_trait_scores.csv"
target_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data"
csv_file_path_comp_trait_scores = os.path.join(target_directory, csv_file_name_comp_trait_scores)

with open(csv_file_path_comp_trait_scores, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row, then one (construct, score) row per construct
    header = ["Psychological Construct", "Composite-Trait Score"]
    csv_writer.writerow(header)
    csv_writer.writerows(
        [construct_name, total] for construct_name, total in composite_trait_scores.items()
    )

print(f"CSV file '{csv_file_path_comp_trait_scores}' has been created.")

CSV file '/home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/composite_trait_scores.csv' has been created.


In [13]:
import numpy as np
# Mean and standard deviation across the composite-trait scores
# (np.std with its defaults, i.e. the population standard deviation)
score_values = list(composite_trait_scores.values())
mean_score = np.mean(score_values)
std_deviation = np.std(score_values)

# Adjusted score = z-score of each construct's composite-trait score
adjusted_scores = {
    construct: (score - mean_score) / std_deviation
    for construct, score in composite_trait_scores.items()
}

# Report one adjusted score per construct
for construct, adjusted in adjusted_scores.items():
    print(f"Adjusted Score for {construct}: {adjusted}")

Adjusted Score for Sense of coherence: 1.6737618528160483
Adjusted Score for Balance: 0.20859166566826876
Adjusted Score for Comprehensibility: 0.5680885917670295
Adjusted Score for Manageability: -0.07263697531591971
Adjusted Score for Meaningfulness: -1.1849422057421488
Adjusted Score for Reflection: -1.1928629291932777


In [21]:
import csv
import os

# Destination of the adjusted-score export (same directory as the other CSVs)
csv_file_name_adjusted_scores = "adjusted_scores.csv"
target_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data"
csv_file_path_adjusted_scores = os.path.join(target_directory, csv_file_name_adjusted_scores)

with open(csv_file_path_adjusted_scores, mode='w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)

    # Header row, then one (construct, adjusted score) row per construct
    header = ["Psychological Construct", "Adjusted Score"]
    csv_writer.writerow(header)
    csv_writer.writerows(
        [construct_name, adjusted] for construct_name, adjusted in adjusted_scores.items()
    )

print(f"CSV file '{csv_file_path_adjusted_scores}' has been created.")

CSV file '/home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/adjusted_scores.csv' has been created.


In [34]:
import pandas as pd

# Reload the artefacts written by the earlier cells; each path is followed
# directly by the frame read from it.
merged_data_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged_data.csv"
merged_data_df = pd.read_csv(merged_data_path)

composite_features_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/composite_features.csv"
composite_features_df = pd.read_csv(composite_features_path)

sub_attribute_weights_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/sub_attribute_weights.csv"
sub_attribute_weights_df = pd.read_csv(sub_attribute_weights_path)

composite_trait_scores_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/composite_trait_scores.csv"
composite_trait_scores_df = pd.read_csv(composite_trait_scores_path)

In [37]:
# The five Big Five class-label columns of the essays data
columns_to_change = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

# Recode the labels: 'y' -> 1, 'n' -> 0; any other value is left untouched
yes_no_codes = {'y': 1, 'n': 0}
recoded_labels = merged_data_df[columns_to_change].replace(yes_no_codes)
merged_data_df[columns_to_change] = recoded_labels

# Show the frame after recoding
print("Modified DataFrame:")
print(merged_data_df)

# Distinct label patterns across the five columns
subset_df = merged_data_df[columns_to_change]
unique_combinations_df = subset_df.drop_duplicates()
count_combinations = len(unique_combinations_df)

print(f"Number of unique combinations of 1's and 0's: {count_combinations}")

Modified DataFrame:
              #AUTHID                                               TEXT  \
0     1997_033283.txt  I don't like having to write an my couch. I ne...   
1     1997_053414.txt  This blank screen is staring at me and my fing...   
2     1997_057160.txt  I suppose we all get caught up in a web our fi...   
3     1997_057748.txt  here I am typing this thing for my psychology ...   
4     1997_058607.txt  Ok. Here we go. Well I really don't have much ...   
...               ...                                                ...   
2458       2004_9.txt  So I'm not sure on what I'm supposed to be typ...   
2459      2004_93.txt  my boyfriend is in the room and he stinks like...   
2460      2004_94.txt  So this is the third time that I have tried to...   
2461      2004_97.txt  I don't know why, but for some reason I am ext...   
2462      2004_99.txt  I am currently thinking about how ready I am t...   

      cEXT  cNEU  cAGR  cCON  cOPN       mairesse_0  mairesse_1  ma

In [43]:
import pandas as pd

# Sub-attribute weights, keyed first by psychological construct and then by
# Big Five trait (values hard-coded in this cell).
weights = {
    'Sense of coherence': {
        'Neuroticism': 57.795, 'Extraversion': 51.825, 'Openness': 43.945,
        'Agreeableness': 34.08, 'Conscientiousness': 47.17,
    },
    'Balance': {
        'Neuroticism': 36.085, 'Extraversion': 1.0, 'Openness': 1.0,
        'Agreeableness': 1.0, 'Conscientiousness': 1.0,
    },
    'Comprehensibility': {
        'Neuroticism': 29.09, 'Extraversion': 24.95, 'Openness': 1.0,
        'Agreeableness': 1.0, 'Conscientiousness': 28.705,
    },
    'Manageability': {
        'Neuroticism': 44.62, 'Extraversion': 3.73, 'Openness': 2.0,
        'Agreeableness': 39.995, 'Conscientiousness': 27.42,
    },
    'Meaningfulness': {
        'Neuroticism': 40.96, 'Extraversion': 1.0, 'Openness': 1.0,
        'Agreeableness': 1.0, 'Conscientiousness': 1.0,
    },
    'Reflection': {
        'Neuroticism': 37.57, 'Extraversion': 1.5, 'Openness': 1.5,
        'Agreeableness': 1.5, 'Conscientiousness': 19.46,
    },
}

# DataFrame column name -> trait name used as key in `weights`
column_to_trait = {
    'cEXT': 'Extraversion',
    'cNEU': 'Neuroticism',
    'cAGR': 'Agreeableness',
    'cCON': 'Conscientiousness',
    'cOPN': 'Openness',
}


def calculate_combinations(row):
    """Return one weighted sum of the row's Big Five columns per construct.

    Args:
        row: Mapping-like object indexable by every key of ``column_to_trait``.

    Returns:
        pandas Series indexed by the construct names of ``weights``; each value is
        the sum over the five columns of ``row[column] * weight``.
    """
    def weighted_total(trait_weights):
        # Accumulate in column_to_trait order, starting from the integer 0
        total = 0
        for column_name, trait_name in column_to_trait.items():
            total = total + row[column_name] * trait_weights[trait_name]
        return total

    return pd.Series({
        construct_name: weighted_total(trait_weights)
        for construct_name, trait_weights in weights.items()
    })

# Keep only the Big Five indicator columns. The six construct columns are produced
# by calculate_combinations below, so they must not be requested from the input
# frame: merged_data.csv does not contain them and selecting them raises KeyError
# on a fresh kernel.
big5_columns = list(column_to_trait)
merged_data_df = merged_data_df[big5_columns]

# One weighted construct score per row and construct
combination_results = merged_data_df.apply(calculate_combinations, axis=1)

# Big Five indicators followed by the six construct scores
merged_data_df = pd.concat([merged_data_df[big5_columns], combination_results], axis=1)

print(merged_data_df)

      cEXT  cNEU  cAGR  cCON  cOPN  Sense of coherence  Balance  \
0        1     0     1     0     1             129.850    3.000   
1        1     0     1     1     1             177.020    4.000   
2        1     0     1     1     1             177.020    4.000   
3        0     1     0     0     1             101.740   37.085   
4        0     1     0     0     1             101.740   37.085   
...    ...   ...   ...   ...   ...                 ...      ...   
2458     1     0     1     1     0             133.075    3.000   
2459     1     1     0     1     1             200.735   39.085   
2460     0     0     1     1     1             125.195    3.000   
2461     1     0     1     0     1             129.850    3.000   
2462     0     1     0     1     0             104.965   37.085   

      Comprehensibility  Manageability  Meaningfulness  Reflection  
0                26.950         45.725            3.00        4.50  
1                55.655         73.145            4.00   

In [57]:
# Directory that receives the merged result
output_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged"

# Save the Big Five + construct-score frame. The previous version saved
# merged_data_with_new_advanced_features_df here, which is only created by a later
# cell (NameError when run top to bottom) and is written under its own file name
# further down.
# NOTE(review): merged_data_df is inferred from the file name - confirm.
output_file_path = f"{output_directory}/merged_data_with_traits.csv"
merged_data_df.to_csv(output_file_path, index=False)

print(f"Saved the result to {output_file_path}")

Saved the result to /home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/merged/merged_data_with_traits.csv


In [44]:
import pandas as pd

# Adjusted (z-scored) composite-trait score per psychological construct. The values
# are hard-coded in this cell; nothing is read from a DataFrame here.
adjusted_scores_dict = {
    'Sense of coherence': 1.6737618528160483,
    'Balance': 0.20859166566826876,
    'Comprehensibility': 0.5680885917670295,
    'Manageability': -0.07263697531591971,
    'Meaningfulness': -1.1849422057421488,
    'Reflection': -1.1928629291932777
}


def create_interaction_features(row):
    """Build trait-by-construct interaction terms for one row.

    For every column in ``column_to_trait`` and every construct in
    ``adjusted_scores_dict`` the term is
    ``row[col] * row[construct] * adjusted_scores_dict[construct]``.

    Returns:
        pandas Series keyed ``"<col>_<construct>_interaction"``, ordered by column
        first and construct second.
    """
    return pd.Series({
        f"{col}_{construct}_interaction": row[col] * row[construct] * adjusted_score
        for col in column_to_trait
        for construct, adjusted_score in adjusted_scores_dict.items()
    })


# Row-wise interaction terms for the whole frame
new_features_df = merged_data_df.apply(create_interaction_features, axis=1)

# Original columns followed by the interaction columns
frames_to_join = [merged_data_df, new_features_df]
merged_data_with_new_features_df = pd.concat(frames_to_join, axis=1)

print(merged_data_with_new_features_df)

      cEXT  cNEU  cAGR  cCON  cOPN  Sense of coherence  Balance  \
0        1     0     1     0     1             129.850    3.000   
1        1     0     1     1     1             177.020    4.000   
2        1     0     1     1     1             177.020    4.000   
3        0     1     0     0     1             101.740   37.085   
4        0     1     0     0     1             101.740   37.085   
...    ...   ...   ...   ...   ...                 ...      ...   
2458     1     0     1     1     0             133.075    3.000   
2459     1     1     0     1     1             200.735   39.085   
2460     0     0     1     1     1             125.195    3.000   
2461     1     0     1     0     1             129.850    3.000   
2462     0     1     0     1     0             104.965   37.085   

      Comprehensibility  Manageability  Meaningfulness  ...  \
0                26.950         45.725            3.00  ...   
1                55.655         73.145            4.00  ...   
2     

In [58]:
# Destination directory and file name for the trait x construct interaction frame
output_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged"
output_file_name = "merged_data_with_traitbig5_interactions.csv"
output_file_path = f"{output_directory}/{output_file_name}"

# Write without the index column
merged_data_with_new_features_df.to_csv(output_file_path, index=False)

print(f"Saved the result to {output_file_path}")

Saved the result to /home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/merged/merged_data_with_traitbig5_interactions.csv


In [54]:
import pandas as pd

# Load the per-(construct, trait) composite features. The frame is named after its
# contents: it was previously stored in composite_trait_scores_df, which silently
# replaced the real composite-trait-score frame with this 7-column table.
composite_features_df = pd.read_csv('/home/vincent/AAA_projects/MVCS/DigitalHuman/data/composite_features.csv')

# construct -> trait -> composite feature
composite_feature_dict = {}
for construct, trait, composite_feature in zip(
    composite_features_df['Psychological Construct'],
    composite_features_df['Researched Trait'],
    composite_features_df['Composite Feature'],
):
    composite_feature_dict.setdefault(construct, {})[trait] = composite_feature


def create_advanced_interaction_features(row):
    """Build trait-by-construct interaction terms scaled by the composite feature.

    For every column in ``column_to_trait`` and every construct in
    ``composite_feature_dict`` the term is
    ``row[col] * row[construct] * composite_value``, where ``composite_value`` is
    the composite feature stored for that construct and the column's trait.

    Returns:
        pandas Series keyed ``"<col>_<construct>_advanced_interaction"``.
    """
    new_features = {}
    for col, trait_name in column_to_trait.items():
        for construct, trait_values in composite_feature_dict.items():
            # A trait missing for this construct contributes a multiplier of 0,
            # so the interaction term becomes 0 instead of raising.
            composite_value = trait_values.get(trait_name, 0)
            new_features[f"{col}_{construct}_advanced_interaction"] = row[col] * row[construct] * composite_value
    return pd.Series(new_features)


# Row-wise advanced interaction terms for the whole frame
new_advanced_features_df = merged_data_df.apply(create_advanced_interaction_features, axis=1)

# Original columns followed by the advanced interaction columns
merged_data_with_new_advanced_features_df = pd.concat([merged_data_df, new_advanced_features_df], axis=1)

print(merged_data_with_new_advanced_features_df)

      cEXT  cNEU  cAGR  cCON  cOPN  Sense of coherence  Balance  \
0        1     0     1     0     1             129.850    3.000   
1        1     0     1     1     1             177.020    4.000   
2        1     0     1     1     1             177.020    4.000   
3        0     1     0     0     1             101.740   37.085   
4        0     1     0     0     1             101.740   37.085   
...    ...   ...   ...   ...   ...                 ...      ...   
2458     1     0     1     1     0             133.075    3.000   
2459     1     1     0     1     1             200.735   39.085   
2460     0     0     1     1     1             125.195    3.000   
2461     1     0     1     0     1             129.850    3.000   
2462     0     1     0     1     0             104.965   37.085   

      Comprehensibility  Manageability  Meaningfulness  ...  \
0                26.950         45.725            3.00  ...   
1                55.655         73.145            4.00  ...   
2     

In [59]:
# Destination directory and file name for the advanced interaction frame
output_directory = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged/completed"
output_file_name = "merged_data_with_advanced_interactions.csv"
output_file_path = f"{output_directory}/{output_file_name}"

# Write without the index column
merged_data_with_new_advanced_features_df.to_csv(output_file_path, index=False)

print(f"Saved the result to {output_file_path}")

Saved the result to /home/vincent/AAA_projects/University/MSc_AI/Big Data Analysis/project2data/personality-prediction-from-text/data/training/merged/completed/merged_data_with_advanced_interactions.csv


In [28]:
# Report (rows, columns) for each of the four loaded frames
for frame_name, frame in (
    ("merged_data_df", merged_data_df),
    ("composite_features_df", composite_features_df),
    ("sub_attribute_weights_df", sub_attribute_weights_df),
    ("composite_trait_scores_df", composite_trait_scores_df),
):
    print(f"Shape of {frame_name}: {frame.shape}")

Shape of merged_data_df: (2463, 92)
Shape of composite_features_df: (30, 7)
Shape of sub_attribute_weights_df: (30, 3)
Shape of composite_trait_scores_df: (6, 2)


In [27]:
# Preview the first two rows of each loaded frame; every section after the first
# is preceded by a blank line.
print("First two rows of merged_data_df:")
print(merged_data_df.head(2))

for frame_name, frame in (
    ("composite_features_df", composite_features_df),
    ("sub_attribute_weights_df", sub_attribute_weights_df),
    ("composite_trait_scores_df", composite_trait_scores_df),
):
    print(f"\nFirst two rows of {frame_name}:")
    print(frame.head(2))

First two rows of merged_data_df:
           #AUTHID                                               TEXT cEXT  \
0  1997_033283.txt  I don't like having to write an my couch. I ne...    y   
1  1997_053414.txt  This blank screen is staring at me and my fing...    y   

  cNEU cAGR cCON cOPN       mairesse_0  mairesse_1  mairesse_2  ...  \
0    n    y    n    y  1997_033283.txt   -0.527438    0.542460  ...   
1    n    y    y    y  1997_053414.txt    0.122799    0.596609  ...   

   mairesse_75  mairesse_76  mairesse_77  mairesse_78  mairesse_79  \
0    -0.422267    -1.028267    -0.451544    -0.408772    -0.548258   
1    -0.422267     0.513222    -0.451544     1.416726     0.420822   

   mairesse_80  mairesse_81  mairesse_82  mairesse_83  mairesse_84  
0     4.717551     4.945683     4.712785     4.988888     4.397205  
1     4.983241     3.990245     4.710706     4.786429     5.008889  

[2 rows x 92 columns]

First two rows of composite_features_df:
  Psychological Construct Research

# Probability scores for traits for Sense of coherence, Balance, Comprehensibility, Manageability, Meaningfulness, Reflection 

In [20]:
import pandas as pd
import numpy as np

# Locations of the artefacts produced by the earlier sections
sub_attr_weights_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/sub_attribute_weights.csv"
psych_traits_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/psych_traits_digital_human_dict.csv"
composite_trait_scores_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/composite_trait_scores.csv"
adjusted_scores_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/adjusted_scores.csv"
merged_data_path = "/home/vincent/AAA_projects/MVCS/DigitalHuman/data/merged/completed/merged_data_with_advanced_interactions.csv"

# Read every artefact into its own DataFrame
sub_attr_weights_df = pd.read_csv(sub_attr_weights_path)
psych_traits_df = pd.read_csv(psych_traits_path)
composite_trait_scores_df = pd.read_csv(composite_trait_scores_path)
adjusted_scores_df = pd.read_csv(adjusted_scores_path)

# The merged frame gets readable Big Five column names right after loading
big5_column_names = {
    'cEXT': 'Extraversion',
    'cNEU': 'Neuroticism',
    'cAGR': 'Agreeableness',
    'cCON': 'Conscientiousness',
    'cOPN': 'Openness'
}
merged_data_df = pd.read_csv(merged_data_path).rename(columns=big5_column_names)

# Construct columns that receive a custom probability further down
composite_traits = ['Sense of coherence', 'Balance', 'Comprehensibility', 'Manageability', 'Meaningfulness', 'Reflection']

# Function to calculate custom probability metric
def calculate_custom_probability(value, weight):
    """Return the logistic sigmoid of ``value * weight``.

    Computes 1 / (1 + exp(-value * weight)) in an overflow-safe form, so large
    negative products yield 0.0 without a RuntimeWarning.

    Args:
        value: Score to squash (scalar or NumPy-compatible array).
        weight: Multiplier applied to ``value`` before squashing.

    Returns:
        Value(s) between 0 and 1.
    """
    # sigmoid(z) = exp(-log(1 + exp(-z))); logaddexp(0, -z) evaluates the inner
    # log(exp(0) + exp(-z)) without ever forming exp(-z) on its own.
    return np.exp(-np.logaddexp(0, -value * weight))

# Per-construct multiplier applied before the sigmoid.
# These are placeholder values hard-coded in this cell - replace with actual values.
sub_attr_weights = {
    'Sense of coherence': 0.8,
    'Balance': 0.7,
    'Comprehensibility': 0.6,
    'Manageability': 0.9,
    'Meaningfulness': 0.85,
    'Reflection': 0.75
}

# Add one "<construct>_Custom_Probability" column per composite trait
for trait in composite_traits:
    trait_weight = sub_attr_weights[trait]
    # Element-wise: each cell value is passed together with the construct's weight
    merged_data_df[f'{trait}_Custom_Probability'] = merged_data_df[trait].apply(
        calculate_custom_probability, args=(trait_weight,)
    )

# Show first few rows to verify
print(merged_data_df.head())


   Extraversion  Neuroticism  Agreeableness  Conscientiousness  Openness  \
0             1            0              1                  0         1   
1             1            0              1                  1         1   
2             1            0              1                  1         1   
3             0            1              0                  0         1   
4             0            1              0                  0         1   

   Sense of coherence  Balance  Comprehensibility  Manageability  \
0              129.85    3.000             26.950         45.725   
1              177.02    4.000             55.655         73.145   
2              177.02    4.000             55.655         73.145   
3              101.74   37.085             30.090         46.620   
4              101.74   37.085             30.090         46.620   

   Meaningfulness  ...  cOPN_Comprehensibility_advanced_interaction  \
0            3.00  ...                                     9.29

# Vectorization

In [None]:
# Candidate text vectorization methods (planning notes):
# - Bag-of-Words (BoW)
# - TF-IDF (Term Frequency-Inverse Document Frequency)
# - Word embeddings
# - BERT or other Transformer embeddings

# Lexicons

In [None]:
# Candidate lexicons (planning notes):
# - LIWC (Linguistic Inquiry and Word Count)
# - AFINN
# - NRC Emotion Lexicon

# concatenate Vectorization and Lexicons 