# Study 3: LIWC Data Preprocessing

In [239]:
#Loading necessary packages

import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.max_rows', 100, 'display.max_columns', 100)

### Steps

1. Loading in LIWC Data and Preprocessing
2. Merging LIWC Data with Relevant Survey Data from Previous Step
3. Calculating non LIWC scores via TextBlob (sentiment and subjectivity)
4. Computing Linguistic SImilarity to the Preferred Message from the Message Choice Task
5. Melting LIWC dataframe from Wide to Long Format (DF1: Message Preference Outcome) for Mixed Level Analysis
6. Melting LIWC dataframe from Wide to Long Format (DF2 & 3: Perceived Personalization and Perceived Message Effectiveness Outcome) for Mixed Level Analysis

# 1. LIWC Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# Reading in the LIWC results that were run on the exporte Chat Data
results_liwc = pd.read_csv('C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_january_2024/liwc_final2.csv')
all_schools_final2 = pd.read_csv('C:/Users/77197jsc/OneDrive - Erasmus University Rotterdam/Documents/Study 3/Data/whatsdata_january_2024/all_schools2.csv')
print(len(results_liwc))

191


In [3]:
#Lets have a look into the data
results_liwc.head(2)

Unnamed: 0,Movez_code,Message,Number of messages,No_char,No_words,Message_en,Message_checked,School_mail,Segment,WPS,...,want,acquire,lack,fulfill,fatigue,reward,risk,curiosity,allure,Conversation
0,1016110.0,Maar voel weer goed dus kan miegen weer pesten...,26,2098,389,But feel good again so can bully miegen again;...,But feel good again so can bully miegen again;...,d.holterman@wpkeesboeke.nl,1,122.67,...,1.36,0.27,0.54,0.0,0.27,0.0,0.0,0.0,10.33,3.8
1,1049218.0,Je ben al 00 min aan het kakken; Broer schiet ...,551,4494,794,Hahahaha isg; Lekka; You've been pooping for 0...,Hahahaha isg; Lekka; You've been pooping for 0...,l113319@gsr.nl,1,60.0,...,0.24,0.36,0.0,0.12,0.0,0.0,0.12,0.0,10.24,2.62


In [4]:
print(results_liwc.columns)

Index(['Movez_code', 'Message', 'Number of messages', 'No_char', 'No_words',
       'Message_en', 'Message_checked', 'School_mail', 'Segment', 'WPS',
       'BigWords', 'Drives', 'cogproc', 'tone_pos', 'tone_neg', 'emo_pos',
       'emo_neg', 'Social', 'need', 'want', 'acquire', 'lack', 'fulfill',
       'fatigue', 'reward', 'risk', 'curiosity', 'allure', 'Conversation'],
      dtype='object')


In [5]:
#First we are transforming the DF into long format

liwc_long = pd.melt(results_liwc, id_vars=['Movez_code', 'Message', 'No_char', "No_words", 'Number of messages', 'Message_en',
       'Segment',], value_vars=['BigWords', 'Drives', 'cogproc', 'tone_pos',
       'tone_neg', 'emo_pos', 'emo_neg', 'Social', 'need', 'want', 'acquire',
       'lack', 'fulfill', 'fatigue', 'reward', 'risk', 'curiosity', 'allure',
       'Conversation'], var_name = "Message_pref", value_name="LIWC")

print(len(liwc_long))

3629


In [6]:
#Creating standardized values for all LIWC scores

# List of columns to standardize
cols_to_standardize = [
    "BigWords", "Drives", "cogproc", "tone_pos", "tone_neg", "emo_pos", "emo_neg",
    "Social", "need", "want", "acquire", "lack", "fulfill", "fatigue",
    "curiosity", "reward", "risk", "allure", "Conversation"
]

# Apply standardization
for col in cols_to_standardize:
    z_col = f"{col}_z"
    results_liwc[z_col] = (results_liwc[col] - results_liwc[col].mean()) / results_liwc[col].std()


In [7]:
#From there we can create a new variable psycholinguistic similarity. For this we substract aprticipants LIWC score from the LIWC score of the experimental message

# Experimental message LIWC scores
experimental_scores = {
    "Drives": 15.38,
    "cogproc": 15.38,
    "tone_pos": 16.00,
    "tone_neg": 16.67,
    "emo_pos": 16.00,
    "emo_neg": 16.00,
    "Social": 14.81,
    "need": 14.81,
    "want": 15.38,
    "acquire": 15.38,
    "lack": 14.81,
    "fulfill": 16.00,
    "fatigue": 14.81,
    "reward": 14.81,
    "risk": 15.38,
    "curiosity": 16.00,
    "allure": 16.00,
    "WPS": 43.00,
    "Conversation": 14.81,
    "BigWords": 21.74,
}

# Compute match scores
for feature, exp_value in experimental_scores.items():
    results_liwc[f"match_{feature}"] = exp_value - results_liwc[feature]



In [8]:
# Lets also create a psycholingustic similarity variable in reference to the control base message

# Control message LIWC scores
control_scores = {
    "Drives": 0,
    "cogproc": 0,
    "tone_pos": 0,
    "tone_neg": 0,
    "emo_pos": 0,
    "emo_neg": 0,
    "Social": 0,
    "need": 0,
    "want": 0,
    "acquire": 3.85,
    "lack": 0,
    "fulfill": 0,
    "fatigue": 0,
    "reward": 0,
    "risk": 0,
    "curiosity": 0,
    "allure": 0,
    "WPS": 8.33,
    "Conversation": 0,
    "BigWords": 0,
}

# Compute control match scores
for feature, ctrl_value in control_scores.items():
    col_to_use = "tone_pos" if feature == "intensity" else feature
    results_liwc[f"match_{feature}_ctr"] = ctrl_value - results_liwc[col_to_use]


In [9]:
#Lets assign the Message LIWC Scores to the Dataframe as a Reference

# Experimental message LIWC values
expmsg_values = {
    "Drives": 15.38, "cogproc": 15.38, "tone_pos": 16, "tone_neg": 16.67, "emo_pos": 16,
    "emo_neg": 16, "Social": 14.81, "need": 14.81, "want": 15.38, "acquire": 15.38,
    "lack": 14.81, "fulfill": 16.00, "fatigue": 14.81, "reward": 14.81, "risk": 15.38,
    "curiosity": 16, "allure": 16, "WPS": 14.33, "Conversation": 14.81, "BigWords": 21.74,
    "intensity": 20.00
}

# Control message LIWC values
ctrmsg_values = {
    "Drives": 0, "cogproc": 0, "tone_pos": 0, "tone_neg": 0, "emo_pos": 0,
    "emo_neg": 0, "Social": 0, "need": 0, "want": 0, "acquire": 3.85,
    "lack": 0, "fulfill": 0, "fatigue": 0, "reward": 0, "risk": 0,
    "curiosity": 0, "allure": 0, "WPS": 8.33, "Conversation": 0, "BigWords": 0,
    "intensity": 7.69
}

# Assign values to DataFrame columns
for feature, val in expmsg_values.items():
    results_liwc[f"expmsg_{feature}"] = val

for feature, val in ctrmsg_values.items():
    results_liwc[f"ctrmsg_{feature}"] = val


  results_liwc[f"ctrmsg_{feature}"] = val
  results_liwc[f"ctrmsg_{feature}"] = val
  results_liwc[f"ctrmsg_{feature}"] = val
  results_liwc[f"ctrmsg_{feature}"] = val


In [10]:
# List of LIWC features to summarize
liwc_features = [
    "WPS", "BigWords", "Drives", "cogproc", "tone_pos", "tone_neg", "emo_pos", "emo_neg",
    "Social", "need", "want", "acquire", "lack", "fulfill", "fatigue", "curiosity",
    "reward", "risk", "allure", "Conversation"
]

# Print mean values so that we can filter out categories that dont appear often in our data
for feature in liwc_features:
    print(f"{feature}: {results_liwc[feature].mean():.2f}")


WPS: 79.32
BigWords: 10.35
Drives: 3.36
cogproc: 11.69
tone_pos: 3.19
tone_neg: 0.96
emo_pos: 0.92
emo_neg: 0.49
Social: 11.88
need: 1.16
want: 0.53
acquire: 0.76
lack: 0.29
fulfill: 0.05
fatigue: 0.07
curiosity: 0.18
reward: 0.06
risk: 0.11
allure: 9.61
Conversation: 3.18


# 2. Merging LIWC Data With Survey Data

In [11]:
# Finally we can merge the school dataframe with the LIWC dataframe
results_liwc["Movez_code"] = results_liwc["Movez_code"].astype(float)
all_schools_final2["Movez_code"] = all_schools_final2["Movez_code"].astype(float)

# Merging on either 'Movez_code' or 'School_mail'
complete_df = pd.merge(all_schools_final2, results_liwc, how='inner', on='School_mail')

print(len(complete_df))

171


In [12]:
all_schools_final2.columns

Index(['StartDate_left', 'EndDate_left', 'Status_left', 'Progress_left',
       'Duration (in seconds)_left', 'Finished_left', 'RecordedDate_left',
       'ResponseId_left', 'DistributionChannel_left', 'UserLanguage_left',
       ...
       'need', 'lack', 'fatigue', 'Drives', 'curiosity', 'Conversation',
       'fulfill', 'intensity', 'Pref_value_mean', 'Similarity_mean'],
      dtype='object', length=200)

In [13]:
# Lets rename some column names

complete_df["intensity_y"] = complete_df["tone_pos_y"]
complete_df = complete_df.rename(columns={"intensity": "intensity_x"})

In [14]:
#Finally i will compute a Mean LIWC score across all categories

complete_df["Message_choice_mean"] = complete_df[["cogproc_x", "emo_pos_x", "emo_neg_x", "allure_x", "acquire_x", "BigWords_x", "want_x", "WPS_x", "Social_x", "risk_x", "reward_x", "tone_pos_x", "tone_neg_x", "need_x", "lack_x", "fatigue_x", "Drives_x", "curiosity_x", "Conversation_x", "fulfill_x", "intensity_x"]].mean(axis=1)

## 3. Computing the non-LIWC Subjectivity and Sentiment Score from TextBlob

In [15]:
import sys
!{sys.executable} -m pip install textblob
from textblob import TextBlob



In [16]:
def get_sentiment_scores(text):
    blob = TextBlob(text)
    return pd.Series({'Intensity': blob.sentiment.polarity, 'Subjectivity': blob.sentiment.subjectivity})

In [17]:
complete_df[['Sentiment', 'Subjectivity']] = complete_df['Message_checked'].apply(get_sentiment_scores)

## 4. Computing Psycholinguistic Similarity to the Preferred Message

In [18]:
import numpy as np

# List of column names to process
columns = [
    "cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "WPS",
    "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue",
    "Drives", "curiosity", "Conversation", "fulfill", "intensity"
]

# Loop through each column to compute the new values
for col in columns:
    # Construct the column names for the experimental and control messages
    expmsg_col = f"expmsg_{col}"
    ctrmsg_col = f"ctrmsg_{col}"
    original_col = f"{col}_x"
    LIWC_col = f"{col}_y"
    
    # Define the new column name
    new_col = f"{col}_chosenmatch"
    
    # Compute the new column based on the condition
    complete_df[new_col] = complete_df.apply(
        lambda row: (
            abs(row[expmsg_col] - row[LIWC_col]) if row[original_col] == 1 else
            abs(row[ctrmsg_col] - row[LIWC_col]) if row[original_col] == 0 else
            np.nan
        ),
        axis=1
    )

In [19]:
# Lets compute a mean score for all preferred messages across categories

complete_df["Chosen_message_mean"] = complete_df[[ 'cogproc_chosenmatch', 'emo_pos_chosenmatch','emo_neg_chosenmatch','allure_chosenmatch','acquire_chosenmatch','BigWords_chosenmatch','want_chosenmatch','WPS_chosenmatch','Social_chosenmatch','risk_chosenmatch','reward_chosenmatch','tone_pos_chosenmatch','tone_neg_chosenmatch','need_chosenmatch','lack_chosenmatch','fatigue_chosenmatch','Drives_chosenmatch','curiosity_chosenmatch','Conversation_chosenmatch','fulfill_chosenmatch','intensity_chosenmatch']].mean(axis=1)

In [20]:
#Lets make the df shorter an only keep relevant categories
complete_df_short = complete_df[["Movez_code_x", "Sex", "Gender", "age", "FAS", "Message_choice_mean", "Chosen_message_mean", "Similarity_mean"]]

# We then calculate a similarity score substracted from 100 to increase interpretabiltiy
complete_df_short["Chosen_message_mean100"] = 100 - complete_df_short["Chosen_message_mean"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_df_short["Chosen_message_mean100"] = 100 - complete_df_short["Chosen_message_mean"]


In [21]:
complete_df_short['Sex'] = complete_df_short['Sex'].map({'Vrouw': 2, 'Man': 1}).fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_df_short['Sex'] = complete_df_short['Sex'].map({'Vrouw': 2, 'Man': 1}).fillna(0).astype(int)


In [22]:
#complete_df_short.to_csv("complete_wide_small.csv", encoding='utf-8', index=False)

## 5. Melting complete LIWC dataframe from Wide to Long Format (Message Preference Outcome)

In [23]:
# Define the list of linguistic categories (original columns)
columns = [
    "cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "WPS",
    "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue",
    "Drives", "curiosity", "Conversation", "fulfill", "intensity"
]

# Prefix and participant ID columns

prefix = "match_"
id_columns = ["Movez_code_x", "Sex", "Gender", "age", "FAS"]

# Step 1: Filter for columns that match exact prefix + category, and not ending in _ctr
value_vars = [
    col for col in complete_df.columns
    for cat in columns
    if col.startswith(f"{prefix}{cat}") and not col.endswith("_ctr")
]

# Remove accidental duplicates
value_vars = list(set(value_vars))

print(f"✅ Filtered value_vars (columns to melt): {len(value_vars)}")
# Optional: print(value_vars)

# Step 2: Melt dataframe
melted = complete_df.melt(
    id_vars=id_columns,
    value_vars=value_vars,
    var_name="column",
    value_name="value"
)

print(f"📊 Melted dataframe shape: {melted.shape}")  # Expecting ~171*21 = 3591

# Step 3: Extract linguistic category
def extract_category(col_name):
    for cat in columns:
        if col_name.startswith(f"{prefix}{cat}"):
            return cat
    return None

melted["Linguistic Category"] = melted["column"].apply(extract_category)

# Step 4: Check for unmatched categories
missing = melted[melted["Linguistic Category"].isna()]
print(f"⚠️ Rows with no matched category: {len(missing)}")
if not missing.empty:
    print(missing.head())

# Step 5: Check for duplicates before pivot
dupes = melted.duplicated(subset=id_columns + ["Linguistic Category"])
print(f"⚠️ Duplicate participant-category combinations before pivot: {dupes.sum()}")

# Step 6: Pivot table using participant ID only (avoid accidental over-indexing)
pivoted = melted.pivot_table(
    index=["Movez_code_x", "Linguistic Category"],
    values="value",
    aggfunc="first",
    dropna=False
).reset_index()

pivoted.columns.name = None

print(f"✅ Final pivoted shape: {pivoted.shape}")  # Should be (171*21 = 3591, 3)

pivoted.columns.name = None

print(f"✅ Final pivoted shape: {pivoted.shape}")  # Should be 171 * 21 = 3591


✅ Filtered value_vars (columns to melt): 20
📊 Melted dataframe shape: (3420, 7)
⚠️ Rows with no matched category: 0
⚠️ Duplicate participant-category combinations before pivot: 20
✅ Final pivoted shape: (3400, 3)
✅ Final pivoted shape: (3400, 3)


In [24]:
# Define the list of linguistic categories (original columns)
columns = [
    "cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "WPS",
    "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue",
    "Drives", "curiosity", "Conversation", "fulfill", "intensity"
]

# Suffix and participant ID columns
suffix = "_x"
id_columns = ["Movez_code_x", "Sex", "Gender", "age", "FAS"]

# Step 1: Filter columns that match exact category + suffix, and not ending in '_ctr'
value_vars = [
    col for col in complete_df.columns
    if any(col == f"{cat}{suffix}" for cat in columns) and not col.endswith("_ctr")
]

# Remove duplicates (just in case)
value_vars = list(set(value_vars))

print(f"✅ Filtered value_vars (columns to melt): {len(value_vars)}")
# Optional: print(value_vars)

# Step 2: Melt dataframe
melted2 = complete_df.melt(
    id_vars=id_columns,
    value_vars=value_vars,
    var_name="column",
    value_name="value"
)

print(f"📊 Melted dataframe shape: {melted2.shape}")  # Expecting ~171*21 = 3591

# Step 3: Extract linguistic category
def extract_category(col_name):
    for cat in columns:
        if col_name == f"{cat}{suffix}":
            return cat
    return None

melted2["Linguistic Category"] = melted2["column"].apply(extract_category)

# Step 4: Check for unmatched categories
missing = melted2[melted2["Linguistic Category"].isna()]
print(f"⚠️ Rows with no matched category: {len(missing)}")
if not missing.empty:
    print(missing.head())

# Step 5: Check for duplicates before pivot
dupes = melted2.duplicated(subset=id_columns + ["Linguistic Category"])
print(f"⚠️ Duplicate participant-category combinations before pivot: {dupes.sum()}")

# Step 6: Pivot table using participant ID only
pivoted2 = melted2.pivot_table(
    index=["Movez_code_x", "Linguistic Category"],
    values="value",
    aggfunc="first",
    dropna=False
).reset_index()

pivoted2.columns.name = None

print(f"✅ Final pivoted shape: {pivoted2.shape}")  # Should be 171 * 21 = 3591

message_pref = pivoted2

✅ Filtered value_vars (columns to melt): 21
📊 Melted dataframe shape: (3591, 7)
⚠️ Rows with no matched category: 0
⚠️ Duplicate participant-category combinations before pivot: 21
✅ Final pivoted shape: (3570, 3)


In [25]:
#Some renaming of columns

pivoted = pivoted.rename(columns={"value": "Similarity: EXP"}) 
message_pref = message_pref.rename(columns={"value": "Message_pref"})

In [26]:
#Now we can have a look into the data structure

pivoted["Linguistic Category"].value_counts()

Linguistic Category
BigWords        170
Conversation    170
tone_pos        170
tone_neg        170
risk            170
reward          170
need            170
lack            170
fulfill         170
fatigue         170
emo_pos         170
emo_neg         170
curiosity       170
cogproc         170
allure          170
acquire         170
WPS             170
Social          170
Drives          170
want            170
Name: count, dtype: int64

In [27]:
# And the particiaptn information
message_pref.head()

Unnamed: 0,Movez_code_x,Linguistic Category,Message_pref
0,1016110.0,BigWords,0.0
1,1016110.0,Conversation,0.0
2,1016110.0,Drives,0.0
3,1016110.0,Social,1.0
4,1016110.0,WPS,0.0


In [28]:
# Lets also gather the participant-level information into one dataframe

# Step 1: Get participant-level info
participant_info = complete_df[["Movez_code_x", "Sex", "Gender", "age", "FAS"]].drop_duplicates()

# Step 2: Merge demographics directly into the long-format dataframe
pivoted_with_demo1 = pd.merge(pivoted, participant_info, on="Movez_code_x", how="left")
print(len(pivoted_with_demo1))
pivoted_with_demo2 = pd.merge(pivoted_with_demo1, message_pref, on=["Movez_code_x", "Linguistic Category"], how="left")
print(len(pivoted_with_demo2))

# Again we will compute a score relative to 100 for similarity
pivoted_with_demo2["Similarity: EXP_100"] = 100 - pivoted_with_demo2["Similarity: EXP"]

3400
3400


In [29]:
# Lets have a look into the df with demographic information merged
pivoted_with_demo2.head()

Unnamed: 0,Movez_code_x,Linguistic Category,Similarity: EXP,Sex,Gender,age,FAS,Message_pref,Similarity: EXP_100
0,1016110.0,BigWords,13.32,Man,Man,14.0,1.833333,0.0,86.68
1,1016110.0,Conversation,11.01,Man,Man,14.0,1.833333,0.0,88.99
2,1016110.0,Drives,11.3,Man,Man,14.0,1.833333,0.0,88.7
3,1016110.0,Social,3.67,Man,Man,14.0,1.833333,1.0,96.33
4,1016110.0,WPS,-79.67,Man,Man,14.0,1.833333,0.0,179.67


In [30]:
#pivoted_with_demo2.to_csv("complete_df_EXPSIM4.csv")

In [31]:
#complete_df["Pref_value"] = complete_df[["cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "subj", "WPS", "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue", "Drives", "curiosity", "Conversation", "fulfill"]].mean(axis=1)
#complete_df.to_csv("complete_wide.csv", encoding='utf-8', index=False)

## 6. Melting complete LIWC dataframe from Wide to Long Format (PEM and PP Outcome)

In [32]:
complete_df_eff = pd.melt(complete_df, id_vars=['Movez_code_x', "Sex", "age", "FAS", "Gender"], value_vars = ['eff_WPS', 'eff_acquire', 'eff_allure', 'eff_bigwords', 'eff_cog', 'eff_convers', 'eff_curious', 'eff_drive', 'eff_fatigue', 'eff_fulfill', 'eff_intensity', 'eff_lack', 'eff_need', 'eff_negemo', 'eff_negtone', 'eff_posemo', 'eff_postone', 'eff_reward', 'eff_risk', 'eff_social', 'eff_subj', 'eff_want'],var_name= "Linguistic category", value_name='Eff_value')
complete_df_eff["Linguistic category"] = complete_df_eff["Linguistic category"].replace(['eff_WPS', 'eff_acquire', 'eff_allure', 'eff_bigwords', 'eff_cog', 'eff_convers', 'eff_curious', 'eff_drive', 'eff_fatigue', 'eff_fulfill', 'eff_intensity', 'eff_lack', 'eff_need', 'eff_negemo', 'eff_negtone', 'eff_posemo', 'eff_postone', 'eff_reward', 'eff_risk', 'eff_social', 'eff_subj', 'eff_want'], ['WPS', 'acquire', 'allure', 'BigWords', "cogproc", 'Conversation', 'curiosity', 'Drives', 'fatigue', "fulfill", 'intensity', 'lack', 'need', 'emo_neg', 'tone_neg', 'emo_pos', 'tone_pos', 'reward', 'risk', "Social", "subj", "want"])
complete_df_eff = complete_df_eff[complete_df_eff["Movez_code_x"].isna() == False]
complete_df_eff = complete_df_eff[complete_df_eff["Eff_value"].isna() == False]

In [33]:
complete_df_eff.head(5)

Unnamed: 0,Movez_code_x,Sex,age,FAS,Gender,Linguistic category,Eff_value
30,3504199.0,Man,14.0,2.0,Man,WPS,2.0
40,9089051.0,Vrouw,13.0,1.0,Vrouw,WPS,1.0
46,6989421.0,Man,14.0,1.0,Man,WPS,1.0
51,1054448.0,Vrouw,15.0,1.5,Vrouw,WPS,4.0
56,2141126.0,Man,14.0,2.0,Man,WPS,3.0


In [34]:
#Lets rename some colimn names so that they are uniform

complete_df = complete_df.rename(columns= {"pp_bigwords": 'pp_BigWords', "pp_convers": 'pp_Conversation', "pp_drive": 'pp_Drives', "pp_social": 'pp_Social', "pp_cog": 'pp_cogproc', 'pp_curious': 'pp_curiosity', "pp_negemo": 'pp_emo_neg', "pp_posemo": 'pp_emo_pos', "pp_negtone": 'pp_tone_neg', "pp_postone": 'pp_tone_pos'})
complete_df = complete_df.rename(columns= {"eff_bigwords": 'eff_BigWords', "eff_convers": 'eff_Conversation', "eff_drive": 'eff_Drives', "eff_social": 'eff_Social', "eff_cog": 'eff_cogproc', 'eff_curious': 'eff_curiosity', "eff_negemo": 'eff_emo_neg', "eff_posemo": 'eff_emo_pos', "eff_negtone": 'eff_tone_neg', "eff_postone": 'eff_tone_pos'})

In [35]:
#Again we calcualte linguistic similarity to the Preferred Message

# Define linguistic categories of interest
columns = [
    "cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "WPS",
    "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue",
    "Drives", "curiosity", "Conversation", "fulfill", "intensity"
]

# Define additional columns to retain
retain_cols = ["Movez_code_x", "Sex", "age", "FAS", "Gender"]

# Melt eff_ columns (exclude any ending in _z)
eff_columns = [
    col for col in complete_df.columns 
    if col.startswith("eff_") 
]
df_eff = complete_df.melt(
    id_vars=retain_cols,
    value_vars=eff_columns,
    var_name="Linguistic Category",
    value_name="Eff value"
)
df_eff["Linguistic Category"] = df_eff["Linguistic Category"].str.replace("eff_", "", regex=False)

# Melt _chosenmatch columns (exclude any ending in _chosenmatch_z)
chosenmatch_columns = [
    col for col in complete_df.columns 
    if col.endswith("_chosenmatch")
]
df_chosen = complete_df.melt(
    id_vars=retain_cols,
    value_vars=chosenmatch_columns,
    var_name="Linguistic Category",
    value_name="Similarity: CHOSEN"
)
df_chosen["Linguistic Category"] = df_chosen["Linguistic Category"].str.replace("_chosenmatch", "", regex=False)

# Merge both long-format DataFrames on participant info and variable name
df_long_eff = pd.merge(
    df_eff,
    df_chosen,
    on=retain_cols + ["Linguistic Category"],
    how="outer"
)

In [36]:
df_long_eff.head(10)

Unnamed: 0,Movez_code_x,Sex,age,FAS,Gender,Linguistic Category,Eff value,Similarity: CHOSEN
0,1016110.0,Man,14.0,1.833333,Man,BigWords,,8.42
1,1016110.0,Man,14.0,1.833333,Man,Conversation,2.0,3.8
2,1016110.0,Man,14.0,1.833333,Man,Drives,3.0,4.08
3,1016110.0,Man,14.0,1.833333,Man,Social,,3.67
4,1016110.0,Man,14.0,1.833333,Man,WPS,3.0,114.34
5,1016110.0,Man,14.0,1.833333,Man,acquire,,15.11
6,1016110.0,Man,14.0,1.833333,Man,allure,,5.67
7,1016110.0,Man,14.0,1.833333,Man,cogproc,,2.61
8,1016110.0,Man,14.0,1.833333,Man,curiosity,2.0,0.0
9,1016110.0,Man,14.0,1.833333,Man,emo_neg,,0.54


In [37]:
# We repeat the same thing for Perceived Personalization

# Columns to melt
columns = [
    "cogproc", "emo_pos", "emo_neg", "allure", "acquire", "BigWords", "want", "WPS",
    "Social", "risk", "reward", "tone_pos", "tone_neg", "need", "lack", "fatigue",
    "Drives", "curiosity", "Conversation", "fulfill", "intensity"
]

# Additional columns to retain
retain_cols = ["Movez_code_x", "Sex", "age", "FAS", "Gender"]

# Melt eff_ columns
pp_columns = [col for col in complete_df.columns if col.startswith("pp_")]
df_pp = complete_df.melt(
    id_vars=retain_cols,
    value_vars=pp_columns,
    var_name="Linguistic Category",
    value_name="PP value"
)
df_pp["Linguistic Category"] = df_pp["Linguistic Category"].str.replace("pp_", "", regex=False)

# Melt _chosenmatch_z columns
chosenmatch_columns = [col for col in complete_df.columns if col.endswith("_chosenmatch")]
df_chosen = complete_df.melt(
    id_vars=retain_cols,
    value_vars=chosenmatch_columns,
    var_name="Linguistic Category",
    value_name="Similarity: CHOSEN"
)
df_chosen["Linguistic Category"] = df_chosen["Linguistic Category"].str.replace("_chosenmatch", "", regex=False)

# Combine melted DataFrames
df_long_pp = pd.merge(df_pp, df_chosen, on=retain_cols + ["Linguistic Category"], how="outer")


In [38]:
df_long_pp.head()

Unnamed: 0,Movez_code_x,Sex,age,FAS,Gender,Linguistic Category,PP value,Similarity: CHOSEN
0,1016110.0,Man,14.0,1.833333,Man,BigWords,,8.42
1,1016110.0,Man,14.0,1.833333,Man,Conversation,3.0,3.8
2,1016110.0,Man,14.0,1.833333,Man,Drives,4.0,4.08
3,1016110.0,Man,14.0,1.833333,Man,Social,,3.67
4,1016110.0,Man,14.0,1.833333,Man,WPS,2.0,114.34


In [39]:
#Again we transform the scores substracting from 100

df_long_pp["Similarity: CHOSEN_100"] = 100 - df_long_pp["Similarity: CHOSEN"]
df_long_eff["Similarity: CHOSEN_100"] = 100 - df_long_eff["Similarity: CHOSEN"]

In [40]:
df_long_pp.head()

Unnamed: 0,Movez_code_x,Sex,age,FAS,Gender,Linguistic Category,PP value,Similarity: CHOSEN,Similarity: CHOSEN_100
0,1016110.0,Man,14.0,1.833333,Man,BigWords,,8.42,91.58
1,1016110.0,Man,14.0,1.833333,Man,Conversation,3.0,3.8,96.2
2,1016110.0,Man,14.0,1.833333,Man,Drives,4.0,4.08,95.92
3,1016110.0,Man,14.0,1.833333,Man,Social,,3.67,96.33
4,1016110.0,Man,14.0,1.833333,Man,WPS,2.0,114.34,-14.34


In [41]:
#df_long_eff.to_csv("eff_long.csv")
#df_long_pp.to_csv("pp_long.csv")