In [None]:
# Examines, with the Kruskal-Wallis H test, whether decision time differs significantly across the six post types (independent variables) or across the four levels of trust (dependent variable).
# Figure. Decision Time Distribution by Posts (Veracity * Modality).


import pandas as pd
from scipy.stats import kruskal, mannwhitneyu
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
import io

# Step 1: Upload the data file (it is parsed as CSV in Step 2)
uploaded = files.upload()

# Step 2: Read the first uploaded file as CSV
filename = list(uploaded.keys())[0]
df = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='ISO-8859-1')

# Step 3: Extract the relevant columns
categories = [
    'Trusted_Headline_with_map', 'Suspicious_Headline_with_map',
    'Trusted_Headline_with_photo', 'Suspicious_Headline_with_photo',
    'Trusted_Headline_only', 'Suspicious_Headline_only'
]
df_relevant = df[['First Time to Interact (MS)'] + categories]

# Step 4: Convert non-numeric data to NaN and drop missing values
df_relevant_cleaned = df_relevant.apply(pd.to_numeric, errors='coerce').dropna()

# Step 5: Remove outliers where 'First Time to Interact (MS)' > 150000.
# .copy() detaches the filtered frame so the 'category' column added in Step 6
# is written to an independent DataFrame rather than to a slice.
df_relevant_cleaned = df_relevant_cleaned[
    df_relevant_cleaned['First Time to Interact (MS)'] <= 150000
].copy()

# Step 6: Create a new 'category' column to indicate the type
category_map = {
    'Trusted_Headline_with_map': 'Trusted_Headline_with_map',
    'Suspicious_Headline_with_map': 'Suspicious_Headline_with_map',
    'Trusted_Headline_with_photo': 'Trusted_Headline_with_photo',
    'Suspicious_Headline_with_photo': 'Suspicious_Headline_with_photo',
    'Trusted_Headline_only': 'Trusted_Headline_only',
    'Suspicious_Headline_only': 'Suspicious_Headline_only'
}

# Rows whose indicator columns are all different from 1 keep the label 'None'
df_relevant_cleaned['category'] = 'None'
for col in categories:
    df_relevant_cleaned.loc[df_relevant_cleaned[col] == 1, 'category'] = category_map[col]

# Step 7: Manually specify the correct order for the x-axis labels
category_order = [
    'Trusted_Headline_with_map', 'Suspicious_Headline_with_map',
    'Trusted_Headline_with_photo', 'Suspicious_Headline_with_photo',
    'Trusted_Headline_only', 'Suspicious_Headline_only'
]

# Step 8: Use a serif font for the figure
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'DejaVu Serif'

# Step 9: Generate the violin plot with the correct x-label order
plt.figure(figsize=(14, 7))
violin = sns.violinplot(
    x='category',
    y='First Time to Interact (MS)',
    data=df_relevant_cleaned,
    order=category_order,
    inner=None,
    alpha=0.9, color='#BCC7DA'
)
# The swarm layer must use the same category order as the violins; otherwise
# its points are positioned by the order in which categories appear in the
# data and can end up under the wrong violin.
sns.swarmplot(
    x='category',
    y='First Time to Interact (MS)',
    data=df_relevant_cleaned,
    order=category_order,
    color='k',
    alpha=0.4
)

# Step 10: Calculate mean and median for each category
mean_values = df_relevant_cleaned.groupby('category')['First Time to Interact (MS)'].mean()
median_values = df_relevant_cleaned.groupby('category')['First Time to Interact (MS)'].median()

# Add mean and median lines to the violin plot.
# The loop follows category_order (the plotted order), not the groupby index,
# which is sorted alphabetically and may also contain the 'None' label.
# xmin/xmax are fractions of the axes width; this assumes the x-axis spans
# exactly the len(category_order) category slots (seaborn's default limits).
n_slots = len(category_order)
legend_labelled = False
for i, category in enumerate(category_order):
    if category not in mean_values.index:
        # No rows for this post type after cleaning: nothing to draw
        continue
    left = i / n_slots + 0.05
    right = (i + 1) / n_slots - 0.05
    violin.axhline(mean_values[category], xmin=left, xmax=right, color='blue', linestyle='-', linewidth=2, label="" if legend_labelled else 'Mean')
    violin.axhline(median_values[category], xmin=left, xmax=right, color='green', linestyle='--', linewidth=2, label="" if legend_labelled else 'Median')
    legend_labelled = True

# Step 11: Add a legend and labels
# plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))
plt.ylabel('User Decision Time (MS)')
plt.ylim(0, 100000)
plt.title('User Decision Time by Posts (Veracity * Modality)')

# Update x-axis labels with two rows
xtick_labels = [
    'Trusted\nHeadline with map', 'Suspicious\nHeadline with map',
    'Trusted\nHeadline with photo', 'Suspicious\nHeadline with photo',
    'Trusted\nHeadline only', 'Suspicious\nHeadline only'
]
plt.xticks(ticks=range(len(category_order)), labels=xtick_labels, rotation=0)

plt.show()

# Step 12: Display statistical description (mean, median, etc.) for each category
stat_description = df_relevant_cleaned.groupby('category')['First Time to Interact (MS)'].describe()
print(stat_description)


In [None]:
# Examines, with the Kruskal-Wallis H test, whether decision time differs significantly across the four levels of trust (dependent variable).
# Figure. Decision time for four levels of trust

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files
import io

# Step 1: Upload the Excel file
uploaded = files.upload()

# Step 2: Read the first uploaded Excel file
filename = list(uploaded.keys())[0]
df = pd.read_excel(io.BytesIO(uploaded[filename]))

# Step 3: Extract the relevant columns
df_relevant = df[['First Time to Interact (MS)', 'Trust']]

# Step 4: Convert non-numeric data to NaN and drop missing values
df_relevant_cleaned = df_relevant.apply(pd.to_numeric, errors='coerce').dropna()

# Step 5: Remove outliers where 'First Time to Interact (MS)' > 150000.
# .copy() detaches the filtered frame so the 'Trust Label' column added below
# is written to an independent DataFrame rather than to a slice.
df_relevant_cleaned = df_relevant_cleaned[
    df_relevant_cleaned['First Time to Interact (MS)'] <= 150000
].copy()

# Seaborn style
sns.set(style="whitegrid")

# Serif font
plt.rcParams['font.family'] = 'serif'

# Numeric trust codes -> display labels; codes missing from the mapping become NaN
trust_mapping = {-2: "Unreliable", -1: "Skeptical", 1: "Fairly Reliable", 2: "Fully Reliable"}
df_relevant_cleaned['Trust Label'] = df_relevant_cleaned['Trust'].map(trust_mapping)

# Plotting order of the trust levels on the x-axis
order = ["Unreliable", "Skeptical", "Fairly Reliable", "Fully Reliable"]

# Create the violin plot with the individual observations overlaid
plt.figure(figsize=(12, 6))
violin = sns.violinplot(x='Trust Label', y='First Time to Interact (MS)', data=df_relevant_cleaned, inner=None, alpha=0.4, order=order)
sns.swarmplot(x='Trust Label', y='First Time to Interact (MS)', data=df_relevant_cleaned, color='k', alpha=0.4, order=order)

# Calculate the mean and median per trust level
mean_values = df_relevant_cleaned.groupby('Trust Label')['First Time to Interact (MS)'].mean()
median_values = df_relevant_cleaned.groupby('Trust Label')['First Time to Interact (MS)'].median()

# Draw the mean/median markers in plotting order.
# xmin/xmax are fractions of the axes width; this assumes the x-axis spans
# exactly the len(order) category slots (seaborn's default limits).
n_slots = len(order)
legend_labelled = False
for i, trust_label in enumerate(order):
    if trust_label not in mean_values.index:
        # A trust level with no rows after cleaning has no statistics to draw
        continue
    left = i / n_slots + 0.05
    right = (i + 1) / n_slots - 0.05
    violin.axhline(mean_values[trust_label], xmin=left, xmax=right, color='blue', linestyle='-', linewidth=2, label="" if legend_labelled else 'Mean')
    violin.axhline(median_values[trust_label], xmin=left, xmax=right, color='green', linestyle='--', linewidth=2, label="" if legend_labelled else 'Median')
    legend_labelled = True

# # add legend
# plt.legend(loc='upper right')

# Add axis labels and clip the y-axis
plt.xlabel('')
plt.ylabel('First Time to Interact (MS)')
plt.ylim(0, 100000)

plt.show()


In [None]:
#Main effects

import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files

# ----------------------------------------
# STEP 1: Load your dataset
# ----------------------------------------
uploaded = files.upload()
df = pd.read_csv('open_data.csv', encoding='ISO-8859-1')  # Adjust file name if necessary

# Define independent variables and dependent variable
independent_vars = [
    'Trusted_Headline_with_map',
    'Suspicious_Headline_with_map',
    'Suspicious_Headline_with_photo',
    'Trusted_Headline_only',
    'Suspicious_Headline_only',
    'Interest_Maps',
    'Proficiency_Maps',
    'Experienced_Climate',
    'Interest_Climate',
    'Trust_Researchers',
    'Trust_Climate'
]

dependent_var = 'Trust'

# Check for missing columns
missing_columns = [col for col in independent_vars + [dependent_var] if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing columns in dataset: {missing_columns}")

# Extract X (independent variables) and Y (dependent variable)
X = df[independent_vars].values
Y = df[dependent_var].values.reshape(-1, 1)

# ----------------------------------------
# STEP 2: Scaling
# ----------------------------------------
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
Y_scaled = scaler_Y.fit_transform(Y)

# ----------------------------------------
# STEP 3: Fit PLS Regression
# ----------------------------------------
# One setting shared by the main fit and every bootstrap refit, so the
# standard errors and confidence intervals describe the same model as the
# reported coefficients.
n_components = 5  # Using 5 components for the analysis
pls = PLSRegression(n_components=n_components)
pls.fit(X_scaled, Y_scaled)

# Predict on the training set to calculate R² and MSE
Y_pred_scaled = pls.predict(X_scaled)
Y_pred = scaler_Y.inverse_transform(Y_pred_scaled)

mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)

# ----------------------------------------
# STEP 4: Print Performance Metrics
# ----------------------------------------
print("In-Sample MSE:", mse)
print("In-Sample R²:", r2)

# ----------------------------------------
# STEP 5: Compute B (Unstandardized Coefficients), β (Standardized Coefficients), and S.E.
# ----------------------------------------
# Unstandardized coefficients: undo the X and Y standardization
B = scaler_Y.scale_ * pls.coef_.flatten() / scaler_X.scale_

# Standardized coefficients (the model was fitted on standardized X and Y)
beta = pls.coef_.flatten()

# Bootstrap to calculate standard errors
n_bootstraps = 1000
n_samples = X_scaled.shape[0]
boot_coefs = np.zeros((n_bootstraps, len(independent_vars)))

# Fixed seed so the resampling is reproducible
rng = np.random.default_rng(seed=42)

for i in range(n_bootstraps):
    # Resample rows with replacement, keeping each X row paired with its Y
    indices = rng.integers(0, n_samples, n_samples)
    X_boot = X_scaled[indices, :]
    Y_boot = Y_scaled[indices, :]
    pls_boot = PLSRegression(n_components=n_components)
    pls_boot.fit(X_boot, Y_boot)
    boot_coefs[i, :] = pls_boot.coef_.ravel()

# Standard errors (S.E.): spread of each coefficient across bootstrap refits
se = np.std(boot_coefs, axis=0)

# ----------------------------------------
# STEP 6: Confidence Intervals
# ----------------------------------------
def compute_ci(coefs, level=95):
    """Return the two-sided percentile interval of bootstrap draws.

    Args:
        coefs: Array-like of draws; percentiles are taken along axis 0.
        level: Confidence level in percent (e.g. 95, 99, 99.9).

    Returns:
        A ``(lower, upper)`` pair holding the ``(100 - level) / 2`` and
        ``100 - (100 - level) / 2`` percentiles.
    """
    tail = (100 - level) / 2.0
    lower, upper = np.percentile(coefs, [tail, 100 - tail], axis=0)
    return lower, upper

# Percentile intervals of the bootstrap coefficients at each reporting level
ci_95_lower, ci_95_upper = compute_ci(boot_coefs, 95)
ci_99_lower, ci_99_upper = compute_ci(boot_coefs, 99)
ci_999_lower, ci_999_upper = compute_ci(boot_coefs, 99.9)

# ----------------------------------------
# STEP 7: Results Table
# ----------------------------------------
# Star labels paired with their interval bounds, strictest level first, so
# the first interval that excludes zero decides the label.
star_levels = [
    ("***", ci_999_lower, ci_999_upper),
    ("**", ci_99_lower, ci_99_upper),
    ("*", ci_95_lower, ci_95_upper),
]

results = []
for idx, feature in enumerate(independent_vars):
    # Determine significance: empty string when every interval contains zero
    significance = ""
    for stars, low, high in star_levels:
        if low[idx] > 0 or high[idx] < 0:
            significance = stars
            break

    row = {
        'Feature': feature,
        'B (Unstandardized)': B[idx],
        'β (Standardized)': beta[idx],
        'S.E.': se[idx],
        '95% CI': (ci_95_lower[idx], ci_95_upper[idx]),
        '99% CI': (ci_99_lower[idx], ci_99_upper[idx]),
        '99.9% CI': (ci_999_lower[idx], ci_999_upper[idx]),
        'Significance': significance
    }
    results.append(row)

df_results = pd.DataFrame(results)

# Display the results with four decimals for float columns
pd.set_option('display.float_format', '{:.4f}'.format)
column_order = [
    'Feature', 'B (Unstandardized)', 'β (Standardized)', 'S.E.',
    '95% CI', '99% CI', '99.9% CI', 'Significance'
]
print(df_results[column_order])


In [None]:
#path a

import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files

# ----------------------------------------
# STEP 1: Load your dataset
# ----------------------------------------
uploaded = files.upload()
df = pd.read_csv('open_data.csv', encoding='ISO-8859-1')  # Adjust if necessary

# Post-type indicator columns used as predictors
categories = [
    'Trusted_Headline_with_map',
    'Suspicious_Headline_with_map',
    # 'Trusted_Headline_with_photo',
    'Suspicious_Headline_with_photo',
    'Trusted_Headline_only',
    'Suspicious_Headline_only'
]

moderators = [
    'Interest_Maps',
    'Proficiency_Maps',
    'Experienced_Climate',
    'Interest_Climate',
    'Trust_Researchers',
    'Trust_Climate'
]

# Check required columns. Stop here with a clear message: every listed column
# is indexed below, so continuing would only fail later with a KeyError.
missing_categories = [c for c in categories if c not in df.columns]
missing_moderators = [m for m in moderators if m not in df.columns]
if missing_categories or missing_moderators:
    raise ValueError(
        f"Missing category columns: {missing_categories}; "
        f"missing moderator columns: {missing_moderators}"
    )

if 'Time' not in df.columns:
    raise ValueError("The column 'Time' is not found in the dataset.")

# ----------------------------------------
# STEP 2: Create Interaction Terms for Moderation
# ----------------------------------------
# One product column per (post type, moderator) pair
interaction_terms = []
for c in categories:
    for m in moderators:
        inter_name = f"{c}_x_{m}"
        df[inter_name] = df[c] * df[m]
        interaction_terms.append(inter_name)

X_features = categories + moderators + interaction_terms
X = df[X_features].values
Y = df["Time"].values.reshape(-1, 1)  # "Time" as the dependent variable

# ----------------------------------------
# STEP 3: Scaling the Entire Dataset
# ----------------------------------------
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
Y_scaled = scaler_Y.fit_transform(Y)

# ----------------------------------------
# STEP 4: Fit the PLS Model on the Entire Dataset
# ----------------------------------------
# Number of PLS components, shared by the main fit and every bootstrap refit.
# TODO: set this to the value selected for the analysis; 2 is scikit-learn's
# default for PLSRegression.
n_components = 2
pls = PLSRegression(n_components=n_components)
pls.fit(X_scaled, Y_scaled)

# Predict on the same training data
Y_pred_scaled = pls.predict(X_scaled)
Y_pred = scaler_Y.inverse_transform(Y_pred_scaled)

mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)

print("In-Sample MSE:", mse)
print("In-Sample R²:", r2)

# Calculate unstandardized coefficients (B): undo the X and Y standardization
B = scaler_Y.scale_ * pls.coef_.flatten() / scaler_X.scale_

# Standardized coefficients (β)
beta = pls.coef_.flatten()

# ----------------------------------------
# STEP 5: Bootstrap for Confidence Intervals
# ----------------------------------------
n_bootstraps = 1000
n_samples = X_scaled.shape[0]
boot_coefs = []

# Fixed seed so the resampling is reproducible
rng = np.random.default_rng(seed=42)

for i in range(n_bootstraps):
    # Sample with replacement from the entire dataset
    indices = rng.integers(0, n_samples, n_samples)
    X_boot = X_scaled[indices, :]
    y_boot = Y_scaled[indices, :]

    # Fit PLS on the bootstrap sample with the same number of components
    pls_boot = PLSRegression(n_components=n_components)
    pls_boot.fit(X_boot, y_boot)
    boot_coefs.append(pls_boot.coef_.ravel())

boot_coefs = np.array(boot_coefs)  # shape: (n_bootstraps, n_features)

# Standard errors (S.E.): spread of each coefficient across bootstrap refits
se = np.std(boot_coefs, axis=0)

# Compute confidence intervals
def compute_ci(coefs, level=95):
    """Compute a two-sided percentile confidence interval along axis 0.

    Args:
        coefs: Array-like of bootstrap draws, one draw per row.
        level: Confidence level in percent; the excluded mass is split
            equally between the two tails.

    Returns:
        Tuple ``(lower, upper)`` of the lower-tail and upper-tail percentiles.
    """
    half_tail = (100 - level) / 2.0
    bounds = tuple(
        np.percentile(coefs, q, axis=0) for q in (half_tail, 100 - half_tail)
    )
    return bounds

# Percentile intervals of the bootstrap coefficients at each reporting level
ci_95_lower, ci_95_upper = compute_ci(boot_coefs, 95)
ci_99_lower, ci_99_upper = compute_ci(boot_coefs, 99)
ci_999_lower, ci_999_upper = compute_ci(boot_coefs, 99.9)

# ----------------------------------------
# STEP 6: Results Table
# ----------------------------------------
# Star labels with their interval bounds, strictest level first
star_levels = (
    ("***", ci_999_lower, ci_999_upper),
    ("**", ci_99_lower, ci_99_upper),
    ("*", ci_95_lower, ci_95_upper),
)

results = []
for idx, feature in enumerate(X_features):
    # Determine significance: first (strictest) interval that excludes zero,
    # or an empty string when all three contain zero
    significance = next(
        (stars for stars, low, high in star_levels if low[idx] > 0 or high[idx] < 0),
        "",
    )

    # Determine direction from the sign of the standardized coefficient
    if beta[idx] > 0:
        direction = "Positive"
    else:
        direction = "Negative"

    row = {
        'Variable': feature,
        'Coefficient (β: Standardized)': beta[idx],
        'B (Unstandardized)': B[idx],
        'S.E.': se[idx],
        'Direction': direction,
        '95% CI': f"[{ci_95_lower[idx]:.4f}, {ci_95_upper[idx]:.4f}]",
        '99% CI': f"[{ci_99_lower[idx]:.4f}, {ci_99_upper[idx]:.4f}]",
        '99.9% CI': f"[{ci_999_lower[idx]:.4f}, {ci_999_upper[idx]:.4f}]",
        'Significance': significance
    }
    results.append(row)

df_results = pd.DataFrame(results)

# ----------------------------------------
# STEP 7: Save to CSV
# ----------------------------------------
# NOTE: the "path b" cell in this file writes to the same file name, so
# running both in one working directory overwrites the earlier output.
output_file = "PLS_Results.csv"
df_results.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")



In [None]:
#path b

import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from google.colab import files

# ----------------------------------------
# STEP 1: Load your dataset
# ----------------------------------------
uploaded = files.upload()
df = pd.read_csv('open_data.csv', encoding='ISO-8859-1')  # Adjust if necessary

# Post-type indicator columns used as predictors
categories = [
    'Trusted_Headline_with_map',
    'Suspicious_Headline_with_map',
    'Suspicious_Headline_with_photo',
    'Trusted_Headline_only',
    'Suspicious_Headline_only'
]

moderators = [
    'Interest_Maps',
    'Proficiency_Maps',
    'Experienced_Climate',
    'Interest_Climate',
    'Trust_Researchers',
    'Trust_Climate'
]

# Check required columns. Stop here with a clear message: every listed column
# is indexed below, so continuing would only fail later with a KeyError.
missing_categories = [c for c in categories if c not in df.columns]
missing_moderators = [m for m in moderators if m not in df.columns]
if missing_categories or missing_moderators:
    raise ValueError(
        f"Missing category columns: {missing_categories}; "
        f"missing moderator columns: {missing_moderators}"
    )

if 'Time' not in df.columns:
    raise ValueError("The column 'Time' is not found in the dataset.")

if 'Trust' not in df.columns:
    raise ValueError("The column 'Trust' is not found in the dataset.")

# ----------------------------------------
# STEP 2: Create Interaction Terms for Moderation and Mediation
# ----------------------------------------
# One product column per (post type, moderator) pair
interaction_terms = []
for c in categories:
    for m in moderators:
        inter_name = f"{c}_x_{m}"
        df[inter_name] = df[c] * df[m]
        interaction_terms.append(inter_name)

# One product column per moderator with 'Time'
mediator_interactions = []
for m in moderators:
    inter_name = f"Time_x_{m}"
    df[inter_name] = df['Time'] * df[m]
    mediator_interactions.append(inter_name)

X_features = categories + moderators + interaction_terms + ['Time'] + mediator_interactions
X = df[X_features].values
Y = df["Trust"].values.reshape(-1, 1)  # "Trust" as the dependent variable

# ----------------------------------------
# STEP 3: Scaling the entire dataset
# ----------------------------------------
scaler_X = StandardScaler()
scaler_Y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
Y_scaled = scaler_Y.fit_transform(Y)

# ----------------------------------------
# STEP 4: Fit the PLS Model on the entire dataset
# ----------------------------------------
# Number of PLS components, shared by the main fit and every bootstrap refit.
# TODO: set this to the value selected for the analysis; 2 is scikit-learn's
# default for PLSRegression.
n_components = 2
pls = PLSRegression(n_components=n_components)
pls.fit(X_scaled, Y_scaled)

# Predict on the same training data
Y_pred_scaled = pls.predict(X_scaled)
Y_pred = scaler_Y.inverse_transform(Y_pred_scaled)

mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)

print("In-Sample MSE:", mse)
print("In-Sample R²:", r2)

# Calculate unstandardized coefficients (B): undo the X and Y standardization
B = scaler_Y.scale_ * pls.coef_.flatten() / scaler_X.scale_

# Standardized coefficients (β)
beta = pls.coef_.flatten()

# ----------------------------------------
# STEP 5: Bootstrap for Confidence Intervals
# ----------------------------------------
n_bootstraps = 1000
n_samples = X_scaled.shape[0]
boot_coefs = []

# Fixed seed so the resampling is reproducible
rng = np.random.default_rng(seed=42)

for i in range(n_bootstraps):
    # Sample with replacement from the entire dataset
    indices = rng.integers(0, n_samples, n_samples)
    X_boot = X_scaled[indices, :]
    Y_boot = Y_scaled[indices, :]

    # Fit PLS on the bootstrap sample with the same number of components
    pls_boot = PLSRegression(n_components=n_components)
    pls_boot.fit(X_boot, Y_boot)
    boot_coefs.append(pls_boot.coef_.ravel())

boot_coefs = np.array(boot_coefs)  # shape: (n_bootstraps, n_features)

# Standard errors (S.E.): spread of each coefficient across bootstrap refits
se = np.std(boot_coefs, axis=0)

# Compute confidence intervals
def compute_ci(coefs, level=95):
    """Return percentile-based confidence bounds taken along axis 0.

    Args:
        coefs: Array-like of bootstrap draws, one draw per row.
        level: Confidence level in percent (default 95).

    Returns:
        ``(lower, upper)``: the percentiles at ``(100 - level) / 2`` and at
        its complement to 100.
    """
    lower_q = (100 - level) / 2.0
    upper_q = 100 - lower_q
    return (
        np.percentile(coefs, lower_q, axis=0),
        np.percentile(coefs, upper_q, axis=0),
    )

# Percentile intervals of the bootstrap coefficients at each reporting level
ci_95_lower, ci_95_upper = compute_ci(boot_coefs, 95)
ci_99_lower, ci_99_upper = compute_ci(boot_coefs, 99)
ci_999_lower, ci_999_upper = compute_ci(boot_coefs, 99.9)

# ----------------------------------------
# STEP 6: Results Table
# ----------------------------------------
# Star labels with their interval bounds, checked from the strictest level down
star_levels = [
    ("***", ci_999_lower, ci_999_upper),
    ("**", ci_99_lower, ci_99_upper),
    ("*", ci_95_lower, ci_95_upper),
]

results = []
for idx, feature in enumerate(X_features):
    # Determine significance: the first interval that excludes zero wins;
    # the label stays empty when every interval contains zero
    significance = ""
    for stars, low, high in star_levels:
        excludes_zero = low[idx] > 0 or high[idx] < 0
        if excludes_zero:
            significance = stars
            break

    # Determine direction from the sign of the standardized coefficient
    is_positive = beta[idx] > 0
    direction = "Positive" if is_positive else "Negative"

    results.append(
        {
            'Variable': feature,
            'Coefficient (β: Standardized)': beta[idx],
            'B (Unstandardized)': B[idx],
            'S.E.': se[idx],
            'Direction': direction,
            '95% CI': f"[{ci_95_lower[idx]:.4f}, {ci_95_upper[idx]:.4f}]",
            '99% CI': f"[{ci_99_lower[idx]:.4f}, {ci_99_upper[idx]:.4f}]",
            '99.9% CI': f"[{ci_999_lower[idx]:.4f}, {ci_999_upper[idx]:.4f}]",
            'Significance': significance
        }
    )

df_results = pd.DataFrame(results)

# ----------------------------------------
# STEP 7: Save to CSV
# ----------------------------------------
# NOTE: the "path a" cell in this file writes to the same file name, so
# running both in one working directory overwrites the earlier output.
output_file = "PLS_Results.csv"
df_results.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
