In [None]:
## check assumptions before computing PCA

## start by parsing the data (the example here uses the portrayed emotion data)
## you will need: portrayed_emotions_git.csv and
## viewer_emotions_git.csv

import re, numpy as np
import pandas as pd

# ------------------------------------------------------------------
# A. read the raw ratings file as-is; cells that hold several numbers
#    are still unparsed at this point and are averaged in step B
df = pd.read_csv("portrayed_emotions_git.csv")

# ------------------------------------------------------------------
# B. convert cells that contain several numbers to their mean
#    – looks for commas, semicolons, pipes, or whitespace as separators
#    – single numbers given as text are converted to float
#    – values that are not strings are returned untouched
#    – returns NaN on parsing problems so they can be handled later
# ------------------------------------------------------------------
_multi_val_pattern = re.compile(r'[,\|;]|\s+')

def _average_if_multi(x):
    """Collapse one cell to a single number.

    Parameters
    ----------
    x : object
        A single cell value: a missing value, a string holding one or
        several numbers, or any other (already parsed) object.

    Returns
    -------
    float or object
        * NaN if ``x`` is missing, contains no number at all (empty,
          whitespace-only or separator-only text), or contains a token
          that ``float()`` cannot parse;
        * the number itself if the string holds exactly one number;
        * the arithmetic mean if the string holds several numbers;
        * ``x`` unchanged if it is not a string.
    """
    if pd.isna(x):
        return np.nan
    if not isinstance(x, str):
        return x                                 # already numeric

    # Adjacent separators (e.g. ", ") produce empty fragments; drop them.
    parts = [p for p in _multi_val_pattern.split(x) if p]
    if not parts:
        # Nothing but separators/whitespace: treat as missing instead of
        # failing on parts[0].
        return np.nan

    try:
        nums = [float(p) for p in parts]
    except ValueError:                           # contains non-numeric text
        return np.nan

    if len(nums) == 1:                           # single value as text
        return nums[0]
    return float(np.mean(nums))                  # multi-value cell

# Parse every cell, column by column.  Series.map is used rather than
# DataFrame.applymap because applymap is deprecated in recent pandas.
df = df.apply(lambda column: column.map(_average_if_multi))

# ------------------------------------------------------------------
# C. keep only numeric variables to analyze
#    Columns that contain nothing but NaN after parsing (for example a
#    free-text column) are removed as well: they carry no information and
#    would otherwise make the row-wise dropna() below discard every row.
df = df.select_dtypes(include="number").dropna(axis="columns", how="all")

# ------------------------------------------------------------------
# D. handle missing values
df = df.dropna()           # or any imputation strategy you prefer

if df.empty:
    raise ValueError(
        "No complete numeric observations are left after parsing; "
        "check the input file or use an imputation strategy."
    )

## now: check assumptions -- with KMO and Bartlett's tests

##KMO test
from factor_analyzer import calculate_kmo

kmo_per_item, kmo_overall = calculate_kmo(df.values)

print(f"Overall KMO = {kmo_overall:0.3f}")          # e.g. 0.77

# to see which items are dragging things down (lowest KMO first):
kmo_table = pd.Series(kmo_per_item, index=df.columns).sort_values()
print(kmo_table)

# Bartlett’s test
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi2, p = calculate_bartlett_sphericity(df)
print(f"Bartlett χ² = {chi2:0.1f}, p = {p:0.3e}")

In [None]:
## run PCA on portrayed and viewer emotions (separately) 
## here for portrayed; switch out csv to run for viewer emotions 

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def parse_and_average(cell):
    """Return the mean of the comma-separated numbers held in ``cell``.

    The cell is converted to text and split on commas only; surrounding
    whitespace and empty fragments are ignored.

    Returns NaN when the cell is missing, blank, holds no number at all,
    or holds any fragment that cannot be parsed as a float.
    """
    if pd.isna(cell):
        return np.nan

    # Strip each comma-separated fragment and discard the empty ones.
    fragments = (piece.strip() for piece in str(cell).strip().split(','))
    tokens = [piece for piece in fragments if piece]
    if not tokens:
        return np.nan

    try:
        values = [float(token) for token in tokens]
    except ValueError:
        # One unparsable fragment invalidates the whole cell.
        return np.nan

    return sum(values) / len(values)

# 1. Read the CSV
df = pd.read_csv("portrayed_emotions_git.csv", header=0)

# 2. Parse multiple values by averaging
df_parsed = df.apply(lambda series: series.map(parse_and_average))

# 3. Convert to numeric-only columns, fill missing data
#    Columns that are entirely NaN after parsing (e.g. free-text columns)
#    are dropped: their mean is NaN too, so they could not be imputed and
#    the NaNs would make the PCA fail.
df_numeric = (
    df_parsed
    .select_dtypes(include=[np.number])
    .dropna(axis="columns", how="all")
)
df_numeric = df_numeric.fillna(df_numeric.mean())   # column-mean imputation

# 4. Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_numeric)

# 5. Perform PCA (all components are kept)
pca = PCA()
pca.fit(scaled_data)
pca_scores = pca.transform(scaled_data)

# Column names shared by the loadings and the scores tables.
pc_names = [f"PC{i+1}" for i in range(pca.n_components_)]
# Slicing (instead of looking up PC1..PC4 by name) also works when the data
# yields fewer than four components.
first_4_names = pc_names[:4]

# 6. Scree plot
fig, ax = plt.subplots()
ax.plot(range(1, pca.n_components_ + 1),
        pca.explained_variance_ratio_,
        marker='o')
ax.set_title("Scree Plot")
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained")
plt.show()

# 7. PCA loadings (rows = input variables, columns = components)
loadings = pd.DataFrame(
    pca.components_.T,
    columns=pc_names,
    index=df_numeric.columns
)
print("PCA Loadings:")
print(loadings)

print("\nExplained Variance Ratio:")
for i, ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {ratio:.4f}")

# Extract and save the first 4 PCs’ loadings

loadings_first_4 = loadings[first_4_names]
loadings_first_4.to_csv("portrayed_first_4_pcs_loadings.csv")
print("\nLoadings for the first 4 PCs saved to 'portrayed_first_4_pcs_loadings.csv'.")

# Create a DataFrame of the first 4 PC scores for each observation
# (one row per input row, in the original row order)

pca_scores_df = pd.DataFrame(pca_scores, columns=pc_names)
pca_scores_first_4 = pca_scores_df[first_4_names]

# Save the first 4 PC scores
pca_scores_first_4.to_csv("portrayed_first_4_pcs_scores.csv", index=False)
print("Scores for the first 4 PCs saved to 'portrayed_first_4_pcs_scores.csv'.")


In [None]:
# goal: compute the pairwise correlations of raters rating the same scene,
# then correct them using the Spearman-Brown prophecy formula
## this is an example for PC1 in portrayed, but the files can be edited for all PCs

## first, the rows of the PC score file have to be assigned the correct labels

import pandas as pd

# 1) Load the PC scores; the layout below expects exactly 512 rows
df = pd.read_csv("portrayed_first_4_pcs_scores.csv")

# Safety check
assert len(df) == 512, "Your CSV must have exactly 512 rows."

# 2) + 3) Derive both indices from the row position in a single step:
#    group_index   : consecutive blocks of 64 rows  -> 0..7
#    couplet_index : consecutive pairs of rows inside a block -> 0..31
df = df.assign(
    group_index=lambda frame: frame.index // 64,
    couplet_index=lambda frame: (frame.index % 64) // 2,
)


# Lookup table: group_index (0..7) -> label name -> list of couplet indices
# (0..31) that carry that label within the group.
# The assignments come from the MATLAB code for peaks and troughs.
# Groups 0-3 only use the AM_* labels and groups 4-7 only use the MD_*
# labels; each non-empty list holds 16 couplet indices, so the two lists of
# a group together cover all 32 couplets.


groups_couplet_labels = {
    0: {  # group_index 0 (rows 0..63)
        "AM_high": [0, 1, 3, 5, 7, 10, 14, 17, 18, 19, 20, 22, 25, 27, 29, 30],
        "AM_low":  [2, 4, 6, 8, 9, 11, 12, 13, 15, 16, 21, 23, 24, 26, 28, 31],
        "MD_high": [],
        "MD_low":  []
    },
    1: {  # group_index 1 (rows 64..127)
        "AM_high": [0, 3, 5, 7, 10, 12, 13, 16, 18, 19, 20, 24, 26, 28, 29, 30],
        "AM_low":  [1, 2, 4, 6, 8, 9, 11, 14, 15, 17, 21, 22, 23, 25, 27, 31],
        "MD_high": [],
        "MD_low":  []
    },
    2: {
        "AM_high": [1, 2, 3, 7, 9, 13, 16, 17, 19, 20, 22, 23, 25, 28, 29, 30],
        "AM_low":  [0, 4, 5, 6, 8, 10, 11, 12, 14, 15, 18, 21, 24, 26, 27, 31],
        "MD_high": [],
        "MD_low":  []
    },
    3: {
        "AM_high": [0, 3, 4, 5, 6, 9, 10, 12, 14, 17, 18, 21, 23, 27, 28, 30],
        "AM_low":  [1, 2, 7, 8, 11, 13, 15, 16, 19, 20, 22, 24, 25, 26, 29, 31],
        "MD_high": [],
        "MD_low":  []
    },
    4: {
        "AM_high": [],
        "AM_low":  [],
        "MD_high": [2, 4, 5, 6, 8, 9, 11, 13, 15, 16, 18, 20, 21, 22, 24, 29],
        "MD_low":  [0, 1, 3, 7, 10, 12, 14, 17, 19, 23, 25, 26, 27, 28, 30, 31]
    },
    5: {
        "AM_high": [],
        "AM_low":  [],
        "MD_high": [1, 2, 4, 5, 6, 9, 10, 11, 14, 20, 21, 23, 24, 25, 26, 29],
        "MD_low":  [0, 3, 7, 8, 12, 13, 15, 16, 17, 18, 19, 22, 27, 28, 30, 31]
    },
    6: {
        "AM_high": [],
        "AM_low":  [],
        "MD_high": [0, 3, 4, 8, 10, 11, 13, 14, 15, 20, 21, 23, 26, 27, 28, 29],
        "MD_low":  [1, 2, 5, 6, 7, 9, 12, 16, 17, 18, 19, 22, 24, 25, 30, 31]
    },
    7: {
        "AM_high": [],
        "AM_low":  [],
        "MD_high": [2, 4, 7, 8, 9, 10, 14, 15, 17, 18, 20, 21, 23, 24, 28, 30],
        "MD_low":  [0, 1, 3, 5, 6, 11, 12, 13, 16, 19, 22, 25, 26, 27, 29, 31]
    }
}


# 4) Build a 'label' column based on group_index and couplet_index
#    using the dictionary above.

# Every row starts unlabeled; rows matched below receive their label name.
df['label'] = None

for group_idx in groups_couplet_labels:
    # Rows of this group are the same for all four labels, so select them once.
    in_group = df['group_index'] == group_idx
    for label_name in groups_couplet_labels[group_idx]:
        couplets_list = groups_couplet_labels[group_idx][label_name]
        in_couplets = df['couplet_index'].isin(couplets_list)
        df.loc[in_group & in_couplets, 'label'] = label_name


# 5) Save to a new CSV and print
df.to_csv("portrayed_PC_scores_labeled_by_movie.csv", index=False)
print(df.head(70))


In [None]:
# now we can compute the pairwise correlations of raters rating the same scene, 
# then correct using Spearman-Brown prophecy formula 
## (again: this is an example for PC1 in portrayed, but files can be edited for all PCs)

import pandas as pd
from scipy.stats import pearsonr

def eight_pairwise_correlations(
    csv_path: str,
    column_name: str,
    *,
    output_csv_path: str | None = None,
) -> pd.DataFrame:
    """
    Compute eight Pearson correlations down a single column (512 rows).

    For each 64‑row block:
      even‑indexed rows (0,2,…,62)  vs.  odd‑indexed rows (1,3,…,63)

    Parameters
    ----------
    csv_path          Path to the source CSV.
    column_name       Header label of the column to analyse.
    output_csv_path   If provided, write the 8‑row results table to this file.

    Returns
    -------
    pandas.DataFrame with columns: block, pearson_r, p_value
    """
    # ---- read and pull the chosen column ----
    col = pd.read_csv(csv_path)[column_name].to_numpy(dtype=float)

    if col.size < 512:
        raise ValueError(f"Column '{column_name}' has {col.size} rows; need ≥ 512.")

    # ---- crunch the eight correlations ----
    rows_per_block = 64
    results = []

    for block in range(8):
        seg = col[block * rows_per_block : (block + 1) * rows_per_block]

        even = seg[::2]   # 32 values
        odd  = seg[1::2]  # 32 values

        if even.size != 32 or odd.size != 32:
            raise ValueError(f"Block {block+1} doesn’t have 32 values per group.")

        r, p = pearsonr(even, odd)
        results.append({"block": block + 1, "pearson_r": r, "p_value": p})

    out_df = pd.DataFrame(results)

    # save -- to then do SB correction below
    if output_csv_path:
        out_df.to_csv(output_csv_path, index=False)
        print(f"Results saved to '{output_csv_path}'")

    return out_df

# Principal component to analyse.  Every file name below is derived from it,
# so the correlation table written by the first step is always the one that
# the Spearman-Brown step reads back.  Change only this value to run the
# analysis for another PC.
PC_NAME = "PC1"
LABELED_SCORES_CSV = "portrayed_PC_scores_labeled_by_movie.csv"
PEARSON_CORRS_CSV = f"{PC_NAME}_portrayed_pearson_corrs.csv"
SB_CORRECTED_CSV = f"portrayed_SB_corrected_{PC_NAME}_pearson_correlations.csv"

if __name__ == "__main__":
    tbl = eight_pairwise_correlations(
        LABELED_SCORES_CSV,
        column_name=PC_NAME,              # <- just the header label
        # Keep this file: the Spearman-Brown step below loads it.
        output_csv_path=PEARSON_CORRS_CSV,
    )
    print(tbl)


## then correct with SB
import pandas as pd

# Load the per-block correlations written above
df = pd.read_csv(PEARSON_CORRS_CSV)

# Calculate Spearman-Brown (2-rater) corrected reliability: 2r / (1 + r).
sb_values = (2 * df["pearson_r"]) / (1 + df["pearson_r"])

# Insert it right after the 'pearson_r' column.
df.insert(
    loc=df.columns.get_loc("pearson_r") + 1,  # position right after
    column="sb_corrected",
    value=sb_values
)

# Save to a new CSV that includes the new column
df.to_csv(SB_CORRECTED_CSV, index=False)

# view
print(df.head())

