In [1]:
# Import necessary libraries and modules
import pandas as pd
import numpy as np

import pingouin as pg
from scipy.stats import chi2_contingency

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the cleaned per-cohort tables and the picture table
df4 = pd.read_csv('../Data/clean/Cohort4Clean.csv')
df5 = pd.read_csv('../Data/clean/Cohort5Clean.csv')
pics = pd.read_csv('../pics/pics.csv')

# Stack both cohorts and keep a single row per 'ID'. drop_duplicates keeps
# the first occurrence, so for an ID present in both files the cohort-4 row
# is the one retained.
df = pd.concat([df4, df5]).drop_duplicates(subset=['ID'])

# Report the number of unique participants (row count of the merged frame)
print("total number of participant: ", len(df))

total number of participant:  1380


# TEC vs PTSD

## Number of pPTSD

In [3]:
# Rows with a non-missing 'Age' in each PTSD group
df.groupby('PTSD')['Age'].count()

PTSD
False    925
True     455
Name: Age, dtype: int64

## Age

In [4]:
# Age by PTSD group: mean, standard deviation, and a two-sample t-test
# (PTSD == True vs. PTSD == False).
# 'Age' is selected *before* aggregating: reducing the whole frame first
# makes pandas aggregate every column, which raises a TypeError on
# non-numeric columns in pandas >= 2.0 ('race' is coerced with to_numeric
# elsewhere in this notebook, so it is not purely numeric) and wastes work
# on columns that are discarded anyway.
age_by_ptsd = df.groupby('PTSD')['Age']
print(age_by_ptsd.mean())
print(age_by_ptsd.std())
pg.ttest(x=df.loc[df['PTSD'] == True, 'Age'], y=df.loc[df['PTSD'] == False, 'Age'])

PTSD
False    41.463784
True     36.087912
Name: Age, dtype: float64
PTSD
False    13.330872
True     10.914413
Name: Age, dtype: float64


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-7.979082,1079.231999,two-sided,3.74408e-15,"[-6.7, -4.05]",0.427128,1527000000000.0,1.0


## Sex

In [5]:
# Participant counts (non-missing 'ID') for every (PTSD group, Sex code) pair
df_s = df.groupby(['PTSD', 'Sex'])['ID'].count().reset_index()

# Percentage of Sex code 2 (the code this cell reports as "females") within
# each PTSD group. Counts are looked up by label instead of by fixed row
# position, so the percentages stay correct if a PTSD/Sex combination is
# absent or an additional Sex code shows up in the data.
group_totals = df_s.groupby('PTSD')['ID'].sum()
female_counts = df_s.loc[df_s['Sex'] == 2].groupby('PTSD')['ID'].sum()
pct_female = female_counts.reindex(group_totals.index, fill_value=0) / group_totals * 100
print("% females TEC: {:.2f},\n% females PTSD: {:.2f}".format(
    pct_female.loc[False],
    pct_female.loc[True]))
df_s

% females TEC: 46.16,
% females PTSD: 50.55


Unnamed: 0,PTSD,Sex,ID
0,False,1.0,498
1,False,2.0,427
2,True,1.0,225
3,True,2.0,230


In [6]:
# Counts cross-tabulated by label: rows = PTSD group, columns = Sex code.
# Building the table from df_s with pivot_table (as the other chi-squared
# cells in this notebook do) avoids relying on fixed row positions, which
# would silently put counts in the wrong cell if a group were missing.
contingency_table = pd.pivot_table(df_s, values='ID', index='PTSD', columns='Sex', aggfunc='sum', fill_value=0)

# Perform the Chi-squared test of independence between PTSD group and Sex
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test: {:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test: 2.18, p-value: 0.14


## PCL-5 scores

In [7]:
# PCL score by PTSD group: mean, standard deviation, minimum, maximum, and a
# two-sample t-test (PTSD == True vs. PTSD == False).
# 'PCL' is selected *before* aggregating: reducing the whole frame first
# makes pandas aggregate every column, which raises a TypeError on
# non-numeric columns in pandas >= 2.0 and wastes work on columns that are
# discarded anyway.
pcl_by_ptsd = df.groupby('PTSD')['PCL']
print(pcl_by_ptsd.mean())
print(pcl_by_ptsd.std())
print(pcl_by_ptsd.min())
print(pcl_by_ptsd.max())
pg.ttest(x=df.loc[df['PTSD'] == True, 'PCL'], y=df.loc[df['PTSD'] == False, 'PCL'])

PTSD
False    17.825946
True     46.468132
Name: PCL, dtype: float64
PTSD
False    10.138808
True      9.699822
Name: PCL, dtype: float64
PTSD
False     0.0
True     33.0
Name: PCL, dtype: float64
PTSD
False    56.0
True     76.0
Name: PCL, dtype: float64


  bf10 = 1 / ((1 + t**2 / df) ** (-(df + 1) / 2) / integr)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,50.798569,939.745351,two-sided,9.581611000000001e-272,"[27.54, 29.75]",2.865276,inf,1.0


## Emotional Numbing

In [8]:
# EmotionalNumbing score by PTSD group: mean, standard deviation, minimum,
# maximum, and a two-sample t-test (PTSD == True vs. PTSD == False).
# The column is selected *before* aggregating: reducing the whole frame
# first makes pandas aggregate every column, which raises a TypeError on
# non-numeric columns in pandas >= 2.0 and wastes work on columns that are
# discarded anyway.
numbing_by_ptsd = df.groupby('PTSD')['EmotionalNumbing']
print(numbing_by_ptsd.mean())
print(numbing_by_ptsd.std())
print(numbing_by_ptsd.min())
print(numbing_by_ptsd.max())
pg.ttest(
    x=df.loc[df['PTSD'] == True, 'EmotionalNumbing'],
    y=df.loc[df['PTSD'] == False, 'EmotionalNumbing'],
)

PTSD
False    2.950270
True     7.912088
Name: EmotionalNumbing, dtype: float64
PTSD
False    2.630925
True     2.503075
Name: EmotionalNumbing, dtype: float64
PTSD
False    0.0
True     0.0
Name: EmotionalNumbing, dtype: float64
PTSD
False    12.0
True     12.0
Name: EmotionalNumbing, dtype: float64


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,34.035336,944.460483,two-sided,2.4160629999999998e-166,"[4.68, 5.25]",1.916129,3.744e+180,1.0


## Educational background

In [9]:
# Participant counts (non-missing 'ID') per (PTSD group, education level)
df_edu = df.groupby(['PTSD', 'education'])['ID'].count().reset_index()
df_edu

Unnamed: 0,PTSD,education,ID
0,False,1.0,12
1,False,2.0,125
2,False,3.0,189
3,False,4.0,120
4,False,5.0,322
5,False,6.0,155
6,False,7.0,2
7,True,1.0,5
8,True,2.0,85
9,True,3.0,133


In [10]:
# Cross-tabulate the counts: rows = PTSD group, columns = education level;
# combinations that never occur are filled with 0
contingency_table = df_edu.pivot_table(
    index='PTSD',
    columns='education',
    values='ID',
    aggfunc='sum',
    fill_value=0,
)

# Chi-squared test of independence between PTSD group and education level
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test: {:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test: 30.26, p-value: 0.00


In [11]:
# Raw residuals: observed counts minus the counts expected under
# independence (as returned by the preceding chi2_contingency call)
expected_counts = pd.DataFrame(
    expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)
residuals = contingency_table.sub(expected_counts)
print("\nResiduals (Observed - Expected):")
residuals


Residuals (Observed - Expected):


education,1.0,2.0,3.0,4.0,5.0,6.0,7.0
PTSD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,0.605072,-15.76087,-26.833333,8.061594,11.655797,21.612319,0.65942
True,-0.605072,15.76087,26.833333,-8.061594,-11.655797,-21.612319,-0.65942


## Ethnicity

In [12]:
# Participant counts (non-missing 'ID') per (PTSD group, Ethnicity code)
df_eth = df.groupby(['PTSD', 'Ethnicity'])['ID'].count().reset_index()
df_eth

Unnamed: 0,PTSD,Ethnicity,ID
0,False,1.0,90
1,False,2.0,835
2,True,1.0,46
3,True,2.0,409


In [13]:
# Cross-tabulate the counts: rows = PTSD group, columns = Ethnicity code;
# combinations that never occur are filled with 0
contingency_table = df_eth.pivot_table(
    index='PTSD',
    columns='Ethnicity',
    values='ID',
    aggfunc='sum',
    fill_value=0,
)

# Chi-squared test of independence between PTSD group and Ethnicity
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test: {:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test: 0.02, p-value: 0.90


## Race

In [14]:
# Convert 'race' to numeric codes; values that are missing or cannot be
# parsed as a number become NaN and are then assigned code 8
# (presumably an "other/unknown" category -- confirm against the codebook)
race_codes = pd.to_numeric(df['race'], errors='coerce')
df['raceadj'] = race_codes.fillna(8)

# Participant counts (non-missing 'ID') per (PTSD group, adjusted race code)
df_rce = df.groupby(['PTSD', 'raceadj'])['ID'].count().reset_index()
df_rce

Unnamed: 0,PTSD,raceadj,ID
0,False,1.0,741
1,False,2.0,62
2,False,3.0,5
3,False,4.0,41
4,False,6.0,14
5,False,7.0,9
6,False,8.0,53
7,True,1.0,346
8,True,2.0,44
9,True,3.0,4


In [15]:
# Cross-tabulate the counts: rows = PTSD group, columns = adjusted race code;
# combinations that never occur are filled with 0
contingency_table = df_rce.pivot_table(
    index='PTSD',
    columns='raceadj',
    values='ID',
    aggfunc='sum',
    fill_value=0,
)

# Chi-squared test of independence between PTSD group and race
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test: {:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test: 8.17, p-value: 0.23


## Marital status

In [16]:
# Participant counts (non-missing 'ID') per (PTSD group, marital-status code)
df_mrtl = df.groupby(['PTSD', 'marital'])['ID'].count().reset_index()
df_mrtl

Unnamed: 0,PTSD,marital,ID
0,False,1.0,345
1,False,2.0,145
2,False,3.0,14
3,False,4.0,100
4,False,5.0,321
5,True,1.0,135
6,True,2.0,83
7,True,3.0,6
8,True,4.0,44
9,True,5.0,187


In [17]:
# Cross-tabulate the counts: rows = PTSD group, columns = marital-status
# code; combinations that never occur are filled with 0
contingency_table = df_mrtl.pivot_table(
    index='PTSD',
    columns='marital',
    values='ID',
    aggfunc='sum',
    fill_value=0,
)

# Chi-squared test of independence between PTSD group and marital status
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test: {:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test: 10.17, p-value: 0.04


In [18]:
# Raw residuals: observed counts minus the counts expected under
# independence (as returned by the preceding chi2_contingency call)
expected_counts = pd.DataFrame(
    expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)
residuals = contingency_table.sub(expected_counts)
print("\nResiduals (Observed - Expected):")
residuals


Residuals (Observed - Expected):


marital,1.0,2.0,3.0,4.0,5.0
PTSD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
False,23.26087,-7.826087,0.594203,3.478261,-19.507246
True,-23.26087,7.826087,-0.594203,-3.478261,19.507246


# Valid vs Outlier subjects

In [19]:
# Load the per-cohort outlier tables and the picture table
df_o4 = pd.read_csv('../Data/outliers/Cohort4outliers.csv')
df_o5 = pd.read_csv('../Data/outliers/Cohort5outliers.csv')
pics = pd.read_csv('../pics/pics.csv')

# Stack both outlier tables and keep a single row per 'ID'. drop_duplicates
# keeps the first occurrence, so for an ID present in both files the
# cohort-4 row is the one retained.
df_o = pd.concat([df_o4, df_o5]).drop_duplicates(subset=['ID'])

# Report the number of unique outlier participants (row count)
print("number of outliers:", len(df_o))


number of outliers: 83


## Age

In [20]:
# Two-sample t-test on age: retained participants (df) vs. outliers (df_o)
pg.ttest(x=df['Age'], y=df_o['Age'])

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.391569,91.479856,two-sided,0.696288,"[-3.56, 2.39]",0.045562,0.134,0.068792


## Sex

In [21]:
# Difference in sex distribution between valid participants and outliers?
# Each row of the table is [count with Sex == 1, count with any other
# recorded Sex code]. Counts are taken over non-missing Sex values only:
# deriving the second count as "total rows minus Sex == 1" would fold
# participants with a missing Sex into that column.
sex_valid = df['Sex'].dropna()
used = [int((sex_valid == 1).sum()), int((sex_valid != 1).sum())]
sex_outliers = df_o['Sex'].dropna()
out = [int((sex_outliers == 1).sum()), int((sex_outliers != 1).sum())]

# Rows: valid participants, outliers
contingency_table = np.array([used, out])

# Perform the Chi-squared test
chi2, p, dof, expected = chi2_contingency(contingency_table)

print("Chi-squared test:{:.2f}, p-value: {:.2f}".format(chi2, p))

Chi-squared test:2.23, p-value: 0.14


## Extreme responses

In [22]:
# Per participant, count the values equal to 0 or to 100 across the columns
# at positions 3..42 (presumably the rating items, with 0 and 100 as the
# scale endpoints -- TODO confirm the column layout is the same in both frames)
valid_items = df.iloc[:, 3:43]
outlier_items = df_o.iloc[:, 3:43]
df['extreme'] = (valid_items.eq(0) | valid_items.eq(100)).sum(axis=1)
df_o['extreme'] = (outlier_items.eq(0) | outlier_items.eq(100)).sum(axis=1)

# Two-sample t-test on the extreme-response counts: valid vs. outliers
pg.ttest(x=df['extreme'], y=df_o['extreme'])

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.161046,85.510996,two-sided,7.5e-05,"[-6.41, -2.26]",0.74022,487.557,1.0


In [23]:
# Average number of extreme responses per participant in each sample
# (sum of the 'extreme' counts divided by the number of non-missing entries)
mean_extreme_valid = df['extreme'].mean()
mean_extreme_outliers = df_o['extreme'].mean()
print("number of extreme response valid: {:.2f},\nnumber of extreme response outliers: {:.2f}".format(
    mean_extreme_valid,
    mean_extreme_outliers))

number of extreme response valid: 5.23,
number of extreme response outliers: 9.57
