In [None]:
import os
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy import stats
from scipy.stats import spearmanr
import ukbiobank.utils.utils
from ukbiobank.utils import fieldNamesToIds
from ukbiobank.utils import loadCsv
from ukbiobank.utils import addFields

# Upload cognitive performance data (Instance 2)

In [None]:
# Location of the full UK Biobank CSV export; every read in this cell uses it.
csv_path = '/ukbbdata/FULL/ukb.csv'
ukb = ukbiobank.ukbio(ukb_csv=csv_path)

# Cognitive performance field IDs requested for instance 2.
# Defined once so the loadCsv and addFields calls cannot drift apart
# (the second hand-copied list had picked up a duplicate 6348 entry).
cog_fields_i2 = [
    'eid',
    20023,
    4282,
    4285,
    20016,
    20128,
    6348,
    6350,
    6349,
    6351,
    6373,
    6374,
    23323,
    23324,
    21004,
    6383,
    20197,
    20018,
    4291,
    4294,
    4292,
    399,
    400,
]

# Cognitive Performance Instance 2
# Each helper receives its own copy of the list, as it did with the original literals.
# NOTE(review): the keyword is spelled `instance` here and `instances` below, exactly
# as in the original calls — confirm both spellings against the ukbiobank utils API.
df = ukbiobank.utils.utils.loadCsv(ukbio=ukb, fields=list(cog_fields_i2), instance=2)

# Add fields: Cognitive Performance Instance 2
cog_ac_i2 = addFields(ukbio=ukb, df=df, fields=list(cog_fields_i2), instances=2)

# Field IDs are converted to field names at the start of the preprocessing cell,
# so that conversion is not repeated here.

# Picture Vocabulary was not uploaded via UK BB utils, upload them separately
pv = pd.read_csv(csv_path, usecols=['eid', '26302-2.0', '26306-2.0'])

# Preprocess and prepare data

In [None]:
# Translate the field-ID column headers into readable field names
cog_ac_i2_names = ukbiobank.utils.utils.fieldIdsToNames(ukbio=ukb, df=cog_ac_i2)

# Attach the two Picture Vocabulary columns by participant id (default inner join)
# and give them descriptive names
picture_vocab_labels = {
    "26302-2.0": "Specific cognitive ability-2.0",
    "26306-2.0": "Response delay interval-2.0",
}
cog_i2_full = cog_ac_i2_names.merge(pv, on="eid")
cog_2 = cog_i2_full.rename(columns=picture_vocab_labels)

# Persist the combined cognitive data (the index is written as the first CSV column)
cog_2.to_csv('/Cog-Ment/CSVs/cog_2_FULL.csv')

In [None]:
# Inspect data: per-column counts of negative and of zero entries, smallest first
negative_cells = cog_2 < 0
print(negative_cells.sum().sort_values())
print((cog_2 == 0).sum().sort_values())

# Negative entries are treated as missing: overwrite them with NaN (modifies cog_2 in place)
cog_2[negative_cells] = np.nan

In [None]:
# Missing values per column, smallest count first
na_per_column = cog_2.isna().sum()
print(na_per_column.sort_values())

Change the response encoding for Prospective memory: Initial answer.

Target encoding: 1 if the participant correctly touched the orange circle on the first attempt, and 0 if they touched any other shape.

Current (raw) encoding: 0 = blue square, 1 = pink star, 2 = grey cross, 3 = orange circle.

Categorical variables in the target set
1. Prospective memory: Initial answer: 1 = correct, 0 = incorrect
2. Prospective memory: Final attempt correct: 1 = yes, 0 = no
3. Prospective memory result: recoded from the original encoding (0 = instruction not recalled, either skipped or incorrect; 1 = correct recall on first attempt; 2 = correct recall on second attempt) to an ordered encoding (0 = instruction not recalled, either skipped or incorrect; 1 = correct recall on second attempt; 2 = correct recall on first attempt)

In [None]:
# Change response encoding for Prospective memory Initial answer.
# Raw codes identify the shape touched (0-3); only code 3 becomes 1 ("correct"),
# every other shape becomes 0. A single mapping is applied in one pass, so the
# result does not depend on the order in which individual codes are rewritten.
initial_answer_recode = {1: 0, 2: 0, 3: 1}
cog_2["PM: initial answer-2.0"] = cog_2["PM: initial answer-2.0"].replace(initial_answer_recode)

# Prospective memory result: swap codes 1 and 2; code 0 is left unchanged.
# The swap is expressed directly instead of routing one code through a temporary
# value 3, so it no longer relies on 3 being unused in the data.
pm_result_recode = {1: 2, 2: 1}
cog_2["Prospective memory result-2.0"] = cog_2["Prospective memory result-2.0"].replace(pm_result_recode)

# NOTE: this cell overwrites both columns of cog_2, so run it only once per kernel
# session — a second run would recode the already-recoded values.

In [None]:
# Share of missing values per column (in percent), listed from least to most missing
n_rows = len(cog_2)
percent_missing_cog_2 = cog_2.isnull().sum() * 100 / n_rows
missing_value_cog_2 = (
    percent_missing_cog_2
    .to_frame(name='percent_missing')
    .sort_values(by="percent_missing", ascending=True)
)
# Lift the row limit so every column is shown
with pd.option_context('display.max_rows', None):
    display(missing_value_cog_2)

In [None]:
# Per column: total number of rows minus the number of missing entries,
# i.e. how many values are actually present; sorted from fewest to most
dif_missing_cog_2 = len(cog_2) - cog_2.isnull().sum()
dif_missing_cog_2 = (
    dif_missing_cog_2
    .to_frame(name='diff')
    .sort_values(by="diff", ascending=True)
)
# Lift the row limit so every column is shown
with pd.option_context('display.max_rows', None):
    display(dif_missing_cog_2)

In [None]:
# Remove the two "-2.3" pairs-matching columns from the working frame
columns_to_drop = [
    'Time to complete round (Field ID: 400)-2.3',
    'Number of incorrect matches in round (Field ID: 399)-2.3',
]
cog_2_drop = cog_2.drop(columns=columns_to_drop)

# Count exact zeros per remaining column, smallest first
zero_counts = (cog_2_drop == 0).sum().sort_values()
print('Zero values\n', zero_counts)

In [None]:
# Columns in which a value of 0 is treated as missing.
# NOTE(review): for some of these (e.g. the fluid intelligence score) 0 might be a
# legitimate observation — confirm that treating it as missing is intended.
zero_is_missing = [
    "Fluid intelligence score (Field ID: 20016)-2.0",
    "Specific cognitive ability-2.0",
    "Time to complete test-2.0",
    "Duration to complete numeric path (trail #1) (Field ID: 6348)-2.0",
    "Number of puzzles attempted-2.0",
    "Time to complete round (Field ID: 400)-2.1",
    "Time to complete round (Field ID: 400)-2.2",
    "Duration to complete alphanumeric path (trail #2) (Field ID: 6350)-2.0",
    "Number of attempts-2.0",
]
for column in zero_is_missing:
    cog_2_drop[column] = cog_2_drop[column].replace(0, np.nan)

# Code 9 in "Final attempt correct" means "abandoned"; treat it as missing too
final_attempt = "Final attempt correct-2.0"
cog_2_drop[final_attempt] = cog_2_drop[final_attempt].replace(9, np.nan)

In [None]:
# Keep only rows without any missing value
cog_2_drop_na = cog_2_drop.dropna()

# Sanity checks: remaining NaN counts (sorted) and negative-value counts per column
remaining_na = cog_2_drop_na.isna().sum()
print(remaining_na.sort_values())
remaining_negative = (cog_2_drop_na < 0).sum()
print(remaining_negative)

# Save the complete-case data
cog_2_drop_na.to_csv('/Cog-Ment/CSVs/cog_2_no_na_instance_2.csv')

In [None]:
# Short display names for the cognitive measures, grouped by the task prefix of the new name
short_names = {
    # Reaction time
    'Mean time to correctly identify matches-2.0': "RT:Mean time to correctly identify matches",
    # Fluid intelligence
    'Fluid intelligence score (Field ID: 20016)-2.0': "FIS",
    'Number of fluid intelligence questions attempted within time limit-2.0': "FI:Numb of FI questions attempted within time limit",
    # Numeric memory
    'Maximum digits remembered correctly (Field ID: 4282)-2.0': "Numeric memory:Max digits remembered correctly",
    'Time to complete test-2.0': "Numeric memory:Time to complete test",
    # Trail making
    'Duration to complete numeric path (trail #1) (Field ID: 6348)-2.0': "TMT:Duration to complete numeric path #1",
    'Duration to complete alphanumeric path (trail #2) (Field ID: 6350)-2.0': "TMT:Duration to complete alphanumeric path #2",
    'Total errors traversing numeric path (trail #1) (Field ID: 6349)-2.0': "TMT:Total errors traversing numeric path #1",
    'Total errors traversing alphanumeric path (trail #2) (Field ID: 6351)-2.0': "TMT:Total errors traversing alphanumeric path #2",
    # Symbol digit substitution
    'Number of symbol digit matches attempted (Field ID: 23323)-2.0': "SDS:Numb of symbol digit matches attempted",
    'Number of symbol digit matches made correctly (Field ID: 23324)-2.0': "SDS:Numb of symbol digit matches made correctly",
    # Paired associate learning
    'Number of word pairs correctly associated-2.0': "PAL:Numb of word pairs correctly associated",
    # Tower rearranging
    'Number of puzzles attempted-2.0': "Tower rearranging:Numb of puzzles attempted",
    'Number of puzzles correct-2.0': "Tower rearranging:Numb of puzzles correct",
    # Prospective memory
    'PM: initial answer-2.0': "Prosp memory:Initial answer",
    'Final attempt correct-2.0': "Prosp memory:Final attempt correct",
    'Number of attempts-2.0': "Prosp memory:Numb of attempts",
    'Prospective memory result-2.0': "Prospective memory result",
    # Matrix pattern completion
    'Number of puzzles viewed-2.0': "Matrix PC:Numb of puzzles viewed",
    'Number of puzzles correctly solved-2.0': "Matrix PC: Numb of puzzles correctly solved",
    # Pairs matching
    'Time to complete round (Field ID: 400)-2.1': "Pairs match:Time to complete round 1",
    'Time to complete round (Field ID: 400)-2.2': "Pairs match:Time to complete round 2",
    'Number of incorrect matches in round (Field ID: 399)-2.1': "Pairs match:Numb of incorrect matches in round 1",
    'Number of incorrect matches in round (Field ID: 399)-2.2': "Pairs match:Numb of incorrect matches in round 2",
    # Picture vocabulary
    'Specific cognitive ability-2.0': "Picture vocab:Specific cognitive ability",
    'Response delay interval-2.0': "Picture vocab:Response delay interval",
}

# Apply the new names to the complete-case frame
cog2_rename = cog_2_drop_na.rename(columns=short_names)

# Participant id plus the twelve cognitive measures kept as targets
columns_of_interest = [
    'eid',
    'RT:Mean time to correctly identify matches',
    'FIS',
    'Numeric memory:Max digits remembered correctly',
    'TMT:Duration to complete numeric path #1',
    'TMT:Duration to complete alphanumeric path #2',
    'SDS:Numb of symbol digit matches made correctly',
    'PAL:Numb of word pairs correctly associated',
    'Tower rearranging:Numb of puzzles correct',
    'Prosp memory:Initial answer',
    'Matrix PC: Numb of puzzles correctly solved',
    'Pairs match:Numb of incorrect matches in round 2',
    'Picture vocab:Specific cognitive ability',
]
cog2_twelve = cog2_rename[columns_of_interest]

# Zero counts per selected column
print((cog2_twelve == 0).sum())

# Write the same table (without the index) to both output locations
output_paths = (
    '/Cog-Ment/CSVs/cog2_twelve.csv',
    '/Cog-Ment/R/g_factor_5_folds/target.csv',
)
for out_path in output_paths:
    cog2_twelve.to_csv(out_path, index=False)