In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Steps for preprocessing:
# 1. Drop columns with more than 50% missing values
# 2. Replace NHANES special codes (7/77/777, 9/99/999) with NaN
# 3. Binary variables: convert to binary Yes/No (1/2) to (1/0)
#    For example, DBQ930 - Main meal planner/preparer (1=Yes, 2=No, 7=Refused, 9=Don't know)
# 4. Categorical variables:
#    - Ordinal variables, e.g. FNQ410 - Food security status: 1=Full food security, 2=Marginal food security, 3=Low food security, 4=Very low food security
#       - Preserve ordinality
#       - Convert to 0-indexed (0-3)
# 5. Continous variables:
#    - Frequency variables, e.g. ALQ121 - Alcohol frequency past 12 months), 0=Never in the past year, 1=Every day, 2=5-6 days/week,...
#      Convert to days per year for continuous analysis
        # alq121_mapping = {
        #     0: 0,      # Never
        #     1: 365,    # Every day
        #     2: 286,    # 5-6 days/week → ~5.5*52
        #     3: 182,    # 3-4 days/week → ~3.5*52
        #     4: 104,    # 2 days/week
        #     5: 78,     # 1 day/week
        #     6: 52,     # 2-3 days/month
        #     7: 36,     # 1 day/month
        #     8: 24,     # 7-11 times/year
        #     9: 15,     # 3-6 times/year
        #     10: 6      # 1-2 times/year
        # }
        # df['ALQ121_days'] = df['ALQ121'].map(alq121_mapping)
#   - Numerical measurements (WHD010=height, WHD020=weight, WHD050=desired weight): 
#     - Check for outliers and reasonable ranges
#     - Standardize units if necessary
#     - Create new features, e.g., BMI from height and weight
# 6. Target variable: DSM-V based depression diagnosis

# Missing values?
# Combining features?
# Encode categorical variables?

### Load dataset

In [3]:
df_depression_data = pd.read_csv('processed_data/depression_data.csv')
numerical_cols = df_depression_data.select_dtypes(include=['number', 'float64', 'int64', 'datetime']).columns.tolist()
categorical_cols = df_depression_data.select_dtypes(include=['category', 'object']).columns.tolist()

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

Numerical columns: ['SEQN', 'DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100', 'ACD010A', 'ACD010B', 'ACD010C', 'ACD040', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ270', 'ALQ280', 'ALQ151', 'ALQ170', 'BPQ020', 'BPQ030', 'BPQ150', 'BPQ080', 'BPQ101D', 'DBQ010', 'DBD030', 'DBD041', 'DBD050', 'DBD055', 'DBD061', 'DBQ073A', 'DBQ073B', 'DBQ073C', 'DBQ073D', 'DBQ073E', 'DBQ073U', 'DBQ301', 'DBQ330', 'DBQ360', 'DBQ370', 'DBD381', 'DBQ390', 'DBQ400', 'DBD411', 'DBQ421', 'DBQ424', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DID040', 'DIQ160', 'DIQ180', 'DIQ050', 'DID060', 'DIQ060U', 'DIQ070', 'FNQ021', 'FNQ041', 'FNQ050', 'FNQ060', 'FNQ080', 'FNQ160', 'FNQ100', 'FNQ110', 'FNQ120', 'FNQ170', 'FNQ180', 'FNQ190', 'FNQ130', 'FNQ200', 'FNQ140', 'FNQ150', 'FNDCDI', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNQ530', 'FNQ540', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSD041

### Handle columns with missing values
Replace NHANES codes for refuse/missing wih NaN and drop columns with >45% missing values.

In [None]:
def replace_nhanes_special_codes(df, exclude_cols):
    """
    Replace NHANES special codes with NaN.
    - 7, 77, 777, 7777, 77777 = Refused
    - 9, 99, 999, 9999, 99999, 55555 = Don't know / Missing
    """
    # Define special codes to replace
    refused_codes = [7, 77, 777, 7777, 77777]
    dont_know_codes = [9, 99, 999, 9999, 99999, 55555]
    special_codes = refused_codes + dont_know_codes
    
    replaced_count = 0
    for col in df.select_dtypes(include=[np.number]).columns:
        if col in exclude_cols:
            continue
        # Replace standard special codes
        mask = df[col].isin(special_codes)
        replaced_count += mask.sum()
        df.loc[mask, col] = np.nan
    
    print(f"Replaced {replaced_count} special code values with NaN")
    return df

In [5]:
def drop_high_missing_columns(df, threshold=50):
    """Drop columns with missing percentage above threshold."""
    missing_pct = df.isna().sum() / len(df) * 100
    cols_to_drop = missing_pct[missing_pct > threshold].index.tolist()
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped {len(cols_to_drop)} columns with >{threshold}% missing values")
    print(f"Shape after dropping high-missing columns: {df.shape}")
    return df

In [None]:
# replace NHANES special codes with NaN
exclude_columns = ['SEQN', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ270', 'ALQ280', 'ALQ170', 'HOD051', 'PAD790Q',
                   'PAD800', 'PAD810Q', 'PAD680', 'RHQ010', 'SLD012', 'SLD013', 'WHD010', 'WHD020', 'WHD050']
df_depression_data = replace_nhanes_special_codes(df_depression_data, exclude_cols=exclude_columns)

# drop columns with more than 50% missing values
df_depression_data = drop_high_missing_columns(df_depression_data, threshold=45)

# drop id column, unnecessary flag column and target leakage columns
df_depression_data.drop(columns=['SEQN', 'SMAQUEX2', 'FNQ530', 'FNQ540'], inplace=True, errors='ignore')
df_depression_data

Replaced 8750 special code values with NaN
Dropped 146 columns with >45% missing values
Shape after dropping high-missing columns: (4167, 119)


Unnamed: 0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,...,SMQ681,SMQ846,SMQ851,SMQ863,SMDANY,SMQ020,WHD010,WHD020,WHD050,WHQ070
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,70.0,220.0,220.0,2.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,60.0,150.0,165.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,68.0,200.0,180.0,2.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,2.0,...,1.0,1.0,2.0,2.0,1.0,1.0,69.0,220.0,265.0,2.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,61.0,228.0,235.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,67.0,204.0,234.0,2.0
4163,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,67.0,245.0,290.0,1.0
4164,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,1.0,62.0,169.0,172.0,1.0
4165,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.0,2.0,2.0,2.0,2.0,1.0,65.0,180.0,180.0,1.0


In [7]:
numerical_cols = df_depression_data.select_dtypes(include=['number', 'float64', 'int64', 'datetime']).columns.tolist()
categorical_cols = df_depression_data.select_dtypes(include=['category', 'object']).columns.tolist()

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

Numerical columns: ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100', 'ACD010A', 'ALQ111', 'ALQ121', 'ALQ130', 'ALQ142', 'ALQ151', 'BPQ020', 'BPQ080', 'BPQ101D', 'DBQ930', 'DBQ935', 'DBQ940', 'DBQ945', 'DIQ010', 'DIQ160', 'DIQ180', 'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 'FNQ470', 'FNQ480', 'FNQ490', 'FNQ510', 'FNQ520', 'FNDADI', 'FNDAEDI', 'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD', 'FSD151', 'FSQ165', 'FSD162', 'HIQ011', 'HIQ210', 'HOD051', 'HSQ590', 'HUQ010', 'HUQ030', 'HUQ042', 'HUQ055', 'HUQ090', 'INDFMMPI', 'INDFMMPC', 'INQ300', 'KIQ022', 'KIQ005', 'KIQ042', 'KIQ044', 'KIQ481', 'MCQ010', 'AGQ030', 'MCQ053', 'MCQ160A', 'MCQ160B', 'MCQ160C', 'MCQ160D', 'MCQ160E', 'MCQ160F', 'MCQ160M', 'MCQ160P', 'MCQ160L', 'MCQ550', 'MCQ560', 'MCQ220', 'OSQ230', 'OCD150', 'OHQ845', 'OHQ620', 'OHQ630', 'OHQ640', 'OHQ660', 'OHQ670', 'OHQ680', 'PAD790Q', 'PAD800', 'PAD810Q', 'PAD680', 'RHQ010', 'RHQ031', 'RHD280', 'RXQ510', 'RXQ033', 'RXQ050', 

### Feature Engineering

In [8]:
def calculate_bmi(df):
    """Calculate BMI from height (WHD010) in inches and weight (WHD020) in pounds."""
    # Calculate BMI from height and weight
    # WHD010 is in inches, WHD020 is in pounds
    df['BMI'] = (df['WHD020'] / (df['WHD010'] ** 2)) * 703

    # BMI categories
    df['BMI_category'] = pd.cut(df['BMI'], 
                                bins=[0, 18.5, 25, 30, float('inf')],
                                labels=['underweight', 'normal', 'overweight', 'obese'])

    df.drop(columns=['WHD010', 'WHD020'], inplace=True)
    return df

In [9]:
def calculate_total_alcohol_consumption(df):
    """Calculate total annual alcohol consumption estimate."""
    # Convert ALQ121 to days per year
    alq121_to_days = {
        0: 0,      # Never
        1: 365,    # Every day (original code 1)
        2: 286,    # Nearly every day
        3: 182,    # 3-4 times/week
        4: 104,    # 2 times/week  
        5: 52,     # Once/week
        6: 30,     # 2-3 times/month
        7: 12,     # Once/month
        8: 9,      # 7-11 times/year
        9: 4.5,    # 3-6 times/year
        10: 1.5    # 1-2 times/year
    }
    df['alcohol_days_per_year'] = df['ALQ121'].map(alq121_to_days)

    # Total alcohol consumption estimate
    df['annual_drinks'] = df['alcohol_days_per_year'] * df['ALQ130']
    
    df.drop(columns=['ALQ121', 'ALQ130'], inplace=True)
    return df

In [10]:
def convert_time_columns(df):
    """Convert time strings (HH:MM) to decimal hours."""
    time_cols = ['SLQ300', 'SLQ310', 'SLQ320', 'SLQ330']
    
    def time_to_hours(time_str):
        if pd.isna(time_str):
            return np.nan
        try:
            h, m = time_str.split(':')
            return int(h) + int(m) / 60
        except:
            return np.nan
    
    for col in time_cols:
        if col in df.columns:
            df[col] = df[col].apply(time_to_hours)
    return df

### Special ordinal cases
- Some features (e.g., DIQ010) are ordinal but unordered: 1: Yes, 2: No, 3: Borderline
- Some features are in reverse order (1: every day, 2: nearly every day, etc.), when it should be higher number = higher frequency/severity
- Most ordinal features have 1: Yes, 2: No. We need to convert 2 to 0 for proper encoding

In [11]:
def remap_ordinal_features(df):
    # for some ordinal features, higher value = less frequent/severe => should be reversed
    remappings = {
        # FNQ510: 1=daily → 5=never (should be: higher = more frequent)
        'FNQ510': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},  # daily→4, weekly→3, monthly→2, few times→1, never→0
        
        # OHQ620-OHQ680: 1=very often → 5=never (should be: higher = more problems)
        'OHQ620': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        'OHQ630': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        'OHQ640': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        'OHQ660': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        'OHQ670': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        'OHQ680': {1: 4, 2: 3, 3: 2, 4: 1, 5: 0},
        
        # FSD032A-C: 1=often true → 3=never true (should be: higher = more food insecurity)
        'FSD032A': {1: 2, 2: 1, 3: 0},
        'FSD032B': {1: 2, 2: 1, 3: 0},
        'FSD032C': {1: 2, 2: 1, 3: 0},

        # ALQ121, ALQ142: 0=Never, 1=Every day → 10=1-2 times a year (should be: higher = more frequent)
        # 'ALQ121': {0: 0, 10: 1, 9: 2, 8: 3, 7: 4, 6: 5, 5: 6, 4: 7, 3: 8, 2: 9, 1: 10},
        'ALQ142': {0: 0, 10: 1, 9: 2, 8: 3, 7: 4, 6: 5, 5: 6, 4: 7, 3: 8, 2: 9, 1: 10},

        # DIQ010: 1=Yes, 2=No, 3=Borderline
        # Correct order: No(0) → Borderline(1) → Yes(2)
        'DIQ010': {2: 0, 3: 1, 1: 2},  
        
        # FNQ520: 1=a little, 2=a lot, 3=somewhere in between
        # Correct order: little(0) → between(1) → a lot(2)
        'FNQ520': {1: 0, 3: 1, 2: 2},
    }

    for col, mapping in remappings.items():
        if col in df.columns:
            df[col] = df[col].map(mapping)
            
    return df

In [12]:
def convert_binary_columns(df, binary_cols):
    """Convert binary columns from (1,2) to (1,0)."""
    for col in binary_cols:
        if col in df.columns:
            df[col] = df[col].replace({2: 0, 2.0: 0})
    return df

Shift ordinal columns to 0-indexed.

In [13]:
def shift_to_zero_indexed(df, columns):
    """Shift ordinal columns starting at 1 to start at 0."""
    for col in columns:
        if col in df.columns:
            min_val = df[col].dropna().min()
            if min_val >= 1:
                df[col] = df[col] - min_val
    return df

### Identify categorical and numerical columns

In [14]:
TARGET_COLS = ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 'DPQ090', 'DPQ100']

In [15]:
def identify_binary_columns(df, exclude_cols):
    """Identify binary columns (values 1 and 2 only)."""
    binary_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        if col in exclude_cols:
            continue
        unique_vals = set(df[col].dropna().unique())
        if unique_vals and unique_vals.issubset({1, 2, 1.0, 2.0}):
            binary_cols.append(col)
    return binary_cols

In [16]:
def get_known_categorical_columns():
    """Identify known categorical columns: ordinal, nominal."""
    # Known ordinal and nominal columns based on NHANES codebooks
    ordinal_cols = [
        'HUQ010',
        'FNQ410', 'FNQ430', 'FNQ440', 'FNQ450', 'FNQ460', 
        'FNQ470', 'FNQ480', 'FNQ490',
        'FNQ510', 'FNQ520',
        'FSD032A', 'FSD032B', 'FSD032C', 'FSDAD',
        'ALQ142',
        'OHQ845', 'OHQ620', 'OHQ630', 'OHQ640', 'OHQ660', 'OHQ670', 'OHQ680',
        'DIQ010', 'INDFMMPC', 'KIQ005', 'SMD460',
    ]
    
    nominal_cols = [
        'HUQ042', 'OCD150', 'HUQ030',
    ]

    return ordinal_cols, nominal_cols

In [17]:
def identify_column_types(df, target_cols):
    """
    Separate columns into ordinal, nominal, binary, object (categorical) and numerical types.
    """
    exclude_cols = set(target_cols)
    object_cols = df.select_dtypes(include='object').columns.tolist()
    exclude_cols.update(object_cols)

    # Get known ordinal and nominal columns
    ordinal_cols, nominal_cols = get_known_categorical_columns()

    ordinal_cols = [col for col in ordinal_cols if col in df.columns]
    nominal_cols = [col for col in nominal_cols if col in df.columns]

    # Identify binary columns (Yes=1, No=2)
    binary_cols = identify_binary_columns(df, exclude_cols)
    
    # Combine all known categorical columns
    known_categorical = set(ordinal_cols + nominal_cols + binary_cols)

    # Build numerical list
    numerical_cols = []
    for col in df.select_dtypes(include=[np.number]).columns:
        if col not in known_categorical and col not in exclude_cols:
            numerical_cols.append(col)
    
    return ordinal_cols, nominal_cols, binary_cols, numerical_cols, object_cols

### Preprocessing pipeline

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [19]:
# Identify binary and categorical columns
binary_cols = identify_binary_columns(df_depression_data, exclude_cols=TARGET_COLS)
ordinal_cols, nominal_cols = get_known_categorical_columns()

In [20]:
df_depression_data = convert_time_columns(df_depression_data)
df_depression_data = calculate_bmi(df_depression_data)
df_depression_data = calculate_total_alcohol_consumption(df_depression_data)
df_depression_data = remap_ordinal_features(df_depression_data)
df_depression_data = convert_binary_columns(df_depression_data, binary_cols)
df_depression_data = shift_to_zero_indexed(df_depression_data, ordinal_cols)

In [21]:
# Identify column types
(ordinal_cols, nominal_cols, binary_cols, numerical_cols, object_cols) = identify_column_types(
    df_depression_data,
    target_cols=TARGET_COLS
    )

In [22]:
if 'BMI_category' in df_depression_data.columns:
    df_depression_data['BMI_category'] = df_depression_data['BMI_category'].astype(str)
    nominal_cols.append('BMI_category')

In [23]:
print(f"Ordinal columns: {len(ordinal_cols)}")
print(f"Nominal columns: {len(nominal_cols)}")
print(f"Binary columns: {len(binary_cols)}")
print(f"Numerical columns: {len(numerical_cols)}")
print(f"Object columns (excluded): {len(object_cols)}")
print(f"Total columns identified: {len(ordinal_cols) + len(nominal_cols) + len(binary_cols) + len(numerical_cols) + len(object_cols)}")

Ordinal columns: 27
Nominal columns: 4
Binary columns: 1
Numerical columns: 71
Object columns (excluded): 2
Total columns identified: 105


In [28]:
object_cols

['PAD790U', 'PAD810U']

In [24]:
df_depression_data.to_csv('processed_data/depression_data_preprocessed.csv', index=False)
df_depression_data.head(30)

Unnamed: 0,DPQ010,DPQ020,DPQ030,DPQ040,DPQ050,DPQ060,DPQ070,DPQ080,DPQ090,DPQ100,...,SMQ851,SMQ863,SMDANY,SMQ020,WHD050,WHQ070,BMI,BMI_category,alcohol_days_per_year,annual_drinks
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,220.0,0.0,31.563265,obese,286.0,858.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,165.0,1.0,29.291667,overweight,1.5,1.5
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,180.0,0.0,30.406574,obese,104.0,208.0
3,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,1.0,2.0,...,0.0,0.0,1.0,1.0,265.0,0.0,32.484772,obese,,
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,235.0,1.0,43.075517,obese,9.0,9.0
5,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,320.0,1.0,44.045102,obese,182.0,364.0
6,1.0,1.0,2.0,3.0,3.0,1.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,280.0,0.0,49.594356,obese,1.5,1.5
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,143.0,0.0,26.152185,overweight,182.0,182.0
8,1.0,1.0,1.0,1.0,3.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,285.0,1.0,50.47997,obese,9.0,18.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,160.0,0.0,22.955102,normal,182.0,364.0


#### Basic preprocessing
Only imputes missing values and encodes categorical variables.

In [25]:
# Basic pipeline
basic_preprocessor = ColumnTransformer(transformers=[
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), numerical_cols),
    
    ('ord', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]), ordinal_cols),
    
    ('nom', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ]), nominal_cols),
    
    ('bin', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]), binary_cols),
])

In [26]:
target_cols = ['DPQ010', 'DPQ020', 'DPQ030', 'DPQ040', 
               'DPQ050', 'DPQ060', 'DPQ070', 'DPQ080', 
               'DPQ090', 'DPQ100']
y = df_depression_data[target_cols].copy()
X = df_depression_data.drop(columns=target_cols)

In [27]:
basic_preprocessor.fit_transform(X)



array([[ 0.32766011, -0.46735538,  1.31895602, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.32766011, -0.46735538, -0.7581754 , ...,  1.        ,
         0.        ,  1.        ],
       [ 0.32766011, -0.46735538, -0.7581754 , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.32766011, -0.46735538, -0.7581754 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.32766011, -0.46735538,  1.31895602, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.32766011, -0.46735538,  1.31895602, ...,  1.        ,
         0.        ,  1.        ]], shape=(4167, 109))