In [1]:
# Multiple Correspondence Analysis (MCA) for identification of discriminitive SNPs

In [6]:
# Install prince library
!pip install prince



In [88]:
import pandas as pd
import prince
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = './Resources/kidd_train.csv'
data = pd.read_csv(file_path, delimiter=',')

# Replace missing values with 'NN'
data_filled = data.fillna('NN')

# Define non-SNP columns by name
non_snp_columns = ['id']

# Select SNP columns
snp_columns = [col for col in data.columns if col not in non_snp_columns]

# One-Hot Encode the SNP columns
X_encoded = pd.get_dummies(data_filled[snp_columns], drop_first=True)

# Convert boolean values to numeric
X_encoded_numeric = X_encoded.astype(int)

# Apply MCA
mca = prince.MCA(n_components=2)
mca.fit(X_encoded)  # Use the original X_encoded for fitting

# Reindex columns for alignment
X_encoded_numeric_aligned = X_encoded_numeric.reindex(columns=X_encoded.columns, fill_value=0)

# Transform the data
mca_transformed = mca.transform(X_encoded_numeric_aligned)

# Create a DataFrame with the MCA components
mca_df = pd.DataFrame(mca_transformed, columns=['Dimension 1', 'Dimension 2'])

# Ensure alignment of the 'id' column
mca_df['id'] = data['id'].values

# Check DataFrame content
print(mca_df.head())

# Assuming mca_transformed is a DataFrame with the MCA results
plt.figure(figsize=(10, 6))
plt.scatter(mca_transformed[0], mca_transformed[1], alpha=0.5)
plt.xlabel('MCA Component 1')
plt.ylabel('MCA Component 2')
plt.title('MCA Plot')
plt.show()


KeyError: "None of [Index(['rs3737576_CT_False', 'rs3737576_CT_True', 'rs3737576_TC_False',\n       'rs3737576_TC_True', 'rs3737576_TT_False', 'rs3737576_TT_True',\n       'rs7554936_CT_False', 'rs7554936_CT_True', 'rs7554936_TC_False',\n       'rs7554936_TC_True',\n       ...\n       'superpopulation_AMR_False', 'superpopulation_AMR_True',\n       'superpopulation_EAS_False', 'superpopulation_EAS_True',\n       'superpopulation_EUR_False', 'superpopulation_EUR_True',\n       'superpopulation_SAS_False', 'superpopulation_SAS_True',\n       'gender_male_False', 'gender_male_True'],\n      dtype='object', length=340)] are in the [columns]"

In [75]:
# Print the columns used for fitting
print("Columns used for fitting:", X_encoded.columns.tolist())

Columns used for fitting: ['rs3737576_CT', 'rs3737576_NN', 'rs3737576_TC', 'rs3737576_TT', 'rs7554936_CT', 'rs7554936_NN', 'rs7554936_TC', 'rs7554936_TT', 'rs2814778_CT', 'rs2814778_NN', 'rs2814778_TC', 'rs2814778_TT', 'rs798443_AG', 'rs798443_GA', 'rs798443_GG', 'rs798443_NN', 'rs1876482_AG', 'rs1876482_GA', 'rs1876482_GG', 'rs1876482_NN', 'rs1834619_AG', 'rs1834619_GA', 'rs1834619_GG', 'rs1834619_NN', 'rs3827760_AG', 'rs3827760_GA', 'rs3827760_GG', 'rs3827760_NN', 'rs260690_AC', 'rs260690_CA', 'rs260690_CC', 'rs260690_NN', 'rs6754311_CT', 'rs6754311_NN', 'rs6754311_TC', 'rs6754311_TT', 'rs10497191_CT', 'rs10497191_NN', 'rs10497191_TC', 'rs10497191_TT', 'rs12498138_AG', 'rs12498138_GA', 'rs12498138_GG', 'rs12498138_NN', 'rs4833103_AC', 'rs4833103_CA', 'rs4833103_CC', 'rs4833103_NN', 'rs1229984_CT', 'rs1229984_NN', 'rs1229984_TC', 'rs1229984_TT', 'rs3811801_AG', 'rs3811801_GA', 'rs3811801_GG', 'rs3811801_NN', 'rs7657799_GT', 'rs7657799_NN', 'rs7657799_TG', 'rs7657799_TT', 'rs870347_AC'

In [76]:
# Print the columns used for transformation
print("Columns used for transformation:", X_encoded_numeric.columns.tolist())

Columns used for transformation: ['rs3737576_CT', 'rs3737576_NN', 'rs3737576_TC', 'rs3737576_TT', 'rs7554936_CT', 'rs7554936_NN', 'rs7554936_TC', 'rs7554936_TT', 'rs2814778_CT', 'rs2814778_NN', 'rs2814778_TC', 'rs2814778_TT', 'rs798443_AG', 'rs798443_GA', 'rs798443_GG', 'rs798443_NN', 'rs1876482_AG', 'rs1876482_GA', 'rs1876482_GG', 'rs1876482_NN', 'rs1834619_AG', 'rs1834619_GA', 'rs1834619_GG', 'rs1834619_NN', 'rs3827760_AG', 'rs3827760_GA', 'rs3827760_GG', 'rs3827760_NN', 'rs260690_AC', 'rs260690_CA', 'rs260690_CC', 'rs260690_NN', 'rs6754311_CT', 'rs6754311_NN', 'rs6754311_TC', 'rs6754311_TT', 'rs10497191_CT', 'rs10497191_NN', 'rs10497191_TC', 'rs10497191_TT', 'rs12498138_AG', 'rs12498138_GA', 'rs12498138_GG', 'rs12498138_NN', 'rs4833103_AC', 'rs4833103_CA', 'rs4833103_CC', 'rs4833103_NN', 'rs1229984_CT', 'rs1229984_NN', 'rs1229984_TC', 'rs1229984_TT', 'rs3811801_AG', 'rs3811801_GA', 'rs3811801_GG', 'rs3811801_NN', 'rs7657799_GT', 'rs7657799_NN', 'rs7657799_TG', 'rs7657799_TT', 'rs870

In [77]:
# Print columns for debugging
print("Fitted columns:")
print(X_encoded.columns)

print("Transformation columns:")
print(X_encoded_numeric.columns)

# Check for any missing columns
missing_cols = set(X_encoded.columns) - set(X_encoded_numeric.columns)
if missing_cols:
    print(f"Missing columns: {missing_cols}")
else:
    print("All columns are present.")

Fitted columns:
Index(['rs3737576_CT', 'rs3737576_NN', 'rs3737576_TC', 'rs3737576_TT',
       'rs7554936_CT', 'rs7554936_NN', 'rs7554936_TC', 'rs7554936_TT',
       'rs2814778_CT', 'rs2814778_NN',
       ...
       'rs2024566_AG', 'rs2024566_GA', 'rs2024566_GG', 'rs2024566_NN',
       'gender_male', 'superpopulation_AMR', 'superpopulation_EAS',
       'superpopulation_EUR', 'superpopulation_NN', 'superpopulation_SAS'],
      dtype='object', length=226)
Transformation columns:
Index(['rs3737576_CT', 'rs3737576_NN', 'rs3737576_TC', 'rs3737576_TT',
       'rs7554936_CT', 'rs7554936_NN', 'rs7554936_TC', 'rs7554936_TT',
       'rs2814778_CT', 'rs2814778_NN',
       ...
       'rs2024566_AG', 'rs2024566_GA', 'rs2024566_GG', 'rs2024566_NN',
       'gender_male', 'superpopulation_AMR', 'superpopulation_EAS',
       'superpopulation_EUR', 'superpopulation_NN', 'superpopulation_SAS'],
      dtype='object', length=226)
All columns are present.


In [79]:
# Sort columns to ensure the order matches
X_encoded_sorted = X_encoded.sort_index(axis=1)
X_encoded_numeric_aligned_sorted = X_encoded_numeric_aligned.sort_index(axis=1)

# Verify that the order is correct
assert X_encoded_sorted.columns.equals(X_encoded_numeric_aligned_sorted.columns), "Column order does not match"


In [81]:
print(mca_transformed.head())

          0         1
0 -0.163226  0.062070
1  0.450059  0.032034
2  0.402048 -0.019612
3  0.508730  0.023215
4  0.504717  0.046665


In [82]:
X_encoded_numeric_aligned.head()


Unnamed: 0,rs3737576_CT,rs3737576_NN,rs3737576_TC,rs3737576_TT,rs7554936_CT,rs7554936_NN,rs7554936_TC,rs7554936_TT,rs2814778_CT,rs2814778_NN,...,rs2024566_AG,rs2024566_GA,rs2024566_GG,rs2024566_NN,gender_male,superpopulation_AMR,superpopulation_EAS,superpopulation_EUR,superpopulation_NN,superpopulation_SAS
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
