In [None]:

# import pandas as pd

# # Load the dataset
# file_path = 'your_dataset.csv'
# data = pd.read_csv(file_path, delimiter='\t')

# # Replace missing values with a specific marker, such as 'NN'
# data_filled = data.fillna('NN')

# # Select SNP columns for encoding
# snp_columns = data.columns[1:-2]  # assuming the last two columns are gender and superpopulation

# # One-Hot Encode the SNP columns
# X_encoded = pd.get_dummies(data_filled[snp_columns], drop_first=True)

# # Add the non-SNP columns back to the encoded DataFrame
# clean_data_encoded = pd.concat([data[['id', 'gender', 'superpopulation']], X_encoded], axis=1)

# # Display the first few rows to verify the changes
# print(clean_data_encoded.head())

# # Estimate the size of the encoded DataFrame
# print(f'Estimated number of columns after one-hot encoding: {clean_data_encoded.shape[1]}')
# print(f'Estimated dataset size: {clean_data_encoded.memory_usage(deep=True).sum() / 1024:.2f} KB')


In [None]:
# Load the data, one-hot encode the SNP columns, run a 2-component PCA, then plot the result in 2D

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (the file is read as tab-separated, whatever its extension says)
file_path = 'your_dataset.csv'
data = pd.read_csv(file_path, delimiter='\t')

# Metadata columns that must never be one-hot encoded; every other column is
# treated as a SNP genotype column.
META_COLUMNS = ['id', 'gender', 'superpopulation']

# Replace missing values with the 'NN' marker so get_dummies sees them as an
# ordinary category level rather than as NaN.
data_filled = data.fillna('NN')

# Select SNP columns by *name* instead of by position: a positional slice would
# silently encode a metadata column if the file's column order ever changed.
# Index.drop raises KeyError if any of the metadata columns is missing.
snp_columns = data.columns.drop(META_COLUMNS)
if snp_columns.empty:
    raise ValueError(f'No SNP columns found in {file_path!r} besides {META_COLUMNS}')

# One-Hot Encode the SNP columns (drop_first removes one redundant level per SNP)
X_encoded = pd.get_dummies(data_filled[snp_columns], drop_first=True)

# Add the non-SNP columns back to the encoded DataFrame
clean_data_encoded = pd.concat([data[META_COLUMNS], X_encoded], axis=1)

# Drop the metadata columns again to obtain the purely numeric PCA input
X_pca = clean_data_encoded.drop(columns=META_COLUMNS)

# Bring every encoded feature onto a common scale before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

# Project onto the first two principal components (enough for a flat scatter plot)
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Tabulate the projection and attach each sample's superpopulation label
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2']).assign(
    superpopulation=data['superpopulation']
)

# 2D scatter of PC1 vs PC2, one colour per superpopulation
fig_2d, ax_2d = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='superpopulation',
    palette='tab10',
    ax=ax_2d,
)
ax_2d.set_title('PCA of Genetic Data')
ax_2d.set_xlabel('Principal Component 1')
ax_2d.set_ylabel('Principal Component 2')
ax_2d.legend(title='Superpopulation')
plt.show()


In [None]:
# Create a new PCA with 3 components, then 3d plot

# import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Re-fit PCA on the already standardized matrix, this time keeping three components
num_components = 3
pca = PCA(n_components=num_components)
principal_components = pca.fit_transform(X_scaled)

# Name the component columns PC1..PCn and attach each sample's superpopulation label
component_names = [f'PC{k}' for k in range(1, num_components + 1)]
pca_df = pd.DataFrame(principal_components, columns=component_names).assign(
    superpopulation=data['superpopulation']
)

# 3D scatter plot of the first three principal components
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Colour points by superpopulation. The labels are categorical, so they are
# converted to integer codes for the colormap; a code of -1 marks a missing label.
superpop = pca_df['superpopulation'].astype('category')
superpop_codes = superpop.cat.codes
sc = ax.scatter(
    pca_df['PC1'], pca_df['PC2'], pca_df['PC3'],
    c=superpop_codes, cmap='tab10',
)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA of Genetic Data')

# A continuous colorbar over category codes would only show meaningless numbers.
# Build a discrete legend instead: one handle per distinct code actually plotted
# (num=None), labelled with the matching category name. Both sequences are in
# ascending code order, so handles and labels line up.
handles, _ = sc.legend_elements(num=None)
present_codes = sorted(superpop_codes.unique())
legend_labels = [
    str(superpop.cat.categories[code]) if code >= 0 else 'Unknown'
    for code in present_codes
]
ax.legend(handles, legend_labels, title='Superpopulation')
plt.show()