In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Loading and Exploring the Dataset

In [2]:
def load_data(file_path):
    """
    Loads the data from a CSV file and performs an initial exploration.

    Prints the shape, the first five rows, the ``DataFrame.info()`` summary
    and the descriptive statistics of the loaded frame.

    Args:
    file_path (str): Path to the CSV file containing the data.

    Returns:
    pd.DataFrame: DataFrame with the loaded data.
    """
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk, so a column that mixes numeric-looking
    # and text values is parsed consistently rather than ending up with mixed
    # Python types (which is what pandas' DtypeWarning reports).
    data = pd.read_csv(file_path, low_memory=False)
    print("Dataset dimensions:", data.shape)
    print("\nFirst 5 rows of the dataset:")
    print(data.head())
    print("\nDataset information:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would append a stray "None" line to the output.
    data.info()
    print("\nDescriptive statistics:")
    print(data.describe())
    return data

## Reading genome.csv

In [4]:
# Forward slash: avoids the invalid "\g" escape sequence in the string literal
# and resolves the file next to the notebook on every operating system.
data = load_data('./genome.csv')

  data = pd.read_csv(file_path)


Dataset dimensions: (966977, 4)

First 5 rows of the dataset:
         rsid chromosome  position genotype
0   rs4477212          1     72017       AA
1   rs3094315          1    742429       AA
2   rs3131972          1    742584       GG
3  rs12124819          1    766409       AA
4  rs11240777          1    788822       AG

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 966977 entries, 0 to 966976
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   rsid        966977 non-null  object
 1   chromosome  966977 non-null  object
 2   position    966977 non-null  int64 
 3   genotype    966977 non-null  object
dtypes: int64(1), object(3)
memory usage: 29.5+ MB
None

Descriptive statistics:
           position
count  9.669770e+05
mean   7.672147e+07
std    5.640974e+07
min    3.000000e+00
25%    3.012131e+07
50%    6.667251e+07
75%    1.134776e+08
max    2.471856e+08


# Data Preprocessing
## Cleaning the data

In [None]:
def clean_data(data):
  """
  Cleans the data by handling missing values and verifying the distribution.

  Missing values are replaced, column by column, with that column's most
  frequent value (the first value returned by ``Series.mode`` when several
  values tie). The input frame is not modified. The returned frame keeps the
  original column dtypes and carries a fresh 0..n-1 index.

  The 'genotype' and 'phenotype' distributions are printed only when the
  corresponding column exists, so a frame without labels can still be cleaned.

  Args:
  data (pd.DataFrame): Original DataFrame.

  Returns:
  pd.DataFrame: Cleaned DataFrame.
  """

  print ("missing values for columns")
  missing_counts = data.isnull().sum()
  print(missing_counts)

  # Handle missing values.
  # reset_index returns a new frame, so the caller's DataFrame is left untouched.
  data_imputed = data.reset_index(drop=True)
  # Only columns that actually contain missing values need a mode computed;
  # imputing with pandas (rather than through a NumPy object array) keeps
  # numeric columns numeric.
  for column in missing_counts[missing_counts > 0].index:
    modes = data_imputed[column].mode(dropna=True)
    if modes.empty:
      # Column is entirely missing: there is no value to impute from.
      continue
    data_imputed[column] = data_imputed[column].fillna(modes.iloc[0])

  # Relative frequency of each category, for the columns that are present.
  for title, column in (("genotypes", "genotype"), ("Phenotypes", "phenotype")):
    if column not in data_imputed.columns:
      print(f"\nNo '{column}' column found; skipping its distribution.")
      continue
    print(f"\nDistribution of {title}:")
    print(data_imputed[column].value_counts(normalize=True))

  return data_imputed
# Run the cleaning step on the loaded frame; `data_clean` is the input of the preprocessing step.
data_clean = clean_data(data)


## Processing data

In [None]:
def preprocess_data(data):
  """
  Preprocesses the data, including one-hot encoding and scaling.

  'position' is standardised; 'chromosome' and 'genotype' are one-hot encoded
  (first category dropped). 'chromosome' is treated as categorical because it
  is a label, not a magnitude. Any other column of the input (e.g. 'rsid') is
  not listed in a transformer and is therefore dropped by the
  ColumnTransformer's default remainder handling.

  Args:
  data (pd.DataFrame): Cleaned DataFrame. Must contain the columns
    'phenotype', 'position', 'chromosome' and 'genotype'.

  Returns:
  tuple: X (features), y (target), preprocessor (ColumnTransformer)

  Raises:
  KeyError: If one of the required columns is missing from `data`.
  """
  target_column = 'phenotype'
  numeric_features = ['position']
  categorical_features = ['chromosome', 'genotype']

  # Fail early with an explicit message instead of an opaque KeyError from drop().
  required_columns = [target_column] + numeric_features + categorical_features
  missing_columns = [col for col in required_columns if col not in data.columns]
  if missing_columns:
    raise KeyError(
      f"preprocess_data requires the column(s) {missing_columns}, "
      f"but the DataFrame only has {list(data.columns)}"
    )

  # Separate features and target
  X = data.drop(target_column, axis=1)
  y = data[target_column]

  # Normalise dtypes before fitting:
  # - the encoder needs uniformly typed categories, so categorical columns
  #   are cast to str (missing values are expected to be imputed upstream,
  #   otherwise NaN becomes the literal category 'nan');
  # - numeric columns may arrive as object dtype and are converted back.
  X = X.assign(
    **{col: X[col].astype(str) for col in categorical_features},
    **{col: pd.to_numeric(X[col]) for col in numeric_features},
  )

  # scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
  # `sparse_output` and later removed the old name; support both.
  try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
  except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

  # Create preprocessor
  preprocessor = ColumnTransformer(
    transformers=[
      ('num', StandardScaler(), numeric_features),
      ('cat', encoder, categorical_features)
    ])

  # Fit and transform
  X_processed = preprocessor.fit_transform(X)

  # Get feature names after preprocessing (same order as the transformers above)
  feature_names = (numeric_features +
           preprocessor.named_transformers_['cat']
           .get_feature_names_out(categorical_features).tolist())

  # Keep the input index so the rows of X stay aligned with y.
  X_processed_df = pd.DataFrame(X_processed, columns=feature_names, index=X.index)

  print("Dimensions of X after preprocessing:", X_processed_df.shape)
  print("\nFirst 5 rows of preprocessed X:")
  print(X_processed_df.head())

  return X_processed_df, y, preprocessor

# Build the feature matrix, the target and the fitted ColumnTransformer from the cleaned frame.
X, y, preprocessor = preprocess_data(data_clean)

# Training and Testing Model