<div align = "center" style = "font-family:consolas;"> <h1> Alzheimer's  Dataset Exploratory Analysis </h1> </div>

<div align = "center"> <p style = "font-family: consolas"> This notebook conducts an exploratory analysis and preprocessing of the dataset: I will clean and standardize the data.</p> </div>

<ul><li><p style = "font-family: consolas"> Importing important libraries</p></li></ul>

In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

<ul><li><p style = "font-family: consolas"> Loading the data and describing it</p></li></ul>

In [None]:
# Read the dataset CSV (path relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='data/data.csv')

# Preview the first five rows as the cell's output.
df.head(n=5)

In [None]:
# Print pandas' summary of the frame (see `DataFrame.info`).
df.info()
# Descriptive statistics; as the cell's last expression this is what gets displayed.
df.describe()

<ul><li><p style = "font-family: consolas"> Check for missing values and duplicate data</p></li></ul>

In [None]:
# Number of null entries in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows flagged by `DataFrame.duplicated()` as repeats of an earlier row.
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

<p style = "font-family: consolas"> No missing values found</p>

<ul><li><p style = "font-family: consolas"> Visualize the data</p></li></ul>

In [None]:
# Draw histograms of the frame's columns with 30 bins each on a 12x8 inch figure.
df.hist(
    bins=30,
    figsize=(12, 8),
)
plt.show()

In [None]:
# Categorical columns to visualise, one count plot per column.
categorical_columns = ["Country", "Gender", "Physical Activity Level", 
                       "Smoking Status", "Alcohol Consumption", "Diabetes", "Hypertension", "Cholesterol Level",
                       "Family History of Alzheimer’s" , "Depression Level","Sleep Quality",
                       "Dietary Habits","Air Pollution Exposure", "Employment Status", "Marital Status",
                       "Genetic Risk Factor (APOE-ε4 allele)", "Social Engagement Level",
                       "Income Level","Stress Levels","Urban vs Rural Living","Alzheimer’s Diagnosis"]

# The grid is three panels wide; the number of rows is derived from the
# column list so the two cannot drift apart (21 columns -> 7 rows).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(categorical_columns) // n_grid_cols))  # ceiling division

# squeeze=False makes subplots() always return a 2-D array of Axes, so the
# flatten() below also works when the grid collapses to a single row.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(24, 16), squeeze=False)

# Flatten the 2-D axes array so it can be indexed with a single counter.
axes = axes.flatten()

# Plot a horizontal count plot for each categorical feature.
for i, col in enumerate(categorical_columns):
    ax = axes[i]
    sns.countplot(y=df[col], ax=ax)
    ax.set_title(f"{col} Distribution")
    ax.tick_params(axis='y', labelsize=12)  # Adjust label size for readability

# Hide panels left over when the column count is not a multiple of the grid width.
for unused_ax in axes[len(categorical_columns):]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

<ul><li><p style = "font-family: consolas"> Binary Encoding of Suitable Categorical Data </p></li></ul>

In [45]:
# Two-valued categorical columns and the 0/1 code assigned to each label.
binary_encodings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Diabetes': {'No': 0, 'Yes': 1},
    'Hypertension': {'No': 0, 'Yes': 1},
    'Cholesterol Level': {'Normal': 0, 'High': 1},
    'Family History of Alzheimer’s': {'No': 0, 'Yes': 1},
    'Genetic Risk Factor (APOE-ε4 allele)': {'No': 0, 'Yes': 1},
    'Urban vs Rural Living': {'Urban': 0, 'Rural': 1},
    'Alzheimer’s Diagnosis': {'No': 0, 'Yes': 1},
}

for column, encoding in binary_encodings.items():
    # Skip columns whose non-null values are already 0/1. Without this guard,
    # re-running the cell would push 0/1 through the label dictionary and
    # Series.map would turn every value into NaN.
    if df[column].dropna().isin(list(encoding.values())).all():
        continue

    encoded = df[column].map(encoding)

    # Series.map yields NaN for labels that are not keys of the dictionary.
    # Fail loudly on such labels instead of silently writing NaN into the data.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected values in column {column!r}: {list(unmapped)}"
        )

    df[column] = encoded

<ul><li><p style = "font-family: consolas"> Split test and training data</p></li></ul>

In [46]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

<ul><li><p style = "font-family: consolas"> Save Training And Testing Data</p></li></ul>

In [47]:
# Write each split to its own CSV file, leaving the DataFrame index out of the file.
for split_frame, csv_path in (
    (train_data, 'data/train_data.csv'),
    (test_data, 'data/test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)