# Data Loading

In [None]:

import os
from pathlib import Path

import matplotlib.pyplot as plt
import missingno as msno
import pandas as pd
import seaborn as sns

# Data locations. Each can be overridden with an environment variable so the notebook
# runs on another machine without editing code; the defaults are the locations this
# notebook originally used (CSV files in the working directory, parquet at a fixed path).
DATA_DIR = Path(os.environ.get('HBN_DATA_DIR', '.'))
parquet_file_path = os.environ.get(
    'HBN_SERIES_PARQUET',
    'E:/programing/code/quera/first project/series.parquet',
)

# Load datasets
hbn_data = pd.read_csv(DATA_DIR / 'HBN.csv')
data_dictionary = pd.read_csv(DATA_DIR / 'data_dictionary.csv')
series_data = pd.read_parquet(parquet_file_path)


# Data Merging

In [None]:

# Left-join the series data onto the HBN table using the shared 'id' column, so every
# hbn_data row is kept even when it has no matching series rows.
# NOTE(review): if series_data holds several rows per id, each hbn_data row is repeated
# once per match — confirm that is the intended shape for the later plots.
combined_data = hbn_data.merge(series_data, how='left', on='id')


# Data Overview

In [None]:

# Display dataset info
print("Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line to the output.
hbn_data.info()


# Missing Data Visualization

In [None]:

# Visualize missing data
# missingno applies its own default figure size unless `figsize` is passed, so the
# size is given to msno.matrix directly instead of through a separate plt.figure() call.
matrix_ax = msno.matrix(hbn_data, figsize=(10, 6))
# Title the axes returned by missingno rather than whichever axes pyplot considers current.
matrix_ax.set_title("Missing Data Matrix")
plt.show()

# Nullity correlation heatmap of hbn_data.
# missingno applies its own default figure size unless `figsize` is passed, so the
# size is given to msno.heatmap directly instead of through a separate plt.figure() call.
heatmap_ax = msno.heatmap(hbn_data, figsize=(10, 6))
# Title the axes returned by missingno rather than whichever axes pyplot considers current.
heatmap_ax.set_title("Missing Data Heatmap")
plt.show()

# Bar chart of non-missing value counts per column of hbn_data.
# missingno applies its own default figure size unless `figsize` is passed, so the
# size is given to msno.bar directly instead of through a separate plt.figure() call.
bar_ax = msno.bar(hbn_data, figsize=(10, 6))
# Title the axes returned by missingno rather than whichever axes pyplot considers current.
bar_ax.set_title("Existing Data Barplot")
plt.show()


# Exploratory Data Analysis - Distribution

In [None]:

# Histogram of the 'Age' column (20 bins) with a KDE curve overlaid.
age_fig, age_ax = plt.subplots(figsize=(8, 5))
sns.histplot(hbn_data['Age'], bins=20, kde=True, color='blue', ax=age_ax)
age_ax.set_title('Distribution of Age')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')
age_ax.grid()
plt.show()

# Distribution of Gender
plt.figure(figsize=(6, 4))
# `Sex` is assigned to `hue` as well as `x` because seaborn has deprecated passing
# `palette` without `hue`; dodge=False keeps each bar centred on its own tick.
sex_ax = sns.countplot(x='Sex', hue='Sex', data=hbn_data, palette='Set2', dodge=False)
# A hue legend would only repeat the x-axis categories, so remove it if one was drawn
# (whether one is drawn by default depends on the seaborn version).
sex_legend = sex_ax.get_legend()
if sex_legend is not None:
    sex_legend.remove()
plt.title('Gender Distribution')
plt.xlabel('Sex (0 = Male, 1 = Female)')
plt.ylabel('Count')
plt.grid()
plt.show()

# Histogram of the 'Physical-BMI' column (20 bins) with a KDE curve overlaid.
bmi_fig, bmi_ax = plt.subplots(figsize=(8, 5))
sns.histplot(hbn_data['Physical-BMI'], bins=20, kde=True, color='green', ax=bmi_ax)
bmi_ax.set_title('Distribution of Physical-BMI')
bmi_ax.set_xlabel('BMI')
bmi_ax.set_ylabel('Frequency')
bmi_ax.grid()
plt.show()


# Analysis by Gender

In [None]:

# Muscle Mass by Gender
plt.figure(figsize=(8, 5))
# `Sex` is assigned to `hue` as well as `x` because seaborn has deprecated passing
# `palette` without `hue`; dodge=False keeps each box centred on its own tick.
smm_ax = sns.boxplot(
    x='Sex', y='BIA-BIA_SMM', hue='Sex', data=hbn_data, palette='Set2', dodge=False
)
# A hue legend would only repeat the x-axis categories, so remove it if one was drawn
# (whether one is drawn by default depends on the seaborn version).
smm_legend = smm_ax.get_legend()
if smm_legend is not None:
    smm_legend.remove()
plt.title('Muscle Mass by Gender')
plt.xlabel('Gender (0 = Male, 1 = Female)')
plt.ylabel('Skeletal Muscle Mass')
plt.grid()
plt.show()

# Basal Metabolic Rate by Gender
plt.figure(figsize=(8, 5))
# `Sex` is assigned to `hue` as well as `x` because seaborn has deprecated passing
# `palette` without `hue`; dodge=False keeps each box centred on its own tick.
bmr_ax = sns.boxplot(
    x='Sex', y='BIA-BIA_BMR', hue='Sex', data=hbn_data, palette='Set1', dodge=False
)
# A hue legend would only repeat the x-axis categories, so remove it if one was drawn
# (whether one is drawn by default depends on the seaborn version).
bmr_legend = bmr_ax.get_legend()
if bmr_legend is not None:
    bmr_legend.remove()
plt.title('Basal Metabolic Rate (BMR) by Gender')
plt.xlabel('Gender (0 = Male, 1 = Female)')
plt.ylabel('BMR')
plt.grid()
plt.show()


# Sleep Disorder Analysis

In [None]:

# Categorize sleep disorder based on threshold
# Scores at or below the threshold are labelled 'No Disorder', scores above it 'Disorder'.
SLEEP_DISORDER_T_THRESHOLD = 50
sds_total_t = combined_data['SDS-SDS_Total_T']
sleep_labels = sds_total_t.le(SLEEP_DISORDER_T_THRESHOLD).map(
    {True: 'No Disorder', False: 'Disorder'}
)
# A missing score compares as False against the threshold, which would silently label
# it 'Disorder'; keep those rows missing so they are not counted in either category.
combined_data['Sleep_Disorder'] = sleep_labels.where(sds_total_t.notna())

# Visualize Ambient Light by Sleep Disorder
plt.figure(figsize=(8, 5))
# `Sleep_Disorder` is assigned to `hue` as well as `x` because seaborn has deprecated
# passing `palette` without `hue`; dodge=False keeps each box centred on its own tick.
light_ax = sns.boxplot(
    x='Sleep_Disorder',
    y='light',
    hue='Sleep_Disorder',
    data=combined_data,
    palette='coolwarm',
    dodge=False,
)
# A hue legend would only repeat the x-axis categories, so remove it if one was drawn
# (whether one is drawn by default depends on the seaborn version).
light_legend = light_ax.get_legend()
if light_legend is not None:
    light_legend.remove()
plt.title('Ambient Light Distribution by Sleep Disorder')
plt.xlabel('Sleep Disorder Category')
plt.ylabel('Average Ambient Light')
plt.grid()
plt.show()


# Correlation Matrix

In [None]:

# Columns of combined_data whose pairwise correlations are examined.
correlation_columns = [
    'Age',
    'Physical-BMI',
    'BIA-BIA_SMM',
    'BIA-BIA_BMR',
    'light',
    'BIA-BIA_DEE',
    'BIA-BIA_FFM',
    'step',
    'SDS-SDS_Total_T',
]
# Keep only the rows in which every selected column has a value.
correlation_data = combined_data.loc[:, correlation_columns].dropna()

# Annotated heatmap of the pairwise correlations between the selected columns,
# with each cell value printed to two decimals.
corr_matrix = correlation_data.corr()
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Key Variables')
plt.show()
