In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Read the Framingham CSV; the listed placeholder tokens are parsed as NaN.
na_tokens = ["NA", "N/A", "", " ", "null", "?"]
df = pd.read_csv('framingham.csv', na_values=na_tokens)

# Structural overview: dimensions and per-column dtypes.
print("Shape of dataset:", df.shape)
print("\nData types:\n", df.dtypes)

# Per-column missingness, as an absolute count and as a percentage of all rows.
missing_counts = df.isna().sum()
missing_percent = (missing_counts / len(df)) * 100

# Side-by-side summary table, worst-affected columns first.
missing_df = pd.concat(
    [missing_counts.rename('Missing Count'), missing_percent.rename('Missing %')],
    axis=1,
).sort_values(by='Missing %', ascending=False)

# Only report the columns that actually contain missing entries.
has_missing = missing_df['Missing Count'] > 0
print("\nMissing Data Overview:\n", missing_df[has_missing])

# Heatmap of the null mask (one row per record) for the selected columns.
focus_cols = ['glucose', 'education', 'BPMeds', 'totChol', 'cigsPerDay', 'BMI', 'heartRate']
null_mask = df[focus_cols].isna()

plt.figure(figsize=(12, 6))
sns.heatmap(null_mask, cbar=False, cmap='viridis')
plt.title("Missing Data Heatmap")
plt.tight_layout()
plt.show()

In [None]:
# Add a 0/1 indicator column "<name>_missing" for each column whose
# missingness pattern is inspected below (1 = value is null in that row).
for source_col in ('glucose', 'BPMeds', 'cigsPerDay'):
    df[f'{source_col}_missing'] = df[source_col].isna().astype(int)

# Age distribution split by whether glucose is missing.
plt.figure(figsize=(6, 4))
sns.boxplot(x='glucose_missing', y='age', data=df)
plt.title('Age vs Missing Glucose (Check for MAR)')
plt.show()

# Count plots: a categorical column on the x-axis, bars split by the
# missingness indicator of a related column.
count_plots = (
    ('prevalentHyp', 'BPMeds_missing', 'prevalentHyp vs Missing BPMeds'),
    ('currentSmoker', 'cigsPerDay_missing', 'Smoking Status vs Missing cigsPerDay'),
)
for x_col, hue_col, plot_title in count_plots:
    plt.figure(figsize=(6, 4))
    sns.countplot(data=df, x=x_col, hue=hue_col)
    plt.title(plot_title)
    plt.show()

In [None]:
# Columns that contain missing values.
columns_with_na = ['glucose', 'education', 'BPMeds', 'totChol', 'cigsPerDay', 'BMI', 'heartRate']

# Work on a copy so the original frame is preserved.
df_basic = df.copy()

# Column groups, one per imputation strategy.
mean_cols = ['glucose', 'totChol', 'BMI', 'heartRate']
median_cols = ['cigsPerDay']
mode_cols = ['education', 'BPMeds']  # categorical-like variables

# --- Mean imputation ---
mean_imputer = SimpleImputer(strategy='mean')
df_basic[mean_cols] = mean_imputer.fit_transform(df_basic[mean_cols])

# --- Median imputation ---
median_imputer = SimpleImputer(strategy='median')
df_basic[median_cols] = median_imputer.fit_transform(df_basic[median_cols])

# --- Mode (most frequent value) imputation ---
mode_imputer = SimpleImputer(strategy='most_frequent')
df_basic[mode_cols] = mode_imputer.fit_transform(df_basic[mode_cols])

# Visual check: null mask of the imputed columns after filling.
remaining_nulls = df_basic[columns_with_na].isnull()
plt.figure(figsize=(10, 4))
sns.heatmap(remaining_nulls, cbar=False, cmap='crest')
plt.title("Missing Values After Basic Imputation")
plt.tight_layout()
plt.show()


In [None]:
# KNN imputation on min-max scaled features.
#
# KNNImputer picks neighbours by a NaN-aware Euclidean distance, so columns
# measured on larger numeric scales would dominate the neighbour search if the
# raw values were used. Every column is therefore scaled to [0, 1] first, the
# imputation is done in scaled space, and the result is mapped back to the
# original units.
df_knn = df.copy()

# MinMaxScaler disregards NaNs when fitting and keeps them as NaN on transform.
knn_scaler = MinMaxScaler()
knn_scaled = knn_scaler.fit_transform(df_knn[columns_with_na])

knn_imputer = KNNImputer(n_neighbors=5)
knn_filled_scaled = knn_imputer.fit_transform(knn_scaled)

# Back to original units, aligned on df_knn's index and column labels.
knn_filled = pd.DataFrame(
    knn_scaler.inverse_transform(knn_filled_scaled),
    index=df_knn.index,
    columns=columns_with_na,
)

# Fill only the cells that were missing, so observed values are kept exactly
# (no floating-point drift from the scale / inverse-scale round trip).
df_knn[columns_with_na] = df_knn[columns_with_na].fillna(knn_filled)

# NOTE(review): imputed values are neighbour averages, so columns imputed with
# the mode in the basic approach ('education', 'BPMeds') can receive
# fractional values here -- confirm whether they should be rounded.

# Visual check: null mask of the imputed columns after filling.
plt.figure(figsize=(10, 4))
sns.heatmap(df_knn[columns_with_na].isnull(), cbar=False, cmap='viridis')
plt.title("Missing Values After KNN Imputation")
plt.tight_layout()
plt.show()


In [None]:
# Iterative (MICE-style) imputation on a fresh copy of the data.
df_mice = df.copy()

# Fixed random_state so repeated runs give the same imputed values.
mice_imputer = IterativeImputer(random_state=0)
mice_values = mice_imputer.fit_transform(df_mice[columns_with_na])
df_mice[columns_with_na] = mice_values

# Visual check: null mask of the imputed columns after filling.
mice_null_mask = df_mice[columns_with_na].isnull()
plt.figure(figsize=(10, 4))
sns.heatmap(mice_null_mask, cbar=False, cmap='plasma')
plt.title("Missing Values After MICE Imputation")
plt.tight_layout()
plt.show()

In [None]:
# Regression imputation: predict missing glucose from other columns with a
# linear model fitted on the rows where glucose is observed.
df_reg = df.copy()

# Step 1: predictor columns used by the regression.
features = ['age', 'BMI', 'totChol', 'heartRate']

# Step 2: keep only rows whose predictors are complete. Glucose itself must
# NOT be part of this filter: dropping rows with missing glucose here would
# remove exactly the rows that need imputing and leave nothing to predict.
df_reg_filtered = df_reg.dropna(subset=features)

# Step 3: split on glucose availability -- fit on observed, predict missing.
glucose_missing = df_reg_filtered['glucose'].isnull()
train = df_reg_filtered[~glucose_missing]
test = df_reg_filtered[glucose_missing]

# Step 4: fit the model on rows with observed glucose.
reg = LinearRegression()
reg.fit(train[features], train['glucose'])

# Step 5: write predictions back into the rows that were missing glucose.
if not test.empty:
    predicted_glucose = reg.predict(test[features])
    df_reg.loc[test.index, 'glucose'] = predicted_glucose
    print(f"Imputed {len(test)} missing glucose values via linear regression.")
else:
    print("No missing glucose values left to predict after cleaning.")

# Rows that lack both glucose and at least one predictor cannot be predicted
# by this model and stay NaN; report them so they are not silently overlooked.
still_missing = int(df_reg['glucose'].isnull().sum())
if still_missing:
    print(f"{still_missing} glucose values remain missing (incomplete predictors).")


In [None]:
# Autoencoder-based imputation of the columns listed in columns_with_na.

# Prepare the data: work on a copy and isolate the columns to impute.
df_dl = df.copy()
df_dl_subset = df_dl[columns_with_na].copy()

# Simple initial fill (column means) so the network receives no NaNs.
df_dl_imputed = df_dl_subset.fillna(df_dl_subset.mean())

# Scale every column to [0, 1]; this matches the sigmoid output layer below.
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_dl_imputed)

# Autoencoder architecture: input -> 16 -> 8 -> 16 -> input.
input_dim = df_scaled.shape[1]
input_layer = Input(shape=(input_dim,))
encoded = Dense(16, activation='relu')(input_layer)
encoded = Dense(8, activation='relu')(encoded)
decoded = Dense(16, activation='relu')(encoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train the network to reconstruct its own (scaled, mean-filled) input.
# NOTE(review): no random seed is set, so results vary between runs. The
# mean-filled placeholder cells are also part of the reconstruction target,
# which may pull the imputed values toward the column means -- consider
# seeding and a masked loss.
autoencoder.fit(df_scaled, df_scaled, epochs=100, batch_size=32, shuffle=True, verbose=0)

# Reconstruct the data and map it back to the original units.
reconstructed = autoencoder.predict(df_scaled)
df_reconstructed = scaler.inverse_transform(reconstructed)

# Build the reconstructed frame on df_dl's own index so the fill below aligns
# row-for-row even if df does not carry a default 0..n-1 index.
df_imputed_deep = pd.DataFrame(
    df_reconstructed,
    index=df_dl_subset.index,
    columns=columns_with_na,
)

# Replace values ONLY where the original was missing; observed values are kept.
df_dl[columns_with_na] = df_dl_subset.fillna(df_imputed_deep)
