In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
# Read the dataset (path is relative to the notebook's working directory)
# into `df`, then echo the frame.
DATA_FILE = 'synthetic_health_data.csv'
df = pd.read_csv(DATA_FILE)
print(df)

In [None]:
# Print a structural summary of the raw frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Number of distinct values in each column of the raw frame.
df.nunique()

In [None]:
# Tally the missing entries in each column.
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Count rows that are exact repeats of an earlier row.
duplicate_row_count = df.duplicated().sum()
duplicate_row_count

In [None]:
# Start the cleaning pipeline from an independent copy of the raw data.
# A plain `df_cleaned = df` only binds a second name to the same object, so the
# column assignments made on `df_cleaned` in the following cells would silently
# rewrite the raw `df` as well; copying keeps `df` intact.
df_cleaned = df.copy()

In [None]:
# Age should be an integer: cast the column, then re-inspect the frame
# to confirm the new dtype.
age_as_int = df_cleaned['Age'].astype(int)
df_cleaned['Age'] = age_as_int
df_cleaned.info()

In [None]:
# Treat 'Smoking_Status' as categorical data (an assumption about this column)
# and store it with the pandas 'category' dtype.
df_cleaned['Smoking_Status'] = df_cleaned['Smoking_Status'].astype('category')
# NOTE: one-hot encoding is intentionally not performed in this cell.
# `df_for_analysis` is only created further down, after the range filters, so
# calling pd.get_dummies(df_for_analysis, ...) here raises NameError when the
# notebook is run top to bottom on a fresh kernel.

In [None]:
# Keep only rows whose 'Age' falls inside the assumed plausible range of
# 0 to 100 (both bounds inclusive).
age_values = df_cleaned['Age']
age_is_plausible = (age_values >= 0) & (age_values <= 100)
df_cleaned = df_cleaned.loc[age_is_plausible]

In [None]:
# Keep only rows whose 'BMI' falls inside the assumed plausible range of
# 15 to 40 (both bounds inclusive).
bmi_values = df_cleaned['BMI']
bmi_is_plausible = (bmi_values >= 15) & (bmi_values <= 40)
df_cleaned = df_cleaned.loc[bmi_is_plausible]


In [None]:
# Keep only rows whose 'Sleep_Hours' falls inside the assumed plausible range
# of 0 to 24 hours per day (both bounds inclusive).
sleep_values = df_cleaned['Sleep_Hours']
sleep_is_plausible = (sleep_values >= 0) & (sleep_values <= 24)
df_cleaned = df_cleaned.loc[sleep_is_plausible]

In [None]:
# Columns used in the analysis: the five predictors plus the 'Health_Score' outcome.
columns_for_analysis = [
    'Diet_Quality',
    'Exercise_Frequency',
    'Sleep_Hours',
    'Smoking_Status',
    'Alcohol_Consumption',
    'Health_Score',
]
# Restrict the cleaned data to those columns and drop rows with a missing value
# in any of them.
df_for_analysis = df_cleaned[columns_for_analysis].dropna()
# Build the one-hot encoded version here: this is the first point in the
# notebook where `df_for_analysis` exists. drop_first=True omits one reference
# level per categorical column; dtype=float keeps the indicator columns numeric
# regardless of the pandas version's default dummy dtype.
df_for_analysis_encoded = pd.get_dummies(df_for_analysis, drop_first=True, dtype=float)

In [None]:
# Summary statistics for every analysis column.
# On a mixed-dtype frame describe() reports numeric columns only by default,
# which would silently omit the categorical 'Smoking_Status' even though the
# heading below says it is included. include='all' adds it, so the table also
# carries the unique / top / freq rows (NaN where a statistic does not apply
# to a column's dtype).
descriptive_stats = df_for_analysis.describe(include='all')
print("Descriptive Statistics for Selected Variables (Including Smoking Status):")
print(descriptive_stats)

In [None]:
# Draw one histogram per analysis column, skipping 'Smoking_Status'.
# Each figure gets a KDE overlay and a dashed red line at the column mean.
# (The mean line carries a label, but no legend is drawn.)
columns_to_plot = [name for name in columns_for_analysis if name != 'Smoking_Status']
for column in columns_to_plot:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df_for_analysis[column], kde=True, bins=20, edgecolor='black', alpha=0.7, ax=ax)
    mean_value = df_for_analysis[column].mean()
    ax.axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.2f}')
    ax.set_title(f'{column} Distribution')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(True)
    fig.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the analysis variables.
# 'Smoking_Status' is categorical, and DataFrame.corr() only works on numeric
# data: depending on the pandas version the column is either dropped silently
# or the call may fail outright. One-hot encoding it first (one reference level
# dropped, float indicators) keeps smoking status in the matrix and makes the
# cell behave the same across versions.
correlation_input = pd.get_dummies(df_for_analysis, drop_first=True, dtype=float)
correlation_matrix = correlation_input.corr()
print("Correlation Matrix:")
print(correlation_matrix)

In [None]:
# Render the correlation matrix as an annotated heatmap on a 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# Ordinary least squares regression of 'Health_Score' on the five predictors.
predictor_columns = [
    'Diet_Quality',
    'Exercise_Frequency',
    'Sleep_Hours',
    'Smoking_Status',
    'Alcohol_Consumption',
]
# OLS needs a purely numeric design matrix, but 'Smoking_Status' is stored as a
# pandas categorical. One-hot encode it (drop_first=True leaves out one
# reference level so the indicators are not collinear with the intercept) and
# force float dtype so no boolean/object columns reach statsmodels.
X = pd.get_dummies(df_for_analysis[predictor_columns], drop_first=True, dtype=float)
y = df_for_analysis['Health_Score']
# Add the intercept column.
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print("Regression Analysis Summary:")
print(model.summary())
print("R-squared:", model.rsquared)

In [None]:
# Bar chart of the fitted regression coefficients, with R² shown in the title.
r_squared = model.rsquared
# Skip the first fitted parameter (the intercept when the constant column
# comes first in the design matrix).
coefs = model.params.iloc[1:]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=coefs.index, y=coefs.values, palette="viridis", ax=ax)
ax.set_title(f"Regression Coefficients for Health Score Prediction (R² = {r_squared:.2f})")
ax.set_xlabel("Variables")
ax.set_ylabel("Coefficient Value")
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()
