In [128]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [129]:
# Relative path to the health survey CSV.
data_file_path = "dataset/health_dataset.csv"

# Read the file and strip leading/trailing whitespace from every column name
# in a single chained expression.
df = pd.read_csv(data_file_path, encoding='ascii').rename(columns=str.strip)

In [None]:
# Attributes used for the row filter and for the correlation heatmap below.
chosen_attr = [
    'Act_improve_health',
    'Age',
    'Edu_level',
    'Gender',
    'Gen_health_state',
    'Health_utility_indx',
    'Household',
    'Life_satisfaction',
    'Marital_status',
    'Mental_health_state',
    'Sense_belonging',
    'Stress_level',
    'Total_income',
    'Weight_state',
    'Work_hours',
    'Worked_job_business',
    'working_status',
]

# Keep only the rows that have a value for every chosen attribute.
df_cleaned = df.dropna(subset=chosen_attr)
row_count = df_cleaned.shape[0]
print(f"Number of rows in the cleaned dataset: {row_count}")

# Pairwise Spearman (rank) correlation between the chosen attributes.
correlation_matrix = df_cleaned[chosen_attr].corr(method='spearman')

# The heatmap is drawn at the default figure size; calling
# plt.figure(figsize=(12, 8)) beforehand is an option if the annotations look cramped.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    annot_kws={"size": 6},
)
plt.title('Correlation Heatmap of Health Dataset')
plt.show()

In [None]:
# Scatter of Stress_level against Gender for the cleaned rows.
sns.scatterplot(x='Gender', y='Stress_level', data=df_cleaned[['Gender', 'Stress_level']], color='red')
plt.title('Stress Level by Gender')
plt.show()

# Chi-squared test of independence on the Gender x Stress_level contingency table.
# `stats` is scipy.stats; it has to be imported for this cell to run on a fresh kernel.
contingency_table = pd.crosstab(df_cleaned['Gender'], df_cleaned['Stress_level'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:\n", expected)

alpha = 0.05  # Significance level
if p < alpha:
    print("Reject the null hypothesis: Variables are dependent.")
else:
    # A non-significant result is absence of evidence for dependence,
    # not proof that the variables are independent.
    print("Fail to reject the null hypothesis: No significant evidence that the variables are dependent.")

In [None]:
# Report how many cleaned rows feed this plot.
print(f"Number of rows in this analysis: {df_cleaned.shape[0]}")

# Stress level against marital status, with points coloured by gender.
marital_view = df_cleaned[['Marital_status', 'Stress_level', 'Gender']]
sns.scatterplot(
    data=marital_view,
    x='Marital_status',
    y='Stress_level',
    hue='Gender',
    palette='bright',
)
plt.title('Stress Level by Marital Status with Gender Classification')
plt.show()

In [None]:
# Each entry is (x column, y column, figure title, line colour). One figure is
# drawn per entry, showing only the fitted regression line (scatter=False).
trend_specs = [
    ('Mental_health_state', 'Stress_level', 'Stress Level by Mental Health State', 'red'),
    ('Life_satisfaction', 'Stress_level', 'Stress Level by Life Satisfaction', 'purple'),
    ('Life_satisfaction', 'Weight_state', 'Weight State by Life Satisfaction', 'green'),
]

for x_col, y_col, heading, line_colour in trend_specs:
    sns.regplot(x=x_col, y=y_col, data=df_cleaned[[x_col, y_col]], scatter=False, color=line_colour)
    plt.title(heading)
    plt.show()

In [None]:
# Chi-squared test of independence on the Age x working_status contingency table.
# `stats` is scipy.stats; it has to be imported for this cell to run on a fresh kernel.
contingency_table = pd.crosstab(df_cleaned['Age'], df_cleaned['working_status'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:\n", expected)

alpha = 0.05  # Significance level
if p < alpha:
    print("Reject the null hypothesis: Variables are dependent.")
else:
    # A non-significant result is absence of evidence for dependence,
    # not proof that the variables are independent.
    print("Fail to reject the null hypothesis: No significant evidence that the variables are dependent.")

# Fitted linear trend only; scatter=False hides the individual observations.
sns.regplot(x='Age', y='working_status', data=df_cleaned[['Age', 'working_status']], scatter=False, color='blue')
plt.title('Working Status by Age')
plt.show()

In [None]:
# Chi-squared test of independence on the Age x Edu_level contingency table.
# `stats` is scipy.stats; it has to be imported for this cell to run on a fresh kernel.
contingency_table = pd.crosstab(df_cleaned['Age'], df_cleaned['Edu_level'])
chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
print(f"Chi-squared statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of freedom: {dof}")
print("Expected frequencies:\n", expected)

alpha = 0.05  # Significance level
if p < alpha:
    print("Reject the null hypothesis: Variables are dependent.")
else:
    # A non-significant result is absence of evidence for dependence,
    # not proof that the variables are independent.
    print("Fail to reject the null hypothesis: No significant evidence that the variables are dependent.")

# Fitted linear trends only; scatter=False hides the individual observations.
sns.regplot(x='Age', y='Edu_level', data=df_cleaned[['Age', 'Edu_level']], scatter=False, color='blue')
plt.title('Education Level by Age')
plt.show()

sns.regplot(x='Edu_level', y='working_status', data=df_cleaned[['Edu_level', 'working_status']], scatter=False, color='blue')
plt.title('Working Status by Education Level')
plt.show()


In [None]:
# Linear model of Stress_level on Work_hours, Age and Life_satisfaction.
#
# The model is built from df_cleaned rather than the raw df: all four columns
# are in chosen_attr, so df_cleaned has no missing values in them. statsmodels'
# OLS does not drop NaN rows by default and sklearn's metrics reject NaN input,
# so fitting on the raw frame would fail or give NaN estimates whenever any of
# these columns has a gap. It also keeps the model on the same rows as the
# plots and tests in the other cells.
X = df_cleaned[['Work_hours', 'Age', 'Life_satisfaction']]  # Independent variables
y = df_cleaned['Stress_level']  # Dependent variable

# Add the intercept column before splitting so train and test share the same design.
X = sm.add_constant(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = sm.OLS(y_train, X_train).fit()
print(model.summary())

# Evaluate on the held-out 20% of rows.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
# Each entry is (predictor column, figure title, line colour). One figure is
# drawn per entry, showing the fitted Stress_level trend without the points.
stress_trends = [
    ('Work_hours', 'Stress level by working hours', 'red'),
    ('working_status', 'Stress level by working status', 'blue'),
]

for predictor, heading, line_colour in stress_trends:
    sns.regplot(
        x=predictor,
        y='Stress_level',
        data=df_cleaned[[predictor, 'Stress_level']],
        scatter=False,
        color=line_colour,
    )
    plt.title(heading)
    plt.show()

In [None]:
# Side-by-side histograms (with KDE overlay) for Total_income and Stress_level.
plt.figure(figsize=(12, 5))

# Each entry is (column to plot, panel title); panels are filled left to right.
hist_panels = [
    ('Total_income', 'Total Income Distribution'),
    ('Stress_level', 'Stress Level Distribution'),
]

for position, (column, heading) in enumerate(hist_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(df_cleaned[column], kde=True, edgecolor='black')
    plt.title(heading)

plt.tight_layout()
plt.show()