## Items to add in the code
- adding the (TODO: unfinished item — specify what should be added)
- adding a function to split the data into training, validation, and test sets
- adding ANOVA test to check the significance of the features
- adding validation functions, like Classifier Tuning (Hyperparameter Tuning: **grid search** or random search)
- adding models: KNN, Decision Tree, Random Forest, Gradient Boosting (each evaluated with cross-validation)
- adding performance metrics and diagnostics for the models (overfitting/underfitting checks, learning curves)
- adding EDA to visualize the data
- adding an interpretation of the summary statistics for the sleep productivity dataset

## actions
- prepare PPT (problem statement, objective, plan, status)
- define the research question and its sub-questions
- split tasks



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Relative path of the sleep/productivity CSV that the pipeline below loads
localdir = "data/sleep_productivity.csv"

def preprocess_data(CSV_file):
    """Load the sleep/productivity CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      * surrounding whitespace is stripped from the header names;
      * the literal string "unknown" (any column) and the sentinel -999 in
        "Caffeine Intake (mg)" are treated as missing values;
      * the sleep-duration, deep-sleep and caffeine columns are coerced to
        numeric (unparseable entries become missing);
      * a "Caffeine Intake Category" column (Low / Medium / High) is added;
      * every row that still has a missing value in any column is dropped.

    Args:
        CSV_file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: the cleaned rows, including the category column.

    Raises:
        KeyError: if one of the required columns is absent from the file.
    """
    caffeine_col = "Caffeine Intake (mg)"
    required = ["Sleep Duration (hours)", "Deep Sleep (%)", caffeine_col]

    # "unknown" is mapped to NaN while parsing, so numeric columns stay numeric.
    df = pd.read_csv(CSV_file, na_values=["unknown"])

    # Padded headers (e.g. " Deep Sleep (%)") would otherwise fail the lookups.
    df.columns = df.columns.str.strip()

    # Fail early with a readable message instead of an opaque KeyError later.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing}; "
            f"columns found in the file: {list(df.columns)}"
        )

    # Coerce every numeric input so stray text cannot break pd.cut below.
    for col in required:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # -999 is the placeholder for a missing caffeine measurement.
    df[caffeine_col] = df[caffeine_col].replace(-999, np.nan)

    # include_lowest=True keeps 0 mg in the "Low" bin; without it those rows
    # get a NaN category and are silently removed by the final dropna().
    # Values outside 0-300 mg still get no category and are dropped.
    df["Caffeine Intake Category"] = pd.cut(
        df[caffeine_col],
        bins=[0, 50, 150, 300],
        labels=["Low", "Medium", "High"],
        include_lowest=True,
    )

    # Drop every row that still has a missing value in any column.
    return df.dropna()

def create_features(df):
    """Return a new DataFrame with derived sleep and caffeine features added.

    Added columns:
      * "Light Sleep (%)"   = "Sleep Duration (hours)" - "Deep Sleep (%)"
      * "Total Sleep (%)"   = "Deep Sleep (%)" + "Light Sleep (%)"
      * "Sleep Efficiency"  = "Total Sleep (%)" / "Sleep Duration (hours)"
      * "Caffeine per Hour" = "Caffeine Intake (mg)" / "Sleep Duration (hours)"

    The input frame is not modified; the result is built with
    ``DataFrame.assign``, which returns a new object.

    Args:
        df: DataFrame containing "Sleep Duration (hours)", "Deep Sleep (%)"
            and "Caffeine Intake (mg)".

    Returns:
        pandas.DataFrame: ``df`` plus the four derived columns.
    """
    duration = df["Sleep Duration (hours)"]
    deep = df["Deep Sleep (%)"]

    # NOTE(review): this subtracts a percentage from a duration in hours, so
    # "Total Sleep (%)" just reproduces the sleep duration and
    # "Sleep Efficiency" is 1 for every row with a non-zero duration.
    # The formulas are kept as they were; confirm the intended definitions.
    light = duration - deep
    total = deep + light

    return df.assign(
        **{
            "Light Sleep (%)": light,
            "Total Sleep (%)": total,
            "Sleep Efficiency": total / duration,
            "Caffeine per Hour": df["Caffeine Intake (mg)"] / duration,
        }
    )

def train_model(df, target="Productivity Score"):
    """Fit a linear regression predicting ``target`` from the other columns.

    The ``target`` column and "Caffeine Intake Category" are excluded from
    the predictors. The data is split 70/30 with a fixed random seed and the
    model is fitted on the 70% training portion only.

    Args:
        df: DataFrame holding the predictors, ``target`` and
            "Caffeine Intake Category".
        target: Name of the column to predict.

    Returns:
        The fitted ``LinearRegression`` model.
    """
    excluded_columns = [target, "Caffeine Intake Category"]
    features = df.drop(columns=excluded_columns)
    labels = df[target]

    # The held-out 30% is produced by the split but not used in this function.
    features_train, _features_test, labels_train, _labels_test = train_test_split(
        features, labels, test_size=0.3, random_state=0
    )

    # fit() returns the estimator itself, so the fitted model is returned.
    return LinearRegression().fit(features_train, labels_train)

def evaluate_model(model, df, target="Productivity Score"):
    """Print and return the mean squared error of ``model`` on ``df``.

    The ``target`` column and "Caffeine Intake Category" are removed to form
    the predictors, matching what ``train_model`` does.

    The error is computed on whatever rows ``df`` contains. If those rows
    include the data the model was fitted on, the result is not a held-out
    estimate.

    Args:
        model: Fitted estimator exposing ``predict``.
        df: DataFrame holding the predictors, ``target`` and
            "Caffeine Intake Category".
        target: Name of the column holding the true values. Defaults to
            "Productivity Score", the column that was previously hard-coded.

    Returns:
        The mean squared error between ``df[target]`` and the predictions.
    """
    # Use the same predictor columns as at training time.
    X = df.drop(columns=[target, "Caffeine Intake Category"])
    y = df[target]

    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    print(f"Mean Squared Error: {mse}")

    # Returned as well as printed so callers can compare models.
    return mse

def _show_histogram(series, title, xlabel):
    """Draw and show a 20-bin histogram with a KDE overlay for ``series``."""
    plt.figure(figsize=(10, 4))
    sns.histplot(series, bins=20, kde=True)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()


def visualize_correlations(df):
    """Plot correlations and distributions for the sleep/productivity data.

    Shows, in order: a correlation heatmap of caffeine intake, sleep
    duration, deep sleep and productivity; histograms of sleep duration and
    productivity score; a boxplot of productivity per caffeine-intake
    category. Finally prints ``df.describe()``.

    The caffeine category used for the boxplot is computed on a derived
    frame, so the caller's ``df`` is not modified.

    Args:
        df: DataFrame containing "Caffeine Intake (mg)",
            "Sleep Duration (hours)", "Deep Sleep (%)" and
            "Productivity Score".
    """
    # Correlation analysis
    corr_columns = [
        "Caffeine Intake (mg)",
        "Sleep Duration (hours)",
        "Deep Sleep (%)",
        "Productivity Score",
    ]
    correlation_matrix = df[corr_columns].corr()

    # Heatmap visualization
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Between Caffeine Intake, Sleep, and Productivity")
    plt.show()

    # Distribution plots
    _show_histogram(
        df["Sleep Duration (hours)"],
        "Distribution of Sleep Duration",
        "Hours of Sleep",
    )
    _show_histogram(
        df["Productivity Score"],
        "Distribution of Productivity Scores",
        "Productivity Score",
    )

    # Boxplot of productivity across caffeine intake levels.
    # include_lowest=True keeps 0 mg in the "Low" bin instead of leaving it
    # uncategorised, which would drop those rows from the plot.
    caffeine_category = pd.cut(
        df["Caffeine Intake (mg)"],
        bins=[0, 50, 150, 300],
        labels=["Low", "Medium", "High"],
        include_lowest=True,
    )
    boxplot_data = df.assign(**{"Caffeine Intake Category": caffeine_category})
    plt.figure(figsize=(8, 5))
    sns.boxplot(x="Caffeine Intake Category", y="Productivity Score", data=boxplot_data)
    plt.title("Productivity Score by Caffeine Intake Level")
    plt.xlabel("Caffeine Intake Level")
    plt.ylabel("Productivity Score")
    plt.show()

    # Summary statistics
    print(df.describe())

# Load the CSV and clean it (missing values handled, caffeine category added)
df = preprocess_data(localdir)

# Feature engineering: add the derived sleep and caffeine columns
df = create_features(df)

# Train the linear regression model
model = train_model(df)

# Evaluate and visualize results
# NOTE(review): evaluate_model receives the full df, which includes the rows
# train_model fitted on, so the printed MSE is not a held-out estimate.
evaluate_model(model, df)
visualize_correlations(df)

KeyError: ['Sleep Duration (hours)', 'Deep Sleep (%)']