# **Data Collection**

## Objectives

* Download data from Kaggle.com and perform an initial EDA.

## Inputs

* unclean_smartwatch_health_data.csv

## Outputs

* ydata-profiling EDA

## Additional Comments

* In case you have any additional comments that don't fit in the previous bullets, please state them here. 

---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running from.
current_dir = os.getcwd()
# Bare expression: the notebook renders the value as the cell output.
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Step up one level: the parent of the recorded directory becomes the
# working directory.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory so the variable reflects the change made above.
current_dir = os.getcwd()
# Bare expression: the notebook renders the value as the cell output.
current_dir

Locate our clean data files

In [None]:
# Top-level locations, written relative to the current working directory.
CleanedDataFolder = "inputs/cleaned_data/"
OutputFolder = "outputs/"

# One sub-folder of cleaned data per hypothesis.
HypothesisOneFolder = f"{CleanedDataFolder}hypothesis_one_data/"
HypothesisTwoFolder = f"{CleanedDataFolder}hypothesis_two_data/"
HypothesisThreeFolder = f"{CleanedDataFolder}hypothesis_three_data/"

Load the cleaned data into variables

In [None]:
import pandas as pd
# Hypothesis 1
# read_csv already returns a DataFrame; `data` and `df1` are both kept
# because both names were defined by this cell originally.
data = pd.read_csv(HypothesisOneFolder + "cleaned_smartwatch_health_data.csv")
df1 = pd.DataFrame(data)

# Hypothesis 2
# Each hypothesis gets its own variable names so that loading hypothesis
# three no longer discards the hypothesis two splits.
h2_xtest_set = pd.read_csv(HypothesisTwoFolder + "X_test.csv")
h2_ytest_set = pd.read_csv(HypothesisTwoFolder + "y_test.csv")
h2_xtrain_set = pd.read_csv(HypothesisTwoFolder + "X_train.csv")
h2_ytrain_set = pd.read_csv(HypothesisTwoFolder + "y_train.csv")

# Hypothesis 3
h3_xtest_set = pd.read_csv(HypothesisThreeFolder + "X_test.csv")
h3_ytest_set = pd.read_csv(HypothesisThreeFolder + "y_test.csv")
h3_xtrain_set = pd.read_csv(HypothesisThreeFolder + "X_train.csv")
h3_ytrain_set = pd.read_csv(HypothesisThreeFolder + "y_train.csv")

# Backward-compatible aliases: the unprefixed names used to be assigned twice
# and therefore ended up holding the hypothesis three data. Keep that binding
# for any later cell that still relies on it.
xtest_set = h3_xtest_set
ytest_set = h3_ytest_set
xtrain_set = h3_xtrain_set
ytrain_set = h3_ytrain_set



# Section 2 Normality, Skewness and Kurtosis Improvement


Let's try to improve the distributions of the dataset's variables.

In [None]:
# Print the value range of every float64/int64 column to judge whether
# Box-Cox is a suitable transformation.
for col, col_dtype in df1.dtypes.items():
    is_numeric = col_dtype == "float64" or col_dtype == "int64"
    if not is_numeric:
        continue
    values = df1[col]
    print(f"{col} min: {values.min()}, max: {values.max()}")

# Preview the first rows as the cell output.
df1.head()

As we don't have any negative values, the BoxCox Transformer seems appropriate to try first (note that Box-Cox requires strictly positive values, so zeros would also rule it out).

In [None]:
from sklearn.pipeline import Pipeline
from feature_engine import transformation as vt
from feature_engine.imputation import MeanMedianImputer
import seaborn as sns
import pingouin as pg
import matplotlib.pyplot as plt

# Keep only the float64/int64 columns of df1 for the transformation experiments.
df_numeric = df1.select_dtypes(include=['float64','int64'])

def calculate_skew_kurtosis(df, col, moment):
    """Print the skewness and kurtosis of one column, rounded to two decimals.

    Args:
        df: DataFrame holding the column.
        col: Label of the column to summarise.
        moment: Free-text label printed at the start of the line
            (for example 'before transformation').
    """
    # Built-in round() accepts NumPy scalars as well as plain Python floats.
    # The `.round()` method exists only on NumPy scalars, so it fails on the
    # plain-float NaN that pandas may return for very short columns.
    skewness = round(df[col].skew(), 2)
    kurtosis = round(df[col].kurtosis(), 2)
    print(f"{moment}  | skewness: {skewness} | kurtosis: {kurtosis}")


# Single-step pipeline; swap the transformer class here to try another method.
# The step is labelled 'log' although it wraps the Box-Cox transformer.
pipeline = Pipeline(steps=[('log', vt.BoxCoxTransformer())])

# Fit on the numeric columns and keep their transformed counterpart.
df_transformed = pipeline.fit_transform(df_numeric)

def compare_distributions_before_and_after_applying_transformer(df, df_transformed, method):
    """Plot, save and summarise every column before and after a transformation.

    For each column of ``df`` a 2x2 figure is drawn: a histogram with KDE and
    a normal Q-Q plot, with the original data on the top row and the
    transformed data on the bottom row. The figure is written to
    ``<OutputFolder>/norm_skew_improvement/<method>/<method>_<column>.png``
    and shown, then the skewness and kurtosis of both versions are printed.

    Args:
        df: DataFrame with the original columns.
        df_transformed: DataFrame with the same column labels after the
            transformation.
        method: Name of the transformer; used in plot titles, the output
            sub-folder and the file names.
    """
    # The target folder depends only on `method`, so create it once up front.
    # exist_ok=True replaces the separate "exists?" check.
    plot_folder = os.path.join(OutputFolder, "norm_skew_improvement", method)
    os.makedirs(plot_folder, exist_ok=True)

    for col in df.columns:
        print(f"*** {col} ***")
        fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 8))

        # Top row: original data.
        sns.histplot(data=df, x=col, kde=True, ax=axes[0, 0])
        axes[0, 0].set_title(f'Before {method}')
        pg.qqplot(df[col], dist='norm', ax=axes[0, 1])

        # Bottom row: transformed data.
        sns.histplot(data=df_transformed, x=col, kde=True, ax=axes[1, 0])
        axes[1, 0].set_title(f'After {method}')
        pg.qqplot(df_transformed[col], dist='norm', ax=axes[1, 1])

        plt.tight_layout()

        # Save before plt.show(): once shown, a backend may close the figure,
        # so saving first guarantees the file matches what was drawn.
        fig.savefig(os.path.join(plot_folder, f"{method}_{col}.png"))
        plt.show()
        # Release the figure so one is not left open per column.
        plt.close(fig)

        calculate_skew_kurtosis(df, col, moment='before transformation')
        calculate_skew_kurtosis(df_transformed, col, moment='after transformation')
        print("\n")


compare_distributions_before_and_after_applying_transformer(df_numeric, df_transformed, method='BoxCoxTransformer')

The results are good from the BoxCox Transformer

Let's try the Log Transformer next

In [None]:
# Start again from the untransformed numeric columns.
df_numeric = df1.select_dtypes(include=['float64','int64'])

# calculate_skew_kurtosis and
# compare_distributions_before_and_after_applying_transformer are defined in
# the Box-Cox cell earlier in this notebook. This cell already depends on that
# cell for its imports, so the helpers are reused here instead of being
# redefined with identical bodies.
pipeline = Pipeline([
      ( 'log', vt.LogTransformer() ) # Change transformation type here
  ])

df_transformed = pipeline.fit_transform(df_numeric)

compare_distributions_before_and_after_applying_transformer(df_numeric, df_transformed, method='LogTransformer')

The Log Transformer did not perform as well as the BoxCox

Let's also try the Power Transformer

In [None]:
# Start again from the untransformed numeric columns.
df_numeric = df1.select_dtypes(include=['float64','int64'])

# calculate_skew_kurtosis and
# compare_distributions_before_and_after_applying_transformer are defined in
# the Box-Cox cell earlier in this notebook. This cell already depends on that
# cell for its imports, so the helpers are reused here instead of being
# redefined with identical bodies.
# The step label 'log' is kept unchanged although the step wraps the power
# transformer.
pipeline = Pipeline([
      ( 'log', vt.PowerTransformer() ) # Change transformation type here
  ])

df_transformed = pipeline.fit_transform(df_numeric)

compare_distributions_before_and_after_applying_transformer(df_numeric, df_transformed, method='PowerTransformer')

The Power Transformer did not perform as well as the BoxCox, so we will settle for the BoxCox transformation.

---

* You may add as many sections as you want, as long as it supports your project workflow.
* All notebook cells should be run top-down (you can't create a dynamic where, at a given point, you need to go back to a previous cell to execute some task, such as refreshing a variable's content)

---

# Push files to Repo

* In case you don't need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os
try:
  # create here your folder
  # os.makedirs(name='')
  # A try block needs at least one statement; without this placeholder the
  # cell fails to parse because the body holds only comments.
  pass
except Exception as e:
  print(e)
