# <font color = yellow> Data prep to Skewness

### Using the Kaggle Insurance dataset

In [None]:
library(dplyr)
library(lubridate)
library(missForest)
library(e1071)  # for determining skewness
library(ggplot2)

# Strongly prefer fixed notation over scientific notation when printing numbers
options(scipen = 999)

In [None]:
library(ppcor)
library(tibble)
library(corrplot)

In [None]:
library(reshape2)  # melt
library(Hmisc)    # cor
library(e1071)  # skewness (already loaded in the first cell)

### <font color = yellow> Get the fixed df (created in dataPrep_p_values.ipynb)

In [None]:
# Load the prepared data set. Its missing values were already imputed
# with missForest() when the file was created.
train_path <- "C:\\all_programming\\Kaggle\\datasets\\df_fixed.csv"
df_fixed <- utils::read.csv(file = train_path, stringsAsFactors = FALSE)
# Show the number of rows and columns that were read
dim(df_fixed)

### <font color = yellow> Calculate the Skewness values, consider direction and size

In [None]:
# Calculate the skewness of each numeric variable with e1071::skewness().
# NA values are dropped before the statistic is computed.
skewness_Age <- skewness(df_fixed$Age, na.rm = TRUE)
skewness_Income <- skewness(df_fixed$Income, na.rm = TRUE)
skewness_Dependents <- skewness(df_fixed$Kids, na.rm = TRUE)
skewness_HC <- skewness(df_fixed$Health, na.rm = TRUE)
skewness_PC <- skewness(df_fixed$Claims, na.rm = TRUE)
skewness_CS <- skewness(df_fixed$Credit, na.rm = TRUE)
skewness_ID <- skewness(df_fixed$InsDur, na.rm = TRUE)

# Print skewness values, one labelled line per variable
print(paste("Skewness of Age:", skewness_Age))
print(paste("Skewness of Income:", skewness_Income))
print(paste("Skewness of Dependents:", skewness_Dependents))
print(paste("Skewness of Health Score:", skewness_HC))
print(paste("Skewness of Previous Claims:", skewness_PC))
print(paste("Skewness of Credit Score:", skewness_CS))
# skewness_ID comes from the InsDur column, so it must not reuse the
# "Credit Score" label (assumes InsDur is the insurance duration).
print(paste("Skewness of Insurance Duration:", skewness_ID))

In [None]:
# Reshape df_fixed to long format. melt() is called without id.vars or
# measure.vars, so reshape2 chooses them itself.
df_melted <- reshape2::melt(df_fixed)

# Size and first rows of the long-format data
dim(df_melted)
head(df_melted)
# List the distinct variable names to check that none is missing
unique(df_melted[["variable"]])
# summary(df_melted)

### <font color = yellow> Visualize the unskewed variables

In [None]:
# One 10-bin histogram per variable; every panel gets its own axis scales
ggplot(data = df_melted, mapping = aes(x = value)) +
  geom_histogram(fill = "blue", alpha = 0.7, bins = 10) +
  facet_wrap(vars(variable), scales = "free") +
  ggtitle("Distribution of Variables") +
  xlab("Value") +
  ylab("Count") +
  theme_minimal()

In [None]:
# Per-column summary of df_fixed
summary(df_fixed)

### <font color = yellow> Kids is not a continuous value.  Kids represents a count ... an integer variable

#### Impute the Kids variable separately with nearest neighbors (k-NN)


In [None]:
# List the column names available in df_fixed
colnames(df_fixed)

In [None]:
# Toy data frame for trying out the split-then-impute approach.
# Its only missing value sits in the Kids column.
df_vars <- data.frame(
  Age    = c(25, 45, 35, 50, 60),
  Income = c(30000, 50000, 40000, 60000, 70000),
  Kids   = c(2, NA, 3, 1, 0),
  Health = c(620, 630, 608, 507, 780)
)

# Split the columns: Kids (a count) on one side, the remaining three on the other
df_integer <- df_vars["Kids"]
df_non_integer <- df_vars[c("Age", "Income", "Health")]

# Run missForest on the non-integer columns and keep the completed data (ximp).
# NOTE(review): these three columns contain no NA in the toy data, so there
# appears to be nothing for missForest to fill in here - confirm this is intended.
imputed_non_integer <- missForest(xmis = df_non_integer)
df_non_integer_imputed <- imputed_non_integer[["ximp"]]

# Show the result
print(df_non_integer_imputed)



In [None]:
# k-NN imputation is provided by the VIM package
library(VIM)

# Guard: stop early when df_fixed has no 'Kids' column
if (!("Kids" %in% colnames(df_fixed))) {
  stop("The dataframe must contain the 'Kids' column")
}

# Impute Kids with k-NN using k = 3 neighbours.
# `kNN()` returns the complete dataframe with an additional 'Kids_imp'
# column for imputed values.
imputed_integer <- kNN(data = df_fixed, variable = "Kids", k = 3)

# Pull out the Kids column (a vector, not a dataframe) and preview it
df_integer_imputed <- imputed_integer[["Kids"]]
head(df_integer_imputed)