# <font color = yellow> Data prep to Skewness

### Using the Kaggle Insurance dataset

In [None]:
library(dplyr)
library(lubridate)
library(missForest)
library(e1071)  # for determining skewness
library(ggplot2)

options(scipen = 999)

In [None]:

library(ppcor)
library(tibble)
library(corrplot)


In [None]:
library(reshape2)  # melt
library(Hmisc)    # rcorr
library(e1071)  # skewness
library(VIM)    # k-NN nearest neighbors

In [None]:
# Read the Kaggle test and train CSV files from their local paths.
# Text columns are kept as character vectors (no factor conversion).
test_path <- "C:\\all_programming\\Kaggle\\datasets\\test\\test.csv"
df_test <- read.csv(file = test_path, stringsAsFactors = FALSE)
dim(df_test)   # rows x columns; per the original note, some fields are empty

train_path <- "C:\\all_programming\\Kaggle\\datasets\\train\\train.csv"
df_train <- read.csv(file = train_path, stringsAsFactors = FALSE)
dim(df_train)  # rows x columns of the training set

### <font color = yellow> Get smaller samples from each dataset ... 1000 rows

In [None]:
# Draw a random sample of distinct row indices from the test set.
# The sample size is capped at the number of available rows so the call
# cannot fail when the dataset has fewer than 1000 rows, and sample.int()
# avoids the 1:n pitfall for small n.
n <- nrow(df_test)
log_vertices <- sample.int(n, size = min(1000L, n), replace = FALSE)
str(log_vertices)

# Subset the test data to the sampled rows (all columns kept).
df_1k <- df_test[log_vertices, ]
cat("\n Size of the testing sample is: ", dim(df_1k), "\n")

In [None]:
head(df_1k)

### <font color = yellow> Rename the long var names

In [None]:
# Keep a copy of the sampled data under the name `test`; at this point the
# columns of df_1k have not been renamed yet, so the copy keeps the
# original column names.
test <- df_1k
dim(test)

In [None]:
colnames(df_1k)

In [None]:
# Rename columns to shorter names.
# The replacement is positional, so the number of names must match the number
# of columns exactly; the guard below fails loudly instead of silently
# mislabelling columns. ("Smoker" corrects the earlier "Skoker" typo.)
short_names <- c(
  "ID", "Age", "Sex", "Income", "Marital", "Kids", "Edu", "Job", "Health",
  "Loc", "InsType", "Claims", "VehAge", "Credit", "InsDur", "InsDate",
  "CusFeed", "Smoker", "Active", "Type"
)
stopifnot(ncol(df_1k) == length(short_names))
names(df_1k) <- short_names

# View the dataframe with renamed columns
dim(df_1k)
head(df_1k)


In [None]:
sum(is.na(df_1k$Sex))

In [None]:
# stopper

### <font color = yellow> Select the numeric variables from df

In [None]:
is.numeric(df_1k$Age)
is.character(df_1k$Sex)

In [None]:
# Flag the numeric columns of the sample. vapply() guarantees a logical
# vector (one TRUE/FALSE per column) regardless of the input.
log_result <- vapply(df_1k, is.numeric, logical(1))

# Keep only the numeric columns; drop = FALSE keeps the result a data frame
# even if only a single column is numeric.
df_vars <- df_1k[, log_result, drop = FALSE]
dim(df_vars)
head(df_vars, 3)

### <font color = yellow> Remove the ID var before imputing data

In [None]:
# Drop the ID column by name rather than by position: the step then does not
# depend on ID being the first column, and re-running this cell cannot
# accidentally remove a real feature.
df_vars <- df_vars[, names(df_vars) != "ID", drop = FALSE]
dim(df_vars)

In [None]:
# stopper

### <font color = yellow> Plot the existing distribution of df before imputing missing values 

#### ggplot will remove the NA (624 rows) from the 1000 rows

In [None]:

# Reshape the numeric variables into long form (columns `variable`, `value`)
# so that each variable can be drawn in its own facet.
df_melted_pre <- melt(data = df_vars)
dim(df_melted_pre)

# Histogram per variable for the data as it stands before imputation.
# na.rm = FALSE is passed explicitly; the original author notes it does not
# change the result.
ggplot(data = df_melted_pre, mapping = aes(x = value)) +
  geom_histogram(na.rm = FALSE, bins = 10, fill = "blue", alpha = 0.7) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Value", y = "Count",
       title = "Distribution of Variables (Pre-Imputation)") +
  theme_minimal()



In [None]:
# Overwrite every missing value in the long-form data with the placeholder -1
# (this modifies df_melted_pre in place).
df_melted_pre$value <- replace(
  df_melted_pre$value,
  is.na(df_melted_pre$value),
  -1
)

# Same faceted histogram as before, now with the placeholder values included.
ggplot(data = df_melted_pre, mapping = aes(x = value)) +
  geom_histogram(bins = 10, fill = "blue", alpha = 0.7) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Value", y = "Count",
       title = "Distribution of Variables (Pre-Imputation)") +
  theme_minimal()


### <font color = yellow> Plot the df with placeholders for NAs ( using -1 )

#### Note: Replacing the NAs with a placeholder avoids having to put the 20 x 1000 df back together again afterwards

In [None]:
# Build the "before imputation" reference plot.
# NAs are NOT removed here: they are replaced by the placeholder -1 so that
# the amount of missing data is visible as its own bar in each facet.

# Melt the un-imputed numeric data into long form (variable / value)
df_melted_replaced_nas <- melt(df_vars)

# Replace NAs with the placeholder value
df_melted_replaced_nas$value[is.na(df_melted_replaced_nas$value)] <- -1

# View the size of the melted dataframe (placeholders included)
dim(df_melted_replaced_nas)

# Faceted plot for pre-imputed data. No na.rm argument is needed because no
# NAs remain after the replacement; the title states what is actually shown.
plot_prior <- ggplot(df_melted_replaced_nas, aes(x = value)) +
  geom_histogram(bins = 10, fill = "blue", alpha = 0.7) +
  facet_wrap(~ variable, scales = "free") +
  labs(title = "Distribution of Variables (Pre-Imputation, NAs shown as -1)",
       x = "Value", y = "Count") +
  theme_minimal()
plot_prior

### <font color = yellow> Analysis of Comparison:  

    - In many cases, missing values replaced by a placeholder (-1) will significantly alter the overall 
        shape of the plots, especially when they make up a significant share (642 / 1000) of the total number of observations.

    - Plotting data with many NAs or placeholders is not very informative.  
  


### <font color = yellow> Remove the ID column from the data before any processing of data - done above

In [None]:
# dim(df_vars)
# df_vars <- df_vars[, -1] 
# dim(df_vars)
# head(df_vars, 2)

In [None]:
dim(df_vars)
head(df_vars, 3)

### <font color = yellow> Preserve the locations where the NAs exist / to be replaced (imputed)

In [None]:
# Record where the missing values are, so the imputed cells can be
# identified later when the data frame is put back together.

# Thin wrapper: TRUE wherever an element of x is NA.
fct.na <- function(x) is.na(x)

# Apply the wrapper column by column; dim() shows the shape of the result.
na_indices <- sapply(X = df_vars, FUN = fct.na)
dim(na_indices)


### <font color = yellow> Impute the existing / unmodified NA missing values w/ missForest 

#####  The Age variable, a count variable, is included here.  Its imputed values will not be integers

#### The new df is named: df_fixed

In [None]:
# Run missForest on the numeric variables to fill in the missing values.
imputed_data <- missForest(xmis = df_vars)

# Shape of the imputed data returned by missForest.
dim(imputed_data[["ximp"]])

# df_fixed: the completed data frame, with imputed values in place of the NAs.
df_fixed <- imputed_data[["ximp"]]
dim(df_fixed)
head(df_fixed, 3)

## <font color = yellow> Write the df_fixed to folder

In [None]:
# Save the imputed data frame to the working directory, without row names.
write.csv(x = df_fixed, file = "df_fixed.csv", row.names = FALSE)

In [None]:
dim(df_vars)
head(df_vars, 3)

### <font color = yellow> Explore estimated impute error rate

In [None]:
# Pull the out-of-bag error estimate stored on the missForest result
# and print it with a label.
oob_error <- imputed_data[["OOBerror"]]
print(paste("Out-of-bag (OOB) Error:", oob_error))


#### <font color = yellow>  An OOB error of 0.0186 is a low error rate ... and better than the OOB error previously determined after removing zeros

        - Therefore, the imputed data frame, df_fixed, will be used for the skewness analysis

## <font color = yellow> Plot all the pre-skewness variables (matrix format)

In [None]:
# Reshape the imputed data (df_fixed) into long form for faceted plotting.
df_melted <- melt(data = df_fixed)

# Inspect the long-form result: its size and first rows.
dim(df_melted)
head(df_melted)

# List the variables present, to confirm none went missing in the reshape.
unique(df_melted$variable)
# summary(df_melted)

In [None]:
# One histogram panel per variable for the imputed data (df_melted).
ggplot(data = df_melted, mapping = aes(x = value)) +
  geom_histogram(bins = 10, fill = "blue", alpha = 0.7) +
  facet_wrap(~ variable, scales = "free") +
  labs(x = "Value", y = "Count", title = "Distribution of Variables") +
  theme_minimal()

In [None]:
# Compare with plot_prior
plot_prior

### <font color = yellow> But ... Kids is not a continuous value.  Kids represents a count ... an integer variable

#### Impute the Kids variable separately using k-nearest neighbors (k-NN) ... returns an integer value

#### Need to use the clean, pre-imputed data ... df_vars


In [None]:
dim(df_vars)

In [None]:
colnames(df_vars)

In [None]:

# Guard: the un-imputed data (df_vars) must have a 'Kids' column to impute.
if (!is.element("Kids", names(df_vars))) {
  stop("The dataframe must contain the 'Kids' column")
}

# Impute only the 'Kids' column with k-nearest neighbours (k = 3).
# NOTE(review): per the VIM docs, kNN() returns the full data frame plus a
# 'Kids_imp' indicator column marking which rows were imputed - confirm.
imputed_integer <- kNN(data = df_vars, variable = "Kids", k = 3)

# Pull out the completed 'Kids' values as a plain vector.
vector_integer_imputed <- imputed_integer[["Kids"]]

# Inspect the structure and the first ten imputed values.
str(vector_integer_imputed)
head(vector_integer_imputed, n = 10)


In [None]:
# compare before / after imput
head(df_vars$Kids, 10)

#### <font color = yellow> Insert the imputed Kids vector into df_vars_x, and then impute the other missing variables with missForest()

In [None]:
# Start from the un-imputed numeric data and swap in the k-NN imputed Kids.
df_vars_x <- df_vars
df_vars_x[["Kids"]] <- vector_integer_imputed

# Count of NAs left in Kids, followed by a look at the updated data frame.
sum(is.na(df_vars_x[["Kids"]]))
dim(df_vars_x)
head(df_vars_x, n = 6)

### <font color = yellow> Impute (anew) the remaining NAs in df_vars_x using missForest

In [None]:
# Second missForest run, this time on the data whose Kids column was already
# filled by k-NN. Note that this reassigns `imputed_data`, replacing the
# result of the first run.
imputed_data <- missForest(xmis = df_vars_x)

# Shape of the imputed data returned by missForest.
dim(imputed_data[["ximp"]])

# df_fixed2: the completed data frame from this second run.
df_fixed2 <- imputed_data[["ximp"]]
dim(df_fixed2)
head(df_fixed2, 3)

### <font color = yellow> Explore estimated impute error rate in the better imputed data

In [None]:
# Pull the out-of-bag error estimate from the current `imputed_data`
# (the most recent missForest run) and print it with a label.
oob_error <- imputed_data[["OOBerror"]]
print(paste("Out-of-bag (OOB) Error:", oob_error))


### <font color = yellow> Not Good!

#### This data was not cleared of the observations with NAs in multiple variables ... this run used all 1000 rows

    - [1] "Out-of-bag (OOB) Error: 0.714625431146593"


### <font color = yellow> cor(df)  - before normalization of data / after normalization

    This method returns a correlation matrix containing the Pearson correlation coefficient for each pairwise combination of numeric variables in a data frame.  Each cell of the matrix represents one pairwise combination. 

In [None]:
# Correlation matrix of the un-imputed numeric variables (ID already
# removed). use = "complete.obs" restricts the calculation to rows that
# have no NA in any column.
cor_matrix <- cor(x = df_vars, use = "complete.obs")
head(cor_matrix)


In [None]:
# Convert a square correlation matrix into a long-format data frame holding
# each variable pair once.
#
# cor_matrix: a matrix of pairwise correlations.
# Returns: a data frame with columns Variable1, Variable2 and Correlation,
#   built from the cells strictly above the diagonal (self-correlations and
#   mirrored duplicates are dropped, as are any rows containing NA).
flatten_correlation_matrix <- function(cor_matrix) {
  # Keep only the strict upper triangle; everything else becomes NA.
  above_diagonal <- upper.tri(cor_matrix, diag = FALSE)
  cor_matrix[!above_diagonal] <- NA

  # Table -> data frame yields one row per matrix cell.
  long_form <- as.data.frame(as.table(cor_matrix))

  # Discard the cells blanked out above (and any other NA rows).
  long_form <- long_form[complete.cases(long_form), ]

  names(long_form) <- c("Variable1", "Variable2", "Correlation")
  long_form
}

# Long-format table of the unique variable pairs and their correlations.
cor_df <- flatten_correlation_matrix(cor_matrix = cor_matrix)
print(cor_df)


### <font color = yellow> Top Correlations

In [None]:
# Rank the variable pairs by the strength of their correlation.
# Sorting on the absolute value keeps strong negative correlations at the top
# instead of pushing them to the bottom of the list.
top_coefficients <- cor_df %>% arrange(desc(abs(Correlation)))
head(top_coefficients)

In [None]:

# Give the correlation matrix descriptive labels for plotting.
# The labels are looked up by the current (short) column name rather than
# assigned by position, so a change in column order or count cannot
# mislabel the axes, and re-running the cell leaves existing labels intact.
axis_labels <- c(
  Age = "Age",
  Income = "Annual.Income",
  Kids = "Number.of.Dependents",
  Health = "Health.Score",
  Claims = "Previous.Claims",
  VehAge = "Vehicle.Age",
  Credit = "Credit.Score",
  InsDur = "Insurance.Duration"
)
heatmap_names <- colnames(cor_matrix)
has_label <- heatmap_names %in% names(axis_labels)
heatmap_names[has_label] <- unname(axis_labels[heatmap_names[has_label]])
colnames(cor_matrix) <- rownames(cor_matrix) <- heatmap_names

# Melt the correlation matrix into long form (Var1, Var2, value)
melted_corr <- melt(cor_matrix)

# Plot heatmap with more intense colors for high correlations.
# `limits` is spelled out in full rather than relying on partial matching.
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
    geom_tile(color = "white") +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red",
                     midpoint = 0, limits = c(-1, 1), space = "Lab", name = "Correlation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1)) +
    coord_fixed()

# <font color = yellow> Determine the probabilities ... the p-values

#### Interpreting p-values:

- Low p-value (< 0.05): Indicates strong evidence against the null hypothesis,
   suggesting that the observed correlation is statistically significant.

In [None]:
# rcorr() works on a matrix, so the numeric data frame is converted first.
# Note: `cor_matrix` is reassigned here and now holds the rcorr() result
# object, whose P component (the p-values) is extracted further below.
library(Hmisc)
cor_matrix <- rcorr(x = as.matrix(df_vars))

In [None]:
str(cor_matrix)
is.matrix(cor_matrix)

### <font color = yellow> p-value outcome: Income & Kids have low p-values, which therefore indicates statistical significance

In [None]:
# Take the p-value matrix (component P) out of the rcorr() result.
pval_matrix <- cor_matrix[["P"]]

# Convert a square matrix of p-values into a long-format data frame holding
# each variable pair once.
#
# pval_matrix: a matrix of pairwise p-values.
# Returns: a data frame with columns Variable1, Variable2 and p_value, built
#   from the cells strictly above the diagonal; rows containing NA are dropped.
flatten_pval_matrix <- function(pval_matrix) {
  # Blank out the diagonal and the lower triangle so each pair appears once.
  upper_only <- replace(pval_matrix, lower.tri(pval_matrix, diag = TRUE), NA)

  # One row per matrix cell, then keep only the rows that still hold a value.
  long_form <- as.data.frame(as.table(upper_only))
  has_value <- complete.cases(long_form)
  long_form <- long_form[has_value, ]

  names(long_form) <- c("Variable1", "Variable2", "p_value")
  long_form
}

# Long-format table of the unique variable pairs and their p-values.
df_pval <- flatten_pval_matrix(pval_matrix)

# Size and first rows of the table, in matrix order.
dim(df_pval)
head(df_pval, 2)

# Re-order so the smallest p-values come first, then show the top rows.
df_pval <- arrange(df_pval, p_value)
head(df_pval)

### <font color = yellow> Next Step: Calculate the Skewness, consider direction and size

In [None]:

colnames(df_fixed)

In [None]:
# Calculate skewness for each numeric variable of the imputed data (df_fixed).
# na.rm = TRUE is kept as a safeguard should any NA remain.
skewness_Age <- skewness(df_fixed$Age, na.rm = TRUE)
skewness_Income <- skewness(df_fixed$Income, na.rm = TRUE)
skewness_Dependents <- skewness(df_fixed$Kids, na.rm = TRUE)
skewness_HC <- skewness(df_fixed$Health, na.rm = TRUE)
skewness_PC <- skewness(df_fixed$Claims, na.rm = TRUE)
skewness_VA <- skewness(df_fixed$VehAge, na.rm = TRUE)  # previously omitted
skewness_CS <- skewness(df_fixed$Credit, na.rm = TRUE)
skewness_ID <- skewness(df_fixed$InsDur, na.rm = TRUE)

# Print skewness values, one labelled line per variable.
print(paste("Skewness of Age:", skewness_Age))
print(paste("Skewness of Income:", skewness_Income))
print(paste("Skewness of Dependents:", skewness_Dependents))
print(paste("Skewness of Health Score:", skewness_HC))
print(paste("Skewness of Previous Claims:", skewness_PC))
print(paste("Skewness of Vehicle Age:", skewness_VA))
print(paste("Skewness of Credit Score:", skewness_CS))
# Label corrected: this value is for InsDur, not the credit score.
print(paste("Skewness of Insurance Duration:", skewness_ID))