# Hands On: Data Quality and Pre-Processing

### 1. Assessing Data Quality

Load the following packages: dplyr, na.tools, tidyimpute (GitHub version, from
"decisionpatterns/tidyimpute").
Load the carInsurance data set about the insurance risk rating of cars based on several characteristics of
each car

In [None]:
# Install any missing packages, then attach everything the notebook uses.
# requireNamespace() only checks whether a package is available (it does not
# attach it), so packages that are already present are not re-installed on
# every run of this cell.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)

if (!requireNamespace("DescTools", quietly = TRUE)) {
  install.packages("DescTools")
}

# The exercise asks for the GitHub version of tidyimpute, so it is always
# requested from GitHub rather than skipped when some other build is present
install_github("decisionpatterns/tidyimpute")

# Load the required packages
library(dplyr)
library(na.tools)
library(tidyimpute)
library(DescTools)

Load the carInsurance data set about the insurance risk rating of cars based on several characteristics of
each car

In [None]:
# Restore the objects saved in the carInsurance .Rdata file (Kaggle input path)
load(file = "/kaggle/input/carinsurance/carInsurance.Rdata")

# Preview the first rows of carIns, which is expected to come from that file
head(x = carIns)

(a) Check if there are any missing values.
Tip: use the function any_na().


In [None]:
# TRUE when carIns holds at least one NA, FALSE otherwise
has_missing <- anyNA(carIns)

# Report the outcome; the if/else expression selects the message to print
cat(
  if (has_missing) {
    "The dataframe has missing values."
  } else {
    "The dataframe does not have any missing values."
  }
)

(b) Count the number of cases that have, at least, one missing value.
Tip: use the function filter_any_na() and then count().



In [None]:
# Keep the rows that contain an NA and tally them; count() yields a one-row
# table whose column n holds the tally
count_missing <- count(filter_any_na(carIns))

# Report the number of incomplete cases
cat("Number of cases with at least one missing value:", count_missing[["n"]])

# Report the total number of cases for comparison
cat("\nDataset length:", dim(carIns)[1L])

(c) Create a new data set by removing all the cases that have missing values.
Tip: use the function drop_rows_any_na()


In [None]:
# Build a copy of carIns without the rows that contain any NA
imputed_drop_any <- carIns %>%
  drop_rows_any_na()

# Report how many cases remain
cat("\nDataset length:", dim(imputed_drop_any)[1L])

(d) Create a new data set by imputing all the missing values with 0.
Tip: explore the variants of the function impute()


In [None]:
# Create a new data set in which every missing value is replaced by 0.
# tidyimpute's impute() family receives the replacement through `.na` or
# through a named variant such as impute_zero(); it has no `method =` or
# `value =` arguments, so the fixed-value imputation is requested with
# impute_zero().
# NOTE(review): non-numeric columns (e.g. factors) cannot hold 0 as a regular
# value - confirm how such columns should be treated.
imputed_missing_values <- impute_zero(carIns)
head(imputed_missing_values)

(e) Create a new data set by imputing the mean in all the columns which have double type values.


In [None]:
# For every double-typed column, overwrite the NA entries with the mean of the
# observed (non-missing) values of that column; other columns are untouched
imputed_mean <- mutate(
  carIns,
  across(
    .cols = where(is.double),
    .fns = function(col) if_else(is.na(col), mean(col, na.rm = TRUE), col)
  )
)

# Preview the imputed data
head(imputed_mean)

(f) Create a new data set by imputing the mode in all the columns which have integer type values.


In [None]:
# Impute the mode in every integer-typed column.
imputed_mode <- carIns %>%
  mutate(across(
    .cols = where(is.integer),
    .fns = function(col) {
      # Mode() returns every tied mode; keep a single one so the replacement is
      # one value rather than a vector that gets recycled by row position
      col_mode <- Mode(col[!is.na(col)])[1L]
      # replace() touches only the NA positions, and as.integer() keeps the
      # column integer-typed whatever type Mode() hands back
      replace(col, is.na(col), as.integer(col_mode))
    }
  ))

# Display the head of the imputed data
head(imputed_mode)

(g) Create a new data set by imputing the most frequent value to the column "nDoors".
Tip: use the function impute_replace()


In [None]:
 # Copy carIns so the original data set is left untouched
imputed_data_most_frequent <- carIns
summary(carIns$nDoors)

# Most frequent *observed* value of nDoors: missing values are dropped first
# so that NA can never be picked as the "most frequent" value
most_frequent_value <- carIns %>%
  filter(!is.na(nDoors)) %>%
  count(nDoors) %>%
  arrange(desc(n)) %>%
  pull(nDoors) %>%
  first()

# Replace only the missing entries of nDoors; the observed values are kept
# (assigning the value to the whole column would erase the real data)
ndoors_is_missing <- is.na(imputed_data_most_frequent$nDoors)
imputed_data_most_frequent$nDoors[ndoors_is_missing] <- most_frequent_value

summary(imputed_data_most_frequent$nDoors)
head(imputed_data_most_frequent)

(h) Combine the three last imputations to obtain a final dataset. Are there any duplicated cases?
Tip: use the functions distinct() and count()

In [None]:
# Combine the three imputations into one data set by overlaying the imputed
# columns on a copy of carIns. Columns are matched by name, so each column
# appears exactly once (nDoors is taken only from the most-frequent-value
# imputation) and the original column order of carIns is kept.
double_cols <- setdiff(
  names(carIns)[vapply(carIns, is.double, logical(1))],
  "nDoors"
)
integer_cols <- setdiff(
  names(carIns)[vapply(carIns, is.integer, logical(1))],
  "nDoors"
)

final_data <- carIns
# Double columns: mean imputation
final_data[double_cols] <- imputed_mean[double_cols]
# Integer columns: mode imputation
final_data[integer_cols] <- imputed_mode[integer_cols]
# nDoors: most frequent value imputation
final_data$nDoors <- imputed_data_most_frequent$nDoors

# Check for duplicated cases: a case is duplicated when an identical row
# already occurred earlier, i.e. when distinct() removes rows
n_distinct_cases <- nrow(distinct(final_data))
duplicated_cases <- final_data[duplicated(final_data), , drop = FALSE]

# Print the duplicated cases, if any
if (nrow(duplicated_cases) > 0) {
  print("Duplicated")
  cat(
    "Total cases:", nrow(final_data),
    "- distinct cases:", n_distinct_cases, "\n"
  )
  print(duplicated_cases)
} else {
  print("No duplicated cases")
}

head(final_data)
summary(final_data)

### 2. Data Pre-Processing
2. Load the package dlookr. Use the same car insurance data set above and apply the following transformations to the price attribute. Be critical regarding the obtained results.


In [None]:
# Install dlookr only when it is not available yet, then attach it
if (!requireNamespace("dlookr", quietly = TRUE)) {
  install.packages("dlookr")
}
library(dlookr)


In [None]:
# Continue with the combined, imputed data under the name car_insurance
car_insurance <- final_data

# Per-column overview of car_insurance produced by diagnose()
car_insurance %>%
  diagnose()

(a) Apply range-based normalization and z-score normalization.

In [None]:
# Extract the "price" column
price <- car_insurance$price

# Range-based (min-max) normalization: maps price onto [0, 1].
# The values are computed directly so the new column is a plain numeric
# vector; scale() would store a one-column matrix carrying attributes.
price_min <- min(price, na.rm = TRUE)
price_max <- max(price, na.rm = TRUE)
car_insurance$price_range_normalized <-
  (price - price_min) / (price_max - price_min)

# Z-score normalization: centre on the mean and divide by the standard
# deviation, ignoring missing values like scale() does
car_insurance$price_zscore_normalized <-
  (price - mean(price, na.rm = TRUE)) / sd(price, na.rm = TRUE)

diagnose(car_insurance, price, price_range_normalized, price_zscore_normalized)
head(car_insurance)
tail(car_insurance)

(b) Discretize it into 4 equal-frequency ranges and into 4 equal-width ranges.

In [None]:
# Discretize into 4 equal-frequency ranges: quantile-based cut points, so each
# bin receives roughly the same number of cases
car_insurance$price_equal_frequency <- binning(
  car_insurance$price,
  nbins = 4,
  type = "quantile"
)

# Discretize into 4 equal-width ranges: the type must be requested explicitly,
# otherwise binning() falls back to its quantile default and both columns
# would be identical
car_insurance$price_equal_width <- binning(
  car_insurance$price,
  nbins = 4,
  type = "equal"
)

# View the discretized data set
head(car_insurance)

### 3. With the seed 111019 obtain the following samples on the car insurance data set. 
Tip: use the function sample_frac().


In [None]:
# sample_frac() has no `seed` or `stratify` arguments: the seed is fixed with
# set.seed() and stratification is obtained by sampling within groups.

# Random sample of 60% of the cases, with replacement
set.seed(111019)
car_insurance_sample_60_replacement <- sample_frac(
  car_insurance,
  size = 0.6,
  replace = TRUE
)

# Stratified sample of 60% of the cases of cars, according to the fuelType
# attribute: 60% is drawn inside each fuelType group, then the grouping is
# removed so the result behaves like an ordinary data set
set.seed(111019)
car_insurance_sample_60_stratified <- car_insurance %>%
  group_by(fuelType) %>%
  sample_frac(size = 0.6, replace = FALSE) %>%
  ungroup()

# View the two samples
head(car_insurance_sample_60_replacement)
head(car_insurance_sample_60_stratified)

### 4. Load the package corrplot and select the numeric attributes of the car insurance data set.

(a) Using the function cor(), obtain the pearson correlation coefficient between each pair of variables.

(b) Apply the function cor.mtest() to the previous result to calculate the p-values and confidence 
intervals of the correlation coefficient for each pair of variables.

(c) Plot all the correlation information using the function corrplot(). Explore some of its parameters.

In [None]:
# Load the corrplot package
library(corrplot)

In [None]:
# Select the numeric attributes
numeric_attributes <- select(car_insurance, where(is.numeric))

# Pearson correlation coefficient between each pair of variables
correlation_matrix <- cor(numeric_attributes, method = "pearson")

# Significance tests for each pair of variables. cor.mtest() needs the
# observations themselves (not the correlation matrix) and returns the
# elements `p`, `lowCI` and `uppCI`; it is run once and the pieces reused.
correlation_tests <- cor.mtest(numeric_attributes, conf.level = 0.95)
p_values <- correlation_tests$p
confidence_intervals <- list(
  lower = correlation_tests$lowCI,
  upper = correlation_tests$uppCI
)

# Plot the correlation matrix, blanking the cells whose correlation is not
# significant at the 5% level
corrplot(
  correlation_matrix,
  method = "circle",
  p.mat = p_values,
  sig.level = 0.05,
  insig = "blank"
)

### 5. Load the data set USJudgeRatings, from the datasets package, containing lawyers’ ratings of state judges in the US Superior Court regarding a set of attributes.

In [None]:
# Attach the datasets package that ships the example data
library(datasets)

# Bring USJudgeRatings into the workspace, then preview and diagnose it
data("USJudgeRatings", package = "datasets")
head(x = USJudgeRatings)
USJudgeRatings %>%
  diagnose()

(a) Apply the function prcomp() to obtain the principal components. Inspect how each variable is 
obtained by the linear combination of each component.

In [None]:
# Principal component analysis of the judge ratings on standardized variables.
# The argument is spelled `scale.` in full: `scale = TRUE` only works through
# partial argument matching.
pc <- prcomp(USJudgeRatings, scale. = TRUE)

# Loadings: one column per principal component, one row per original variable
pc$rotation

(b) Load the package ggbiplot and plot the first two components with the function ggbiplot(). You can 
label each point with the judge's name by setting the labels parameter.

In [None]:
# Load the package stats, which provides biplot() for prcomp objects
library(stats)

# Biplot of the first two principal components.
# Base biplot() has no var.col / var.cex / point.size / alpha arguments; the
# colour and size of the observation labels and of the variable arrows are
# given as length-two `col` and `cex` vectors (observations first, variables
# second). Transparency for the observation labels comes from adjustcolor().
# NOTE(review): the exercise text asks for ggbiplot(), which this notebook
# does not load - confirm whether the base biplot is acceptable.
biplot(
  pc,
  choices = 1:2,
  expand = 3,
  col = c(adjustcolor("black", alpha.f = 0.5), "blue"),
  cex = c(0.6, 0.7)
)