## Task 1

In [2]:
# Import the CSV file into R
used_cars <- read.csv("UsedCars.csv", header = TRUE, stringsAsFactors = FALSE)

# Run the linear regression
model <- lm(
  Price ~ Age + KM + HP + Metallic + Automatic + CC + Doors + Gears + Weight,
  data = used_cars
)

# Display the regression summary
summary(model)

# Calculate fitted values and residuals
fitted_values <- fitted(model)
residuals_values <- residuals(model)

# Combines y values, fitted values, and residuals 10x
results <- data.frame(
  Original_Y = used_cars$Price[1:10],
  Fitted_Y = fitted_values[1:10],
  Residuals = residuals_values[1:10]
)

# Display the results
print(results)

# Verify residuals calculation
verification <- results$Original_Y - results$Fitted_Y
all.equal(verification, results$Residuals)
# Should return TRUE if residuals are correct


Call:
lm(formula = Price ~ Age + KM + HP + Metallic + Automatic + CC + 
    Doors + Gears + Weight, data = used_cars)

Residuals:
    Min      1Q  Median      3Q     Max 
-8117.5  -771.5   -50.0   701.0  6403.5 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.883e+03  1.505e+03  -1.916 0.055649 .  
Age         -1.299e+02  2.610e+00 -49.782  < 2e-16 ***
KM          -1.465e-02  1.441e-03 -10.162  < 2e-16 ***
HP           2.540e+01  3.376e+00   7.524 1.01e-13 ***
Metallic    -2.281e+01  7.436e+01  -0.307 0.759059    
Automatic    4.991e+02  1.497e+02   3.334 0.000881 ***
CC          -5.259e-03  8.499e-02  -0.062 0.950667    
Doors        1.315e+01  4.086e+01   0.322 0.747638    
Gears        6.281e+02  1.927e+02   3.259 0.001148 ** 
Weight       1.504e+01  1.300e+00  11.567  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1227 on 1254 degrees of freedom
Multiple R-squared:  0.8649,	Adjusted R-sq

   Original_Y Fitted_Y Residuals
1       21500 19186.61 2313.3889
2       20950 19902.38 1047.6167
3       19950 20114.27 -164.2689
4       19600 19891.72 -291.7201
5       21500 19223.83 2276.1696
6       22500 18930.85 3569.1535
7       22000 19698.89 2301.1095
8       22750 18564.51 4185.4929
9       17950 16239.52 1710.4842
10      16750 15604.50 1145.5044


## Task 2

In [7]:
# Extract coefficients and standard errors
coefficients <- coef(summary(model))
beta_hat <- coefficients[, "Estimate"]
se_beta_hat <- coefficients[, "Std. Error"]

# Calculate t-statistics manually
t_statistics_manual <- beta_hat / se_beta_hat

# Compare with t-statistics from summary()
t_statistics_summary <- coefficients[, "t value"]
all.equal(t_statistics_manual, t_statistics_summary)

# Display the t-statistics
data.frame(
  Beta_Hat = beta_hat,
  SE_Beta_Hat = se_beta_hat,
  T_Statistics_Manual = t_statistics_manual,
  T_Statistics_Summary = t_statistics_summary
)

# Get degrees of freedom
df <- model$df.residual

# Calculate critical value
critical_value <- qt(0.975, df)
# 0.975 for two-tailed test at 95% confidence
critical_value

# Calculate p-values manually
p_values_manual <- 2 * pt(-abs(t_statistics_manual), df)

# Compare with p-values from summary()
p_values_summary <- coefficients[, "Pr(>|t|)"]
all.equal(p_values_manual, p_values_summary)
# Should return TRUE

# Display the p-values
data.frame(
  T_Statistics = t_statistics_manual,
  P_Values_Manual = p_values_manual,
  P_Values_Summary = p_values_summary
)

# Identify significant variables
significant_variables <-
  abs(t_statistics_manual) > critical_value | p_values_manual < 0.05

# Display significant variables
data.frame(
  Variable = rownames(coefficients),
  T_Statistics = t_statistics_manual,
  P_Values = p_values_manual,
  Significant = significant_variables
)

Unnamed: 0_level_0,Beta_Hat,SE_Beta_Hat,T_Statistics_Manual,T_Statistics_Summary
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),-2882.537,1504.804,-1.91555684,-1.91555684
Age,-129.9451,2.610276,-49.7821352,-49.7821352
KM,-0.01464727,0.001441421,-10.16168388,-10.16168388
HP,25.39843,3.375622,7.52407315,7.52407315
Metallic,-22.81278,74.36117,-0.30678348,-0.30678348
Automatic,499.058,149.6871,3.33400809,3.33400809
CC,-0.005259189,0.08498812,-0.06188146,-0.06188146
Doors,13.15082,40.86314,0.32182598,0.32182598
Gears,628.1485,192.7432,3.25899198,3.25899198
Weight,15.03712,1.300029,11.56676201,11.56676201


Unnamed: 0_level_0,T_Statistics,P_Values_Manual,P_Values_Summary
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
(Intercept),-1.91555684,0.05564875,0.05564875
Age,-49.7821352,2.8025330000000003e-299,2.8025330000000003e-299
KM,-10.16168388,2.2952360000000002e-23,2.2952360000000002e-23
HP,7.52407315,1.0092e-13,1.0092e-13
Metallic,-0.30678348,0.7590591,0.7590591
Automatic,3.33400809,0.0008810432,0.0008810432
CC,-0.06188146,0.9506671,0.9506671
Doors,0.32182598,0.7476382,0.7476382
Gears,3.25899198,0.001148079,0.001148079
Weight,11.56676201,1.790759e-29,1.790759e-29


Unnamed: 0_level_0,Variable,T_Statistics,P_Values,Significant
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<lgl>
(Intercept),(Intercept),-1.91555684,0.05564875,False
Age,Age,-49.7821352,2.8025330000000003e-299,True
KM,KM,-10.16168388,2.2952360000000002e-23,True
HP,HP,7.52407315,1.0092e-13,True
Metallic,Metallic,-0.30678348,0.7590591,False
Automatic,Automatic,3.33400809,0.0008810432,True
CC,CC,-0.06188146,0.9506671,False
Doors,Doors,0.32182598,0.7476382,False
Gears,Gears,3.25899198,0.001148079,True
Weight,Weight,11.56676201,1.790759e-29,True


## Task 3

In [9]:
# Calculate R-squared manually
ssr <- sum((fitted(model) - mean(used_cars$Price))^2)
# Regression sum of squares
sst <- sum((used_cars$Price - mean(used_cars$Price))^2)
# Total sum of squares
r_squared_manual <- ssr / sst

# Retrieve R-squared from the summary output
r_squared_summary <- summary(model)$r.squared

# Compare the two R-squared values
all.equal(r_squared_manual, r_squared_summary)

# Display the R-squared values
data.frame(
  r_squared_manual = r_squared_manual,
  r_squared_summary = r_squared_summary
)

# Install and load the car package
install.packages("car")
library(car)

# Calculate VIF for all independent variables
vif_values <- vif(model)

# Display the VIF values
print(vif_values)

# Discuss multicollinearity
# A VIF > 10 indicates high multicollinearity.

# Regress Weight on all other independent variables
weight_model <- lm(
  Weight ~ Age + KM + HP + Metallic + Automatic + CC + Doors + Gears,
  data = used_cars
)

# Retrieve R-squared for the Weight regression
r_squared_weight <- summary(weight_model)$r.squared

# Calculate VIF for Weight
vif_weight <- 1 / (1 - r_squared_weight)

# Display the VIF value for Weight
print(vif_weight)

r_squared_manual,r_squared_summary
<dbl>,<dbl>
0.8648677,0.8648677



The downloaded binary packages are in
	/var/folders/c6/c9_qxvf55ksch5f2x5v7c3980000gn/T//RtmpzyHXfb/downloaded_packages
      Age        KM        HP  Metallic Automatic        CC     Doors     Gears 
 1.914772  1.586299  1.548521  1.017014  1.101810  1.104307  1.269749  1.129618 
   Weight 
 2.096541 
[1] 2.096541


## Task 4

In [10]:
# Run a new linear regression with only significant variables
new_model <- lm(
  Price ~ Age + KM + HP + Weight,  # Replace with your significant variables
  data = used_cars
)

# Display the regression summary
summary(new_model)

# Retrieve R-squared and Adjusted R-squared for the full model
r_squared_full <- summary(model)$r.squared
adj_r_squared_full <- summary(model)$adj.r.squared

# Retrieve R-squared and Adjusted R-squared for the new model
r_squared_new <- summary(new_model)$r.squared
adj_r_squared_new <- summary(new_model)$adj.r.squared

# Compare the values
comparison <- data.frame(
  Model = c("Full Model", "New Model"),
  R_Squared = c(r_squared_full, r_squared_new),
  Adjusted_R_Squared = c(adj_r_squared_full, adj_r_squared_new)
)

# Display the comparison
print(comparison)

# Retrieve coefficients from the new model
coefficients <- coef(summary(new_model))

# Interpret the effects
age_effect <- coefficients["Age", "Estimate"]  # Effect of Age
km_effect <- coefficients["KM", "Estimate"]    # Effect of KM

# Display the effects
cat("Effect of Age: For every additional month, the price decreases by", age_effect, "Euros.\n")
cat("Effect of KM: For every additional kilometer, the price decreases by", km_effect, "Euros.\n")

# Convert effects for specific scenarios
age_effect_year <- age_effect * 12
# Effect of one additional year
km_effect_10k <- km_effect * 10000
# Effect of 10,000 additional kilometers

cat("Effect of Age (1 year): For every additional year, the price decreases by", age_effect_year, "Euros.\n")
cat("Effect of KM (10,000 km): For every additional 10,000 km, the price decreases by", km_effect_10k, "Euros.\n")


Call:
lm(formula = Price ~ Age + KM + HP + Weight, data = used_cars)

Residuals:
    Min      1Q  Median      3Q     Max 
-8682.6  -768.3   -53.2   716.0  6335.4 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -8.257e+02  1.184e+03  -0.697    0.486    
Age         -1.287e+02  2.590e+00 -49.680  < 2e-16 ***
KM          -1.484e-02  1.442e-03 -10.291  < 2e-16 ***
HP           2.616e+01  3.200e+00   8.175 7.13e-16 ***
Weight       1.600e+01  1.176e+00  13.608  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1235 on 1259 degrees of freedom
Multiple R-squared:  0.8626,	Adjusted R-squared:  0.8622 
F-statistic:  1976 on 4 and 1259 DF,  p-value: < 2.2e-16


       Model R_Squared Adjusted_R_Squared
1 Full Model 0.8648677          0.8638979
2  New Model 0.8626271          0.8621907
Effect of Age: For every additional month, the price decreases by -128.6562 Euros.
Effect of KM: For every additional kilometer, the price decreases by -0.01484368 Euros.
Effect of Age (1 year): For every additional year, the price decreases by -1543.874 Euros.
Effect of KM (10,000 km): For every additional 10,000 km, the price decreases by -148.4368 Euros.


## Homework Questions

In [None]:
# Retrieve the 7th fitted value
fitted_value_7 <- fitted_values[7]

# Retrieve the 7th residual
residual_7 <- residuals_values[7]

# Display the results
cat("The 7th fitted value (ŷ) is:", fitted_value_7, "\n")
cat("The 7th residual is:", residual_7, "\n")


The 7th fitted value (<U+0177>) is: 19698.89 
The 7th residual is: 2301.109 


In [5]:
# Extract the coefficient matrix from the regression summary
coefficients <- coef(summary(model))

# Calculate the t-statistics
t_statistics <- coefficients[, "Estimate"] / coefficients[, "Std. Error"]

# Display the t-statistics
print(t_statistics)

 (Intercept)          Age           KM           HP     Metallic    Automatic 
 -1.91555684 -49.78213520 -10.16168388   7.52407315  -0.30678348   3.33400809 
          CC        Doors        Gears       Weight 
 -0.06188146   0.32182598   3.25899198  11.56676201 


In [6]:
# Extract the coefficient matrix from the regression summary
coefficients <- coef(summary(model))

# Retrieve the t-statistic for HP
t_stat_hp <- coefficients["HP", "t value"]

# Display the t-statistic for HP
cat("The t-statistic for HP is:", t_stat_hp, "\n")

The t-statistic for HP is: 7.524073 


In [7]:
# Retrieve the degrees of freedom from the model
degrees_of_freedom <- model$df.residual

# Display the degrees of freedom
cat("The degrees of freedom (df) is:", degrees_of_freedom, "\n")

The degrees of freedom (df) is: 1254 


In [8]:
# Retrieve degrees of freedom
degrees_of_freedom <- model$df.residual

# Calculate the critical value for a 95% confidence level
critical_value <- qt(0.975, degrees_of_freedom)

# Display the critical value
cat("The critical value is:", critical_value, "\n")

The critical value is: 1.961858 


In [9]:
# Extract the coefficient matrix from the regression summary
coefficients <- coef(summary(model))

# Retrieve the p-value for CC
p_value_cc <- coefficients["CC", "Pr(>|t|)"]

# Display the p-value for CC
cat("The p-value for CC is:", p_value_cc, "\n")

The p-value for CC is: 0.9506671 


In [10]:
# Extract the coefficient matrix from the regression summary
coefficients <- coef(summary(model))

# Retrieve p-values
p_values <- coefficients[, "Pr(>|t|)"]

# Identify statistically significant variables (p-value < 0.05)
significant_variables <- rownames(coefficients)[p_values < 0.05]

# Display significant variables
cat("Statistically significant variables are:", significant_variables, "\n")

Statistically significant variables are: Age KM HP Automatic Gears Weight 


In [18]:
# Extract the coefficient matrix from the regression summary
coefficients <- coef(summary(model))

# Retrieve p-values
p_values <- coefficients[, "Pr(>|t|)"]

# Identify variables that are NOT statistically significant (p-value >= 0.05)
non_significant_variables <- rownames(coefficients)[p_values >= 0.05]

significant_variables <- rownames(coefficients)[p_values < 0.05]
# Display significant variables
cat("Statistically significant variables are:", significant_variables, "\n")

# Display non-significant variables
cat("Variables that are NOT statistically significant at 95% confidence level are:",
    non_significant_variables, "\n")

Statistically significant variables are: Age KM HP Automatic Gears Weight 
Variables that are NOT statistically significant at 95% confidence level are: (Intercept) Metallic CC Doors 


In [13]:
# Create the linear regression model
lm_res <- lm(
  Price ~ Age + KM + HP + Metallic + Automatic + CC + Doors + Gears + Weight,
  data = used_cars
)

# Calculate RSS (Residual Sum of Squares)
rss <- sum(resid(lm_res)^2)

# Calculate TSS (Total Sum of Squares)
tss <- sum((lm_res$model$Price - mean(lm_res$model$Price))^2)

# Calculate R-squared
r_squared <- 1 - (rss / tss)

# Display the R-squared value
cat("The calculated R-squared is:", r_squared, "\n")

The calculated R-squared is: 0.8648677 


In [14]:
# Install and load the car package if not already installed
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# Calculate VIF for all independent variables
vif_values <- vif(lm_res)

# Retrieve the VIF for the KM variable
vif_km <- vif_values["KM"]

# Display the VIF for KM
cat("The VIF for KM is:", vif_km, "\n")

Loading required package: carData



The VIF for KM is: 1.586299 


In [15]:
# Regress Weight on all other independent variables
weight_model <- lm(
  Weight ~ Age + KM + HP + Metallic + Automatic + CC + Doors + Gears,
  data = used_cars
)

# Retrieve R-squared for the Weight regression
r_squared_weight <- summary(weight_model)$r.squared

# Display the R-squared value
cat("The R-squared value for Weight is:", r_squared_weight, "\n")

The R-squared value for Weight is: 0.5230239 


In [20]:
# Run the new regression model with only significant variables
new_model <- lm(
  Price ~ Age + KM + HP + Automatic + Gears + Weight,
  # Include significant variables
  data = used_cars
)

# Retrieve the estimated coefficient for Gears
beta_hat_gears <- coef(new_model)["Gears"]

# Display the coefficient in scientific notation
cat(
  "The estimated coefficient (β̂) of Gears is:",
  formatC(beta_hat_gears, format = "e", digits = 3),
  "\n"
)

The estimated coefficient (<U+03B2><U+0302>) of Gears is: 6.139e+02 


In [21]:
# Retrieve Adjusted R-Squared for the simplified regression model
adjusted_r_squared <- summary(new_model)$adj.r.squared

# Display the Adjusted R-Squared value
cat(
  "The Adjusted R-Squared of the simplified model is:",
  adjusted_r_squared,
  "\n"
)

The Adjusted R-Squared of the simplified model is: 0.8642022 


In [23]:
# Retrieve R-Squared and Adjusted R-Squared for both models
r_squared_full <- summary(lm_res)$r.squared
adj_r_squared_full <- summary(lm_res)$adj.r.squared

r_squared_simplified <- summary(new_model)$r.squared
adj_r_squared_simplified <- summary(new_model)$adj.r.squared

# Display the comparison
cat(
  "Full Model - R-Squared:", r_squared_full, ", Adjusted R-Squared:", adj_r_squared_full, "\n", # nolint
  "Simplified Model - R-Squared:", r_squared_simplified, ", Adjusted R-Squared:", adj_r_squared_simplified, "\n" # nolint
)

Full Model - R-Squared: 0.8648677 , Adjusted R-Squared: 0.8638979 
 Simplified Model - R-Squared: 0.8648474 , Adjusted R-Squared: 0.8642022 


In [25]:
# Retrieve the estimated coefficient for Age
beta_hat_age <- coef(new_model)["Age"]

# Calculate the price change for one year (12 months)
price_change_one_year <- beta_hat_age * 12

# Install and load the scales package if not already installed
if (!requireNamespace("scales", quietly = TRUE)) {
  install.packages("scales")
}
library(scales)

# Format the price change as Euro currency
euro_format <- dollar_format(prefix = "€", big.mark = ",", decimal.mark = ".")

# Display the result in Euro format
cat(
  "Holding everything else equal, the sales price would change by:",
  euro_format(price_change_one_year),
  "if a car were one year older.\n"
)

Holding everything else equal, the sales price would change by: -<U+20AC>1,558.46 if a car were one year older.


In [26]:
# Check the current encoding
cat("Current encoding:", Sys.getlocale("LC_CTYPE"), "\n")

# Set the encoding to UTF-8
Sys.setlocale("LC_CTYPE", "en_US.UTF-8")

Current encoding: C 


In [28]:
# Retrieve the estimated coefficient for KM
beta_hat_km <- coef(new_model)["KM"]

# Calculate the price change for 10,000 additional kilometers
price_change_10k_km <- beta_hat_km * 10000

# Display the result
cat(
  "Holding everything else equal, the sales price would change by:€",
  price_change_10k_km, "if a car accumulated 10,000 more kilometers.\n"
)

Holding everything else equal, the sales price would change by:€ -146.3109 if a car accumulated 10,000 more kilometers.
