# Install packages

In [None]:
# Every package this notebook attaches. reshape2 and the two ROI solver
# plugins are attached in the "Load libraries" cell, so they are installed
# here as well.
packages <- c("tidyr", "fpp", "dplyr", "fBasics", "ggplot2", "corrplot",
              "plotly", "ggalt", "rpart", "rattle", "rpart.plot",
              "RColorBrewer", "party", "MASS", "Hmisc", "splus2R",
              "PortfolioAnalytics", "PerformanceAnalytics", "zoo",
              "DEoptim", "ROI", "tidyverse", "tidyquant",
              "ROI.plugin.glpk", "ROI.plugin.quadprog", "reshape2")

# Install only what is absent so that re-running the cell does not download
# and rebuild packages that are already available.
missing_packages <- setdiff(packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Load libraries

In [None]:
library(tidyr)
library(fpp)
library(dplyr)

library(fBasics)

library(ggplot2)
library(corrplot)
library(plotly)
library(ggalt)

library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
library(party)
library(MASS)
library(Hmisc)
library(splus2R)

library(PortfolioAnalytics)
library(PerformanceAnalytics)
library(zoo)
library(DEoptim)
library(tidyverse)
library(tidyquant)
library(ROI)

# ROI solver plugins. Nothing in this part of the notebook calls them
# directly, so they stay best-effort: require() only warns when one is absent.
# NOTE(review): switch to library() if later cells depend on these solvers.
require(ROI.plugin.glpk)
require(ROI.plugin.quadprog)
# melt() used in the data-analysis cells is expected to come from reshape2,
# so fail right here (instead of at the first melt() call) if it is missing.
library(reshape2)

# Loading dataset

In [None]:
# Read both fund tables. "N/A" and empty cells are treated as missing values.
# The argument is spelled out as na.strings instead of relying on partial
# matching of the abbreviated name `na`.
ETF_data <- read.csv(
  "../input/mutual-funds-and-etfs/ETFs.csv",
  sep = ",",
  header = TRUE,
  na.strings = c("N/A", "")
)
MUTUAL_data <- read.csv(
  "../input/mutual-funds-and-etfs/Mutual Funds.csv",
  sep = ",",
  header = TRUE,
  na.strings = c("N/A", "")
)

# Cleaning dataset
**Converting price_earnings to numerical column**

In [None]:
# Convert price_earnings to numeric. If read.csv() produced a factor (the
# default for text columns before R 4.0), as.numeric() would return the
# internal level codes rather than the values, so go through character first.
if (is.factor(ETF_data$price_earnings)) {
  ETF_data$price_earnings <- as.character(ETF_data$price_earnings)
}
ETF_data$price_earnings <- as.numeric(ETF_data$price_earnings)

if (is.factor(MUTUAL_data$price_earnings)) {
  MUTUAL_data$price_earnings <- as.character(MUTUAL_data$price_earnings)
}
MUTUAL_data$price_earnings <- as.numeric(MUTUAL_data$price_earnings)

### ETF clean
#### Discard unused columns in further analysis

In [None]:
# Keep every ETF column except the ones listed here. Listed names that are not
# present in the data are simply ignored.
ETF_data <- ETF_data[setdiff(
  names(ETF_data),
  c(
    # Individually named columns
    "fund_symbol", "fund_name", "fund_extended_name", "category",
    "fund_family", "currency", "legal_type", "rating_us_government",
    "returns_ytd", "alpha_3y", "beta_3y", "treynor_ratio_3y",
    "sharpe_ratio_3y", "standard_deviation_3y", "mean_annual_return_3y",
    "net_assets", "ytd_return", "fund_yield", "inception_date",
    "net_annual_expense_ratio_fund", "net_annual_expense_ratio_category",
    "price_book", "price_sales", "price_cashflow",
    # fund_return_<period> / category_return_<period>
    paste0(
      c("fund", "category"),
      "_return_",
      rep(
        c("ytd", "1month", "3months", "1year", "3years", "5years", "10years"),
        each = 2
      )
    ),
    # fund_<statistic>_<horizon> / category_<statistic>_<horizon>
    paste0(
      c("fund_", "category_"),
      rep(
        as.vector(outer(
          c("alpha", "beta", "mean_annual_return", "r_squared",
            "standard_deviation", "sharpe_ratio", "treynor_ratio"),
          c("_3years", "_5years", "_10years"),
          paste0
        )),
        each = 2
      )
    )
  )
)]

### MF clean
#### Discard unused columns in further analysis

In [None]:
# Keep every mutual-fund column except the ones listed here. Listed names that
# are not present in the data are simply ignored.
MUTUAL_data <- MUTUAL_data[setdiff(
  names(MUTUAL_data),
  c(
    # Individually named columns
    "fund_symbol", "fund_name", "fund_extended_name", "category",
    "fund_family", "currency", "rating_us_government",
    "returns_ytd", "alpha_3y", "beta_3y", "treynor_ratio_3y",
    "sharpe_ratio_3y", "standard_deviation_3y", "mean_annual_return_3y",
    "net_assets", "ytd_return", "fund_yield", "inception_date",
    "net_annual_expense_ratio_fund", "net_annual_expense_ratio_category",
    "price_book", "price_sales", "price_cashflow",
    # fund_return_<period> / category_return_<period>
    paste0(
      c("fund", "category"),
      "_return_",
      rep(
        c("ytd", "1month", "3months", "1year", "3years", "5years", "10years"),
        each = 2
      )
    ),
    # fund_<statistic>_<horizon> / category_<statistic>_<horizon>
    paste0(
      c("fund_", "category_"),
      rep(
        as.vector(outer(
          c("alpha", "beta", "mean_annual_return", "r_squared",
            "standard_deviation", "sharpe_ratio", "treynor_ratio"),
          c("_3years", "_5years", "_10years"),
          paste0
        )),
        each = 2
      )
    )
  )
)]

#### Removing MF attributes that don't exist in ETFs

In [None]:
# Drop the remaining mutual-fund columns listed below; names that are not
# present in the data are ignored.
MUTUAL_data <- MUTUAL_data[setdiff(
  names(MUTUAL_data),
  c(
    "morningstar_rating", "morningstar_return_rating",
    "morningstar_risk_rating",
    "inception_date",
    "portfolio_cash", "portfolio_others", "portfolio_preferred",
    "portfolio_convertable",
    "median_market_cap", "bond_maturity", "bond_duration",
    # category_return_2010 ... category_return_2018
    paste0("category_return_", 2010:2018),
    "years_up", "years_down"
  )
)]

### Removing missing data

In [None]:
# Keep only the rows that have no missing value in any column, then rebuild a
# plain data frame from the surviving rows.
ETF_data <- data.frame(
  ETF_data[complete.cases(ETF_data), , drop = FALSE]
)
MUTUAL_data <- data.frame(
  MUTUAL_data[complete.cases(MUTUAL_data), , drop = FALSE]
)

### Combine both data frames in a single one for easier analysis

In [None]:
# Tag each table with its origin, stack them row-wise, and release the two
# source tables once the combined frame exists.
ETF_data[["fund_type"]] <- "ETF"
MUTUAL_data[["fund_type"]] <- "MF"

BOTH_data <- rbind(ETF_data, MUTUAL_data)

rm(ETF_data, MUTUAL_data)

# Dataset normalization
### Normalizing portfolio weights from percentages to fractions in the range [0, 1]

In [None]:
# Divide the bond and stock portfolio columns by 100.
BOTH_data <- local({
  share_cols <- c("portfolio_bonds", "portfolio_stocks")
  rescaled <- BOTH_data
  rescaled[share_cols] <- lapply(rescaled[share_cols], function(pct) pct / 100)
  rescaled
})

### Normalizing yearly returns from percentages to decimal fractions

In [None]:
# Divide the yearly return columns fund_return_2010 ... fund_return_2018 by 100.
BOTH_data <- local({
  return_cols <- paste0("fund_return_", 2010:2018)
  rescaled <- BOTH_data
  rescaled[return_cols] <- lapply(rescaled[return_cols], function(pct) pct / 100)
  rescaled
})

### Normalizing sector allocations from percentages to fractions in the range [0, 1]

In [None]:
# Divide every sector allocation column by 100.
BOTH_data <- local({
  sector_cols <- c(
    "basic_materials", "consumer_cyclical", "financial_services",
    "real_estate", "consumer_defensive", "healthcare", "utilities",
    "communication_services", "energy", "industrials", "technology"
  )
  rescaled <- BOTH_data
  rescaled[sector_cols] <- lapply(rescaled[sector_cols], function(pct) pct / 100)
  rescaled
})

### Normalizing rating weights from percentages to fractions in the range [0, 1]

In [None]:
# Divide every rating column by 100.
BOTH_data <- local({
  rating_cols <- paste0(
    "rating_",
    c("aaa", "aa", "a", "bbb", "bb", "b", "below_b", "others")
  )
  rescaled <- BOTH_data
  rescaled[rating_cols] <- lapply(rescaled[rating_cols], function(pct) pct / 100)
  rescaled
})

# Data Analysis

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# investment and size hold category labels. With R >= 4.0 read.csv() keeps
# them as character vectors, and plot() on two character vectors tries to
# coerce the labels to numbers and fails. Converting to factors makes the call
# dispatch to the factor plot method on every R version.
plot(
  factor(BOTH_data$investment),
  factor(BOTH_data$size),
  xlab = "investment",
  ylab = "size",
  col = rainbow(11, start = 0.1, end = 0.3)
)

In [None]:
# Bar chart: number of funds per investment value.
ggplot(data = BOTH_data, mapping = aes(x = investment)) +
  geom_bar(fill = "#0073C2FF")

In [None]:
# Bar chart: number of funds per size value.
ggplot(data = BOTH_data, mapping = aes(x = size)) +
  geom_bar(fill = "#0073C2FF")

In [None]:
# Larger canvas: the figure holds one density panel per melted variable.
options(repr.plot.width = 25, repr.plot.height = 20)

# Long format with one row per (fund, variable) pair; no id variables are
# given, so melt() picks them itself.
melt.both <- melt(BOTH_data)

# Density of every variable, split and coloured by investment.
ggplot(melt.both, aes(x = value, colour = investment, fill = investment)) +
  facet_wrap(~variable, scales = "free") +
  stat_density() +
  scale_fill_brewer(palette = "Pastel1")

In [None]:
# Larger canvas: the figure holds one density panel per melted variable.
options(repr.plot.width = 25, repr.plot.height = 20)

# Long format with one row per (fund, variable) pair; no id variables are
# given, so melt() picks them itself.
melt.both <- melt(BOTH_data)

# Density of every variable, split and coloured by size.
ggplot(melt.both, aes(x = value, colour = size, fill = size)) +
  facet_wrap(~variable, scales = "free") +
  stat_density() +
  scale_fill_brewer(palette = "Pastel1")

In [None]:
# Larger canvas: the figure holds one density panel per melted variable.
options(repr.plot.width = 25, repr.plot.height = 20)

# Long format with one row per (fund, variable) pair; no id variables are
# given, so melt() picks them itself.
melt.both <- melt(BOTH_data)

# Density of every variable, split and coloured by fund_type (ETF vs MF).
ggplot(melt.both, aes(x = value, colour = fund_type, fill = fund_type)) +
  facet_wrap(~variable, scales = "free") +
  stat_density() +
  scale_fill_brewer(palette = "Pastel1")

In [None]:
# Count the funds for every investment / size combination. The counts are
# ordered investment by investment (Growth, Value, Blend) and, within each,
# Small, Medium, Large - the same order as the slice labels below.
circular_values <- unlist(lapply(
  c("Growth", "Value", "Blend"),
  function(strategy) {
    vapply(
      c("Small", "Medium", "Large"),
      function(cap) {
        sum(BOTH_data$investment == strategy & BOTH_data$size == cap)
      },
      integer(1),
      USE.NAMES = FALSE
    )
  }
))

# One pie slice per combination.
pie(
  x = circular_values,
  labels = c(
    "Growth - Small", "Growth - Medium", "Growth - Large",
    "Value - Small", "Value - Medium", "Value - Large",
    "Blend - Small", "Blend - Medium", "Blend - Large"
  ),
  radius = 1,
  col = rainbow(11, start = 0.1, end = 0.9),
  main = "Portfolios by investment strategies and the size of capitalization"
)

#### Shows the overall distribution of how much each fund invests in stocks.

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# Histogram of the stock share across funds. Each row of BOTH_data is one
# fund, so the bar heights count funds (the previous label said "stocks").
hist(
  BOTH_data$portfolio_stocks,
  xlab = "The ratio of stocks in the portfolio",
  ylab = "Number of funds",
  main = "Distribution of stocks in a portfolio",
  col = rainbow(11, start = 0.1, end = 0.3),
  breaks = 10
)

#### Shows the overall distribution of how much each fund invests in bonds.

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# Histogram of the bond share across funds. Each row of BOTH_data is one
# fund, so the bar heights count funds (the previous label said "bonds").
hist(
  BOTH_data$portfolio_bonds,
  xlab = "The ratio of bonds in the portfolio",
  ylab = "Number of funds",
  main = "Distribution of bond ratios in the portfolio",
  col = rainbow(11, start = 0.3, end = 0.5),
  breaks = 10
)

#### Distribution of other fund assets (besides stocks and bonds).

In [None]:
# Share of the portfolio that is neither stocks nor bonds. Assigning the
# column by name (instead of cbind) keeps the cell idempotent: re-running it
# overwrites other_assets rather than appending a second column with the same
# name.
BOTH_data$other_assets <-
  1 - (BOTH_data$portfolio_stocks + BOTH_data$portfolio_bonds)

#### Shows the overall distribution of how much each fund invests in other assets besides stocks and bonds.

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# Histogram of the non-stock, non-bond share across funds. Each row of
# BOTH_data is one fund, so the bar heights count funds (the previous label
# said "assets").
hist(
  BOTH_data$other_assets,
  xlab = "Ratio of other assets in the portfolio",
  ylab = "Number of funds",
  main = "Distribution of relations between other assets in the portfolio",
  col = rainbow(11, start = 0.5, end = 0.7),
  breaks = 10
)

#### Distribution of sector weights across portfolios, showing how much funds invest in each sector.

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# Widen the left margin for the horizontal sector labels. par() returns the
# previous settings, which are restored once the plot has been drawn.
margines <- par(mar = c(5, 15, 3, 3))
boxplot(
  BOTH_data$basic_materials,
  BOTH_data$consumer_cyclical,
  BOTH_data$financial_services,
  BOTH_data$real_estate,
  BOTH_data$consumer_defensive,
  BOTH_data$healthcare,
  BOTH_data$utilities,
  BOTH_data$communication_services,
  BOTH_data$energy,
  BOTH_data$industrials,
  BOTH_data$technology,
  xlab = "The ratio of the sector to shares in the portfolio",
  main = "Distribution of sectors to shares in the portfolio",
  # Labels follow the order of the columns above ("Realestate" typo fixed).
  names = c(
    "Basic materials", "Consumer cyclical", "Financial services",
    "Real estate", "Consumer defensive", "Healthcare", "Utilities",
    "Communication services", "Energy", "Industrials", "Technology"
  ),
  col = rainbow(11, start = 0.7, end = 1),
  ylim = c(-0.01, 1.0),
  las = 1,
  horizontal = TRUE
)
# Put back the margins that were active before this plot.
par(margines)

#### The distribution shows the ratio of ratings for all investment funds.

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(5, 15, 3, 3))
boxplot(
  BOTH_data$rating_aaa,
  BOTH_data$rating_aa,
  BOTH_data$rating_a,
  BOTH_data$rating_bbb,
  BOTH_data$rating_bb,
  BOTH_data$rating_b,
  BOTH_data$rating_below_b,
  BOTH_data$rating_others,
  xlab = "The ratio of ratings in the portfolio",
  main = "Distribution of ratings in the portfolio",
  # Labels follow the order of the columns above ("Bellow B" typo fixed).
  names = c("AAA", "AA", "A", "BBB", "BB", "B", "Below B", "Others"),
  col = rainbow(11, start = 0.1, end = 0.4),
  ylim = c(-0.05, 1.1)
)
# Put back the margins that were active before this plot.
par(margines)

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(3, 3, 3, 3))
# Histogram of price_earnings across funds. Each row of BOTH_data is one
# fund, so the bar heights count funds (the previous label said "stocks").
hist(
  BOTH_data$price_earnings,
  xlab = "Price to profit",
  ylab = "Number of funds",
  main = "Price-to-profit distribution",
  col = rainbow(11, start = 0.4, end = 0.6),
  breaks = 10
)
# Put back the margins that were active before this plot.
par(margines)

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(3, 3, 3, 3))
boxplot(
  BOTH_data$fund_return_2010,
  BOTH_data$fund_return_2011,
  BOTH_data$fund_return_2012,
  BOTH_data$fund_return_2013,
  BOTH_data$fund_return_2014,
  BOTH_data$fund_return_2015,
  BOTH_data$fund_return_2016,
  BOTH_data$fund_return_2017,
  BOTH_data$fund_return_2018,
  xlab = "Year",
  # The return columns were divided by 100 in the normalization step, so the
  # axis shows fractions, not percentages.
  ylab = "Rate of return (fraction)",
  names = c("2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018"),
  main = "Return rates in 2010-2018",
  col = rainbow(11, start = 0.4, end = 0.8)
)
# Put back the margins that were active before this plot.
par(margines)

In [None]:
# Stand-alone frame holding only the rating columns, used as input for the
# rating correlation matrix. Column names are spelled out explicitly.
rating_corr <- data.frame(
  BOTH_data.rating_aaa = BOTH_data[["rating_aaa"]],
  BOTH_data.rating_aa = BOTH_data[["rating_aa"]],
  BOTH_data.rating_a = BOTH_data[["rating_a"]],
  BOTH_data.rating_bbb = BOTH_data[["rating_bbb"]],
  BOTH_data.rating_bb = BOTH_data[["rating_bb"]],
  BOTH_data.rating_b = BOTH_data[["rating_b"]],
  BOTH_data.rating_below_b = BOTH_data[["rating_below_b"]],
  BOTH_data.rating_others = BOTH_data[["rating_others"]]
)

In [None]:
# Short display names for the first eight columns of rating_corr.
names(rating_corr)[seq_len(8)] <- c(
  "AAA", "AA", "A", "BBB", "BB", "B", "Bellow B", "Others"
)

In [None]:
options(repr.plot.width = 17, repr.plot.height = 11)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(0, 0, 0, 0))
# Lower-triangle correlation heat map of the rating columns, with the
# coefficients printed in the cells.
corrplot(
  cor(rating_corr),
  method = "color",
  order = "original",
  col = rainbow(20, start = 0.2, end = 0.8),
  type = "lower",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 0,
  title = "Weight correlations of ratings",
  mar = c(0, 0, 5, 0)
)
# Put back the margins that were active before this plot.
par(margines)

In [None]:
# Stand-alone frame holding only the sector columns, used as input for the
# sector correlation matrix. Column names are spelled out explicitly.
industry_corr <- data.frame(
  BOTH_data.basic_materials = BOTH_data[["basic_materials"]],
  BOTH_data.consumer_cyclical = BOTH_data[["consumer_cyclical"]],
  BOTH_data.financial_services = BOTH_data[["financial_services"]],
  BOTH_data.real_estate = BOTH_data[["real_estate"]],
  BOTH_data.consumer_defensive = BOTH_data[["consumer_defensive"]],
  BOTH_data.healthcare = BOTH_data[["healthcare"]],
  BOTH_data.utilities = BOTH_data[["utilities"]],
  BOTH_data.communication_services = BOTH_data[["communication_services"]],
  BOTH_data.energy = BOTH_data[["energy"]],
  BOTH_data.industrials = BOTH_data[["industrials"]],
  BOTH_data.technology = BOTH_data[["technology"]]
)

In [None]:
# Display names for the first eleven columns of industry_corr.
names(industry_corr)[seq_len(11)] <- c(
  "Basic materials", "Consumer cyclical", "Financial services",
  "Realestate", "Consumer defensive", "Healthcare", "Utilities",
  "Communication services", "Energy", "Industrials", "Technology"
)

In [None]:
options(repr.plot.width = 17, repr.plot.height = 13)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(0, 0, 0, 0))
# Lower-triangle correlation heat map of the sector columns, with the
# coefficients printed in the cells.
corrplot(
  cor(industry_corr),
  method = "color",
  order = "original",
  col = rainbow(20, start = 0.2, end = 0.8),
  type = "lower",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 30,
  title = "Weight correlations of industry sectors",
  mar = c(0, 0, 5, 0)
)
# Put back the margins that were active before this plot.
par(margines)

In [None]:
# Rating columns, sector columns and the other_assets share side by side, as
# input for the combined correlation matrix.
combined_correlation <- data.frame(
  rating_corr,
  industry_corr,
  BOTH_data.other_assets = BOTH_data[["other_assets"]]
)

In [None]:
# other_assets is the last column added when combined_correlation is built,
# so address it by position from the end instead of the hard-coded index 20.
names(combined_correlation)[ncol(combined_correlation)] <- "Other assets"

In [None]:
options(repr.plot.width = 17, repr.plot.height = 13)
# par() returns the previous margin settings, which are restored once the
# plot has been drawn.
margines <- par(mar = c(0, 0, 0, 0))
# Lower-triangle correlation heat map over all columns of
# combined_correlation, with the coefficients printed in the cells.
corrplot(
  cor(combined_correlation),
  method = "color",
  order = "original",
  col = rainbow(20, start = 0.2, end = 0.8),
  type = "lower",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 30,
  title = "Weight correlations of all types of assets in the entire portfolio",
  mar = c(0, 0, 5, 0)
)
# Put back the margins that were active before this plot.
par(margines)

# Regression

### Train-Test split

In [None]:
# Reproducible 80/20 split of BOTH_data into training and test rows.
set.seed(100)
# seq_len() stays empty for an empty table (1:0 would yield c(1, 0)), and the
# sample size is floored explicitly instead of passing a fractional count.
trainingRowIndex <- sample(
  seq_len(nrow(BOTH_data)),
  floor(0.8 * nrow(BOTH_data))
)
trainingData <- BOTH_data[trainingRowIndex, ]
# Select the complement by position. Negative indexing with an empty index
# vector would select no rows at all instead of every row.
testData <- BOTH_data[setdiff(seq_len(nrow(BOTH_data)), trainingRowIndex), ]

### Model 1

In [None]:
# Model 1: regress the 2018 return on the 2017 return only.
lmMod <- lm(
  formula = fund_return_2018 ~ fund_return_2017,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 2

In [None]:
# Model 2: regress the 2018 return on the yearly returns of 2010-2017.
lmMod <- lm(
  formula = fund_return_2018 ~ fund_return_2017 + fund_return_2016 +
    fund_return_2015 + fund_return_2014 + fund_return_2013 +
    fund_return_2012 + fund_return_2011 + fund_return_2010,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 2.1

In [None]:
# Model 2.1: regress the 2018 return on the rating columns only.
lmMod <- lm(
  formula = fund_return_2018 ~ rating_aaa + rating_aa + rating_a +
    rating_bbb + rating_bb + rating_b + rating_below_b + rating_others,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 2.2

In [None]:
# Model 2.2: regress the 2018 return on the sector columns only.
lmMod <- lm(
  formula = fund_return_2018 ~ basic_materials + consumer_cyclical +
    financial_services + real_estate + consumer_defensive + healthcare +
    utilities + communication_services + energy + industrials + technology,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 2.3

In [None]:
# Model 2.3: regress the 2018 return on investment and size only.
lmMod <- lm(
  formula = fund_return_2018 ~ investment + size,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 2.4

In [None]:
# Model 2.4: regress the 2018 return on the stock share, the bond share and
# price_earnings.
lmMod <- lm(
  formula = fund_return_2018 ~ portfolio_stocks + portfolio_bonds +
    price_earnings,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 3

In [None]:
# Model 3: yearly returns of 2010-2017 plus the rating columns.
lmMod <- lm(
  formula = fund_return_2018 ~
    # yearly returns
    fund_return_2017 + fund_return_2016 + fund_return_2015 +
    fund_return_2014 + fund_return_2013 + fund_return_2012 +
    fund_return_2011 + fund_return_2010 +
    # ratings
    rating_aaa + rating_aa + rating_a + rating_bbb + rating_bb +
    rating_b + rating_below_b + rating_others,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 4

In [None]:
# Model 4: yearly returns of 2010-2017, the rating columns and the sector
# columns.
lmMod <- lm(
  formula = fund_return_2018 ~
    # yearly returns
    fund_return_2017 + fund_return_2016 + fund_return_2015 +
    fund_return_2014 + fund_return_2013 + fund_return_2012 +
    fund_return_2011 + fund_return_2010 +
    # ratings
    rating_aaa + rating_aa + rating_a + rating_bbb + rating_bb +
    rating_b + rating_below_b + rating_others +
    # sectors
    basic_materials + consumer_cyclical + financial_services +
    real_estate + consumer_defensive + healthcare + utilities +
    communication_services + energy + industrials + technology,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)

### Model 5

In [None]:
# Model 5: every predictor group used by the earlier models - yearly returns,
# ratings, sectors, investment/size and the portfolio composition columns.
lmMod <- lm(
  formula = fund_return_2018 ~
    # yearly returns
    fund_return_2017 + fund_return_2016 + fund_return_2015 +
    fund_return_2014 + fund_return_2013 + fund_return_2012 +
    fund_return_2011 + fund_return_2010 +
    # ratings
    rating_aaa + rating_aa + rating_a + rating_bbb + rating_bb +
    rating_b + rating_below_b + rating_others +
    # sectors
    basic_materials + consumer_cyclical + financial_services +
    real_estate + consumer_defensive + healthcare + utilities +
    communication_services + energy + industrials + technology +
    # categorical descriptors
    investment + size +
    # portfolio composition and valuation
    portfolio_stocks + portfolio_bonds + price_earnings,
  data = trainingData
)

# Predict the held-out rows and pair the predictions with the observed values.
fundReturnPrediction <- predict(lmMod, newdata = testData)
actuals_preds <- data.frame(
  actuals = testData$fund_return_2018,
  predicteds = fundReturnPrediction
)

# Correlation between observed and predicted returns.
correlation_accuracy <- cor(actuals_preds)
head(correlation_accuracy)