## 1. Problem Statement
* This project aims to predict inflation in the city of Manado using the support vector regression (SVR) method. The data used is the monthly inflation data for the city of Manado for the period January 2010 to December 2021.

## 2. Data Collection
* Data Source : https://sulut.bps.go.id/indicator/3/61/1/inflasi-kota-manado-bulanan-.html


## 3. Data Understanding and EDA

In [None]:
## Import the libraries that will be used
library(tidyverse)
library(caret)
library(MLmetrics)
library(e1071)

In [None]:
## Load the monthly inflation table.
## read_csv2() parses semicolon-separated files that use a comma as the decimal mark.
## col_types = "ccd": first two columns as character, third column as double.
inflation_path <- "/kaggle/input/inflasi/Inflasi Kota Manado.csv"
data <- read_csv2(inflation_path, col_types = "ccd")

### 3.1 Data Understanding

In [None]:
## Preview the first six rows of the raw data (six is head()'s default)
head(data, n = 6)

In [None]:
## Report the size of the data as (number of rows, number of columns)
c(nrow(data), ncol(data))

#### The data consists of 141 rows and 3 columns
* Bulan : month of inflation
* Tahun : Year of inflation
* Inflasi : Inflation value

In [None]:
## Inspect the class of every column
column_classes <- sapply(X = data, FUN = class)
column_classes

In [None]:
## Check for duplicated rows in the data.
## duplicated() returns one logical flag per row (TRUE for a row that repeats
## an earlier row); using that vector as the row index keeps exactly the
## duplicated rows. Indexing with sum(duplicated(data)) would instead select
## the single row whose position equals the duplicate count.
## An empty result means there are no duplicated rows.
duplicate_rows <- data[duplicated(data), ]
duplicate_rows

#### There is no duplicate value in the data

In [None]:
## Check for missing values in the data.
## missing_counts holds one entry per column of `data` (in column order) with
## the number of NA values found in that column.
missing_counts <- unname(colSums(is.na(data)))

# Report each column name together with its missing-value count.
# seq_along() keeps the loop empty (instead of iterating over c(1, 0))
# if the data ever has zero columns.
for (i in seq_along(missing_counts)) {
  cat("Column", colnames(data)[i], ": Total Missing Value ---->", missing_counts[i], "\n")
}

#### There is no missing value

In [None]:
## Descriptive statistics for every column of the data
summary(object = data)

#### From the descriptive statistics of the Inflasi column, we can see that the highest inflation is 3.96 and the lowest inflation is -2.1, along with other values such as the mean, median and quartiles

### 3.2 EDA

In [None]:
## Line-and-point plot of the inflation series in observation order
inflation_series <- data$Inflasi
plot(
  x = inflation_series,
  type = "o",
  col = "blue",
  main = "inflation plot for 2010 - 2021",
  xlab = "Index",
  ylab = "Inflation"
)

In [None]:
## Show the five observations with the highest inflation values
descending_idx <- order(-data$Inflasi)
head(data[descending_idx, ], n = 5)

#### From the plot, it can be seen that the inflation value rises and falls from month to month. The months in which the inflation value rises significantly coincide with religious holidays, when Muslims celebrate Eid al-Fitr or Christians celebrate Christmas, which causes the prices of goods and services to rise as well.

In [None]:
## Histogram of the inflation values
hist(
  x = data$Inflasi,
  col = "lightblue",
  border = "black",
  main = "Histogram of Inflation",
  xlab = "Values",
  ylab = "Frequency"
)

#### From the histogram, we can see that the data appears to be approximately normally distributed, with most of the values falling in the range between -1 and 1

In [None]:
## Box plot of the inflation values (used to spot outliers)
boxplot(x = data$Inflasi, col = "green")

#### From the box plot, we can see that the data contains some outliers

## 4. Feature Engineering

### Divide the data based on the PACF
What is the PACF?
PACF stands for "Partial AutoCorrelation Function". It is a statistical tool used in time series analysis and forecasting. The PACF measures the relationship between a data point and each of its lagged values while controlling for the influence of the other (shorter) lags.

In [None]:
## Plot the partial autocorrelation function of the inflation series
pacf(x = data$Inflasi, main = "PACF plot of inflation")

#### From the PACF plot, lag 2 and lag 3 are the significant lags (past values). This means that the current inflation value is influenced by the inflation value two periods before and the value three periods before

In [None]:
## Output feature: the inflation series from the 4th observation onward,
## so that every target has a value available three periods earlier.
n_obs <- length(data$Inflasi)
Y <- data$Inflasi[seq(from = 4, to = n_obs)]

In [None]:
## Input features: lagged copies of the inflation series.
## The target starts at observation 4, so for a target at position t:
##   lag_2 holds the value at t - 2 (observations 2 .. n - 2)
##   lag_3 holds the value at t - 3 (observations 1 .. n - 3)
## The end indices are derived from the series length instead of being
## hard-coded, so the lags always have n - 3 elements; for a 144-row series
## this is exactly 2:142 and 1:141.
n_obs <- length(data$Inflasi)
stopifnot(n_obs >= 4)
lag_2 <- data$Inflasi[2:(n_obs - 2)]
lag_3 <- data$Inflasi[1:(n_obs - 3)]

In [None]:
## Assemble the supervised-learning table: two lagged inputs and the target.
## Note: this overwrites `data` (previously the raw table read from the CSV).
data <- data.frame(
  lag_3 = lag_3,
  lag_2 = lag_2,
  Y = Y
)

In [None]:
## Preview the first six rows of the lagged data frame
head(data, n = 6)

### Split Data Into Train and Test

In [None]:
## Random 80/20 train/test split on the target column.
## The seed is fixed so that the partition is reproducible.
set.seed(123)
index <- createDataPartition(y = data$Y, p = 0.8, list = FALSE)

# Rows not selected by `index` form the test set; selected rows form the training set.
test_data <- data[-index, ]
train_data <- data[index, ]

In [None]:
## Separate the training predictors and the training target, then preview both
X_train <- train_data[c("lag_3", "lag_2")]
Y_train <- train_data[["Y"]]
head(X_train)
head(Y_train)

In [None]:
## Size (rows, columns) of the training set, then of the test set
c(nrow(train_data), ncol(train_data))
c(nrow(test_data), ncol(test_data))

### Feature Scaling

In [None]:
# Min-max scale the lag features.
# The minimum and range are computed from the TRAINING data only and then
# applied to both sets. Scaling the test set with its own min/max would put
# train and test on different scales and leak test-set statistics into the
# evaluation.
column_names <- c("lag_2", "lag_3")

for (col_name in column_names) {
  # Capture the training statistics before train_data is overwritten below.
  train_min <- min(train_data[, col_name])
  train_range <- max(train_data[, col_name]) - train_min
  if (train_range == 0) {
    stop("Cannot min-max scale constant column: ", col_name, call. = FALSE)
  }

  train_data[, col_name] <- (train_data[, col_name] - train_min) / train_range
  # Test values may fall slightly outside [0, 1]; that is expected.
  test_data[, col_name] <- (test_data[, col_name] - train_min) / train_range
}


In [None]:
## Preview the first six rows of the scaled training set
head(train_data, n = 6)

In [None]:
## Preview the first six rows of the scaled test set
head(test_data, n = 6)

## 5.Model Training

In [None]:
## Baseline SVR (epsilon-regression, radial kernel) with e1071's default
## hyperparameters:
##   cost    = 1
##   gamma   = 1 / (number of predictors) = 0.5 with the two lag features
##   epsilon = 0.1
feature_cols <- c("lag_3", "lag_2")

svr_model <- svm(
  formula = Y ~ .,
  data = train_data,
  type = "eps-regression",
  kernel = "radial"
)

## Predict on the training and test predictors with the baseline model
predictions_train <- predict(svr_model, newdata = train_data[feature_cols])
predictions_test <- predict(svr_model, newdata = test_data[feature_cols])

In [None]:
## Evaluate the baseline model: mean squared error on train and test sets
mse_train <- MSE(y_pred = predictions_train, y_true = train_data$Y)
mse_test <- MSE(y_pred = predictions_test, y_true = test_data$Y)

cat("MSE Train :", mse_train, "\n")
cat("MSE Test  :", mse_test)

#### The training and test MSE values are similar, and the MSE on the test data is also lower, which indicates that the model works well on unseen data

## 6. Hyperparameter Tuning SVR

In [None]:
## Grid-search setup for SVR hyperparameter tuning

# Candidate values, each a grid of ten powers of two:
#   cost and gamma: 2^0 .. 2^9
#   epsilon:        2^-9 .. 2^0
cost_range <- 2^seq(0, 9)
epsilon_range <- 2^seq(-9, 0)
gamma_range <- 2^seq(0, 9)

# Evaluate every candidate with 10-fold cross-validation
tuning_control <- tune.control(
  cross = 10,
  sampling = "cross"
)

In [None]:
# Fix the seed so the cross-validation folds are reproducible
set.seed(123)

# Grid search over gamma x cost x epsilon for the radial-kernel SVR,
# fitted on the training data and scored with the tuning control defined above.
tune_radial <- tune.svm(
  Y ~ .,
  data = train_data,
  gamma = gamma_range,
  cost = cost_range,
  epsilon = epsilon_range,
  kernel = "radial",
  tunecontrol = tuning_control
)

In [None]:
## Display the grid-search result object returned by tune.svm()
tune_radial

In [None]:
## Keep the model stored in the tuning result's `best.model` element
best_model_rbf <- tune_radial[["best.model"]]

In [None]:
## Evaluate the tuned model on the test data
tuned_features <- test_data[c("lag_3", "lag_2")]
predictions_test <- predict(best_model_rbf, newdata = tuned_features)
mse_test <- MSE(y_pred = predictions_test, y_true = test_data$Y)
cat("MSE Test  :", mse_test)

#### The MSE value of the model is close to 0, which indicates that the model is good for prediction

## 7. Make Prediction

#### We want to predict inflation for December, given that the inflation values of the previous months are known. The inflation values from three months and two months before December (September and October) are used as the input variables. For example, if September inflation = 0.45 and October inflation = 0.18, then the prediction for December is:

In [None]:
## Predict December inflation from the raw inflation values three months
## (lag_3) and two months (lag_2) earlier.
## NOTE(review): the accompanying text quotes 0.45 and 0.18 as the example
## inputs, while the values below are 0.58 and 0.04 - confirm which are intended.
raw_input <- data.frame(lag_3 = 0.58, lag_2 = 0.04)

# The model was trained on min-max scaled lag features, so the raw input must
# be put on the same scale using the TRAINING minimum and maximum.
# Assumes `data` (unscaled lag table) and `index` (training row indices) are
# unchanged since the train/test split.
train_raw <- data[index, c("lag_3", "lag_2")]

input_data <- raw_input
for (col_name in names(raw_input)) {
  col_min <- min(train_raw[[col_name]])
  col_max <- max(train_raw[[col_name]])
  input_data[[col_name]] <- (raw_input[[col_name]] - col_min) / (col_max - col_min)
}

december_inflation <- predict(best_model_rbf, input_data)
december_inflation

In [None]:
# Persist the tuned SVR model as an RDS file in the Kaggle output directory
model_path <- file.path("/kaggle/working", "svr_model_radial.rds")
saveRDS(object = best_model_rbf, file = model_path)