# Preliminary Results


In [None]:
# Load the necessary packages.
library(GGally)
library(tidyverse)
library(infer)
library(cowplot)
library(broom)

## Importing Data

### Table 1: Top 6 Rows of Forest Fires Data

In [None]:
# Set the seed so that any random operations are reproducible.
set.seed(1234)

# Download the data from the web and read it into R.
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv"
data_path <- "../data/forestfires.csv"

# download.file() fails when the target directory is missing, so create it
# first (this is a no-op if the directory already exists).
dir.create(dirname(data_path), recursive = TRUE, showWarnings = FALSE)
download.file(url, data_path)

# Keep only the two columns used in the analysis, coerce them to numeric, and
# drop every row in which either value is missing. The two conditions must be
# combined with `&`: with `|` a row is kept as soon as one of the two columns
# is present, so rows with a single NA would not be removed.
forest_fires_data <- read_delim(data_path, delim = ",") %>%
    select(wind, area) %>%
    mutate(wind = as.numeric(wind), area = as.numeric(area)) %>%
    filter(!is.na(wind) & !is.na(area))

head(forest_fires_data)

## Wind Speed Distributions

In [None]:
# Figure 1: histogram of the raw wind speed values.
options(repr.plot.width = 6, repr.plot.height = 4)

# Only `x` is mapped, so no fill scale is attached to this plot.
wind_speed_plot <- forest_fires_data %>%
    ggplot(aes(x = wind)) +
    geom_histogram(binwidth = 0.8) +
    labs(x = "Wind Speed (km/h)", y = "Count") +
    ggtitle("Figure 1: Wind Speed Distribution") +
    theme(text = element_text(size = 12))
wind_speed_plot

Based on Figure 1, the distribution of wind speed is almost bimodal (without much modification), so we can safely categorize wind speed into "high" and "low".

In [None]:
# Break points for the two wind speed categories: minimum, median and maximum.
# na.rm = TRUE keeps quantile() from failing if a wind value is missing.
xs <- quantile(forest_fires_data$wind, c(0, 1/2, 1), na.rm = TRUE)

# Add the category column. Intervals are closed on the right, so a wind speed
# equal to the median is labelled "low"; include.lowest = TRUE keeps the
# minimum value inside the first interval.
forest_fires_data <- forest_fires_data %>%
    mutate(category = cut(wind, breaks = xs, labels = c("low", "high"),
                          include.lowest = TRUE))

# Figure 2: wind speed histogram coloured by category, with the median marked.
# The median is passed as a constant (xs[[2]]) rather than through aes(), so a
# single vertical line is drawn instead of one line per data row.
wind_plot_categorized <- forest_fires_data %>%
    ggplot(aes(x = wind, fill = category)) +
    geom_histogram(binwidth = 0.8) +
    labs(x = "Wind Speed", y = "Count") +
    ggtitle("Figure 2: Wind Speed Distribution") +
    scale_fill_discrete(name = "Wind Speed Category") +
    theme(text = element_text(size = 12)) +
    geom_vline(xintercept = xs[[2]])

# Figure 3: spread of wind speed within each category.
box_plot <- forest_fires_data %>%
    ggplot(aes(x = category, y = wind, fill = category)) +
    geom_boxplot() +
    labs(x = "Wind Speed Category", y = "Wind Speed") +
    ggtitle("Figure 3: Boxplot of Wind Speed by Category")

# Show both figures side by side in a wider plotting area.
options(repr.plot.width = 12, repr.plot.height = 4)
plot_grid(wind_plot_categorized, box_plot)

Based on Figure 3, there is a considerable difference in wind speed between the two categories. A confidence interval for each category would be useful to see whether their ranges overlap in an interesting manner. We could also construct a confidence interval around the difference in means.

## Area Burned Distributions

In [None]:
# Figure 4: histogram of the raw burned area values.
options(repr.plot.width = 6, repr.plot.height = 4)

# Only `x` is mapped, so no fill scale is attached to this plot.
area_burned_plot <- forest_fires_data %>%
    ggplot(aes(x = area)) +
    geom_histogram(binwidth = 10) +
    labs(x = "Area Burned (ha)", y = "Count") +
    ggtitle("Figure 4: Area Burned Distribution") +
    theme(text = element_text(size = 12))
area_burned_plot

Figure 4 shows that it makes sense to transform the data to a logarithmic scale.
> See https://www.programmingr.com/tutorial/log-in-r/.

In [None]:
# Add a log10-transformed copy of the burned area. 1 is added before taking
# the logarithm so that an area of 0 maps to 0 rather than -Inf.
forest_fires_data <- mutate(forest_fires_data, area_log10 = log10(area + 1))

In [None]:
options(repr.plot.width = 6, repr.plot.height = 4)

# Figure 5: histogram of the log10-transformed burned area.
# Only `x` is mapped, so no fill scale is attached to this plot.
area_burned_scaled_plot <- forest_fires_data %>%
    ggplot(aes(x = area_log10)) +
    geom_histogram(binwidth = 0.5) +
    labs(x = "Area Burned (ha) [log 10 scaled]", y = "Count") +
    ggtitle("Figure 5: Area Burned Distribution (scaled)") +
    theme(text = element_text(size = 12))

# Figure 6: log10-transformed burned area within each wind speed category.
# The x axis shows the category, so it is labelled with the axis variable.
box_plot_burned <- forest_fires_data %>%
    ggplot(aes(x = category, y = area_log10, fill = category)) +
    geom_boxplot() +
    labs(x = "Wind Speed Category", y = "Scaled Area Burned (ha)") +
    ggtitle("Figure 6: Boxplot of Area Burned by Category")

# Show both figures side by side in a wider plotting area.
options(repr.plot.width = 12, repr.plot.height = 4)
plot_grid(area_burned_scaled_plot, box_plot_burned)

##### Table 2: Summary Statistics of Forest Fires Data

In [None]:
# Summary statistics for the "low" wind speed category.
# summary() works on the whole data frame and does not use dplyr groups, so
# the rows are selected with filter() alone.
forest_fires_data %>%
    filter(category == "low") %>%
    summary()

# Summary statistics for the "high" wind speed category.
forest_fires_data %>%
    filter(category == "high") %>%
    summary()