In [160]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, HoverTool, LinearInterpolator, NumeralTickFormatter, BasicTicker, ColorBar, LinearColorMapper, PrintfTickFormatter, Label
from bokeh.palettes import Viridis256, Magma7, Spectral11, RdBu11 as palette
from bokeh.io import output_file
from bokeh.layouts import column
import numpy as np
from scipy import stats

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
forest_fires = fetch_ucirepo(id=162) 
  
# data (as pandas dataframes) 
X = forest_fires.data.features 
y = forest_fires.data.targets 
  
# metadata 
print(forest_fires.metadata) 
  
# variable information 
print(forest_fires.variables) 

print(forest_fires.shape)

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'
data = pd.read_csv(url)

# Display the first few rows to understand the data
data.head()

# Enable Bokeh in the notebook
output_notebook()

{'uci_id': 162, 'name': 'Forest Fires', 'repository_url': 'https://archive.ics.uci.edu/dataset/162/forest+fires', 'data_url': 'https://archive.ics.uci.edu/static/public/162/data.csv', 'abstract': 'This is a difficult regression task, where the aim is to predict the burned area of forest fires, in the northeast region of Portugal, by using meteorological and other data (see details at: http://www.dsi.uminho.pt/~pcortez/forestfires).', 'area': 'Climate and Environment', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 517, 'num_features': 12, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['area'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2007, 'last_updated': 'Thu Jan 11 2024', 'dataset_doi': '10.24432/C5D88D', 'creators': ['Paulo Cortez', 'Anbal Morais'], 'intro_paper': {'ID': 368, 'type': 'NATIVE', 'title': 'A data mining approach to predict forest fires using meteorological da

## 1. Distribution of Burned Area

In [161]:
# Calculate histogram data
hist, edges = np.histogram(data['area'], bins=50)

# Create a DataFrame for the histogram data
hist_df = pd.DataFrame({
    'count': hist,
    'left': edges[:-1],
    'right': edges[1:]
})

# Calculate the center of each bin for hover information
hist_df['center'] = (hist_df['left'] + hist_df['right']) / 2

# Create a ColumnDataSource
source = ColumnDataSource(hist_df)

# Create the figure
p = figure(title='Distribution of Burned Area',
           x_axis_label='Burned Area (hectares)',
           y_axis_label='Frequency',
           y_axis_type="log",  # Use log scale for y-axis
           width=800, height=600)

# Add the histogram bars
p.quad(bottom=1, top='count', left='left', right='right', source=source,
       fill_color='orange', line_color='black', alpha=0.7)

# Add hover tool
hover = HoverTool(tooltips=[
    ("Range", "@left{0.0} to @right{0.0} hectares"),
    ("Count", "@count"),
])
p.add_tools(hover)

# Customize the plot
p.y_range.start = 1  # Start y-axis at 1 for log scale
p.xaxis.axis_label_text_font_size = "12pt"
p.yaxis.axis_label_text_font_size = "12pt"
p.title.text_font_size = "14pt"

# Set the output file
output_file("../data/processed/burned_area_histogram.html")

# Show the plot
show(p)

## 2. Impact of Temperature on Burned Area

In [162]:
# Create ColumnDataSource
source = ColumnDataSource(data)

# Create the figure
p = figure(title='Temperature vs. Burned Area',
           x_axis_label='Temperature (Celsius)', 
           y_axis_label='Burned Area (hectares)',
           y_axis_type="log",  # Log scale for y-axis
           width=800, height=600)

# Add the scatter plot
scatter = p.circle(x='temp', y='area', source=source, 
                   size=8, color='red', alpha=0.6)

# Add hover tool
hover = HoverTool(renderers=[scatter], tooltips=[
    ("Temperature", "@temp{0.1f}°C"),
    ("Area", "@area{0.2f} hectares"),
    ("Relative Humidity", "@RH{0.2f}%"),
    ("Wind", "@wind{0.1f} km/h"),
])
p.add_tools(hover)

# Customize the plot
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'

# Set the y-axis to not use scientific notation
p.yaxis.formatter = NumeralTickFormatter(format="0,0")


# Set the output file
output_file("../data/processed/temp_vs_area_scatter.html")

# Show the plot
show(p)



## 3 and 4. Fire Frequencies in Months Days

In [163]:
# Prepare data for month distribution
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_counts = data['month'].value_counts().reindex(month_order).reset_index()
month_counts.columns = ['month', 'count']
month_counts['color'] = Viridis256[::21][:12]
month_source = ColumnDataSource(month_counts)

# Create figure for month distribution
p1 = figure(title='Forest Fire Occurrences by Month', x_range=month_order,
            x_axis_label='Month', y_axis_label='Count',
            width=800, height=400, toolbar_location=None)

# Add bars for month distribution
month_bars = p1.vbar(x='month', top='count', width=0.9, source=month_source, line_color='white', fill_color='color')

# Add hover tool for month distribution
hover1 = HoverTool(renderers=[month_bars], tooltips=[("Month", "@month"), ("Count", "@count")])
p1.add_tools(hover1)

# Prepare data for day distribution
day_order = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
day_counts = data['day'].value_counts().reindex(day_order).reset_index()
day_counts.columns = ['day', 'count']
day_counts['color'] = Magma7
day_source = ColumnDataSource(day_counts)

# Create figure for day distribution
p2 = figure(title='Forest Fire Occurrences by Day of the Week', x_range=day_order,
            x_axis_label='Day', y_axis_label='Count',
            width=800, height=400, toolbar_location=None)

# Add bars for day distribution
day_bars = p2.vbar(x='day', top='count', width=0.9, source=day_source, line_color='white', fill_color='color')

# Add hover tool for day distribution
hover2 = HoverTool(renderers=[day_bars], tooltips=[("Day", "@day"), ("Count", "@count")])
p2.add_tools(hover2)

# Customize the plots
for p in [p1, p2]:
    p.title.text_font_size = '16pt'
    p.xaxis.axis_label_text_font_size = '12pt'
    p.yaxis.axis_label_text_font_size = '12pt'
    p.xgrid.grid_line_color = None
    p.y_range.start = 0

# Set the output file
output_file("../data/processed/forest_fire_occurrences.html")

# Show the plots
show(column(p1, p2))

## 6. Relative Humidity vs Burned Area

In [164]:
# Create a ColumnDataSource
source = ColumnDataSource(data)

# Create the Bokeh figure
p = figure(title='Relative Humidity vs. Burned Area',
           x_axis_label='Relative Humidity (%)',
           y_axis_label='Burned Area (hectares)',
           y_axis_type='log',
           width=800,
           height=600)

# Add the scatter plot
scatter = p.circle(x='RH', y='area', source=source, size=8, color='green', alpha=0.6)

# Calculate the regression line
log_area = np.log10(data['area'])
slope, intercept, r_value, p_value, std_err = stats.linregress(data['RH'], log_area)

# Create points for the regression line
line_x = np.linspace(data['RH'].min(), data['RH'].max(), 100)
line_y = 10 ** (slope * line_x + intercept)

# Add the regression line
line = p.line(line_x, line_y, line_width=2, color='blue')

# Add hover tool
hover = HoverTool(tooltips=[('Relative Humidity', '@RH{0.2f}%'), 
                            ('Burned Area', '@area{0.2f} hectares')])
p.add_tools(hover)

# Customize the plot
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'

# Set the output file
output_file("../data/processed/rh_vs_burned_area.html")

# Show the plot
show(p)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  X -= avg[:, None]


## 6. The Amount of Rain Impact on Burned Area

In [165]:
# Create a ColumnDataSource
source = ColumnDataSource(data)

# Create the Bokeh figure
p = figure(title='Rain vs. Burned Area',
           x_axis_label='Rain (mm)',
           y_axis_label='Burned Area (hectares)',
           y_axis_type='log',
           width=800,
           height=600)

# Add the scatter plot
scatter = p.circle(x='rain', y='area', source=source, size=8, color='cyan', alpha=0.6)

# Add hover tool
hover = HoverTool(tooltips=[('Rain', '@rain{0.2f} mm'), 
                            ('Burned Area', '@area{0.2f} hectares')])
p.add_tools(hover)

# Customize the plot
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'

# Set the output file
output_file("../data/processed/rain_vs_burned_area.html")

# Show the plot
show(p)



In [166]:
# Create a ColumnDataSource
source = ColumnDataSource(data)

# Create the figure
p = figure(title='Wind Speed vs. Burned Area',
           x_axis_label='Wind Speed (km/h)',
           y_axis_label='Burned Area (hectares)',
           y_axis_type="log",  # Use log scale for y-axis
           width=800, height=600)

# Add the scatter plot
p.circle(x='wind', y='area', source=source, size=8, color='blue', alpha=0.6)

# Add hover tool
hover = HoverTool(tooltips=[
    ("Wind Speed", "@wind km/h"),
    ("Burned Area", "@area hectares"),
])
p.add_tools(hover)

# Set the y-axis to not use scientific notation
p.yaxis.formatter = NumeralTickFormatter(format="0,0")

# Set the output file
output_file("../data/processed/wind_vs_area_scatter.html")

# Show the plot
show(p)



In [167]:
# Fetch dataset
forest_fires = fetch_ucirepo(id=162)
data = pd.concat([forest_fires.data.features, forest_fires.data.targets], axis=1)

# Create ColumnDataSource
source = ColumnDataSource(data)

# Create the figure
p = figure(title='Spatial Distribution of Forest Fires',
           x_axis_label='X Coordinate', y_axis_label='Y Coordinate',
           width=800, height=600)

# Define an interpolator to map 'area' to a size range (screen space)
size_mapper = LinearInterpolator(
    x=[min(source.data['area']), max(source.data['area'])],  # Input range
    y=[5, 50]  # Desired size range in pixels (screen space)
)

# Apply the size_mapper to the 'size' argument
scatter = p.circle(x='X', y='Y', size={'field': 'area', 'transform': size_mapper},
                   source=source, alpha=0.6, color='navy')

# # Add the scatter plot
# scatter = p.circle(x='X', y='Y', size='area', source=source, size_units='screen', size_min=5, size_max=50, alpha=0.6, color='navy')

# Add hover tool
hover = HoverTool(renderers=[scatter], tooltips=[
    ("X", "@X"),
    ("Y", "@Y"),
    ("Area", "@area{0.2f} hectares"),
    ("Month", "@month"),
    ("Day", "@day"),
    ("Temperature", "@temp{0.1f}°C"),
    ("Wind", "@wind{0.1f} km/h"),
])
p.add_tools(hover)

# Customize the plot
p.title.text_font_size = '16pt'
p.xaxis.axis_label_text_font_size = '12pt'
p.yaxis.axis_label_text_font_size = '12pt'

# Set the output file
output_file("../data/processed/forest_fires_spatial_distribution.html")

# Show the plot
show(p)



In [168]:
# Create the Bokeh figure
p = figure(title='Fine Fuel Moisture Code (FFMC) vs. Burned Area',
           x_axis_label='FFMC',
           y_axis_label='Burned Area (hectares)',
           y_axis_type='log',
           width=800,
           height=500)

# Add the scatter plot
scatter = p.scatter('FFMC', 'area', source=data, size=8, color='navy', alpha=0.5)

# Add hover tool
hover = HoverTool(tooltips=[('FFMC', '@FFMC'), ('Burned Area', '@area{0.2f} hectares')])
p.add_tools(hover)

# Set the output file
output_file("../data/processed/ffmc_vs_burned_area.html")

# Show the plot
show(p)

In [169]:
# Prepare data_heatmap
data_heatmap = data.copy()
day_to_num = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}
month_to_num = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
                'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
data_heatmap['month'] = data_heatmap['month'].map(month_to_num)
data_heatmap['day'] = data_heatmap['day'].map(day_to_num)

# Calculate the correlation matrix
corr = data_heatmap.corr()

# Prepare data for Bokeh
names = list(corr.columns)
x = names
y = names[::-1]  # Reverse for bottom-up
colors = palette[::-1]  # Reverse for positive-negative

# Set up data for plotting
source = ColumnDataSource(data=dict(
    x=[name for name in x for _ in y],
    y=[name for _ in x for name in y],
    value=[corr.loc[name2, name1] for name1 in x for name2 in y],
    text=[f'{corr.loc[name2, name1]:.2f}' for name1 in x for name2 in y]
))

# Set up the figure
p = figure(title="Correlation Heatmap of Forest Fires Data",
           x_range=x, y_range=y,
           x_axis_location="above", width=900, height=800,
           toolbar_location=None, tools="hover", 
           tooltips=[('Variables', '@x, @y'), ('Correlation', '@text')])

# Create color mapper
mapper = LinearColorMapper(palette=colors, low=-1, high=1)

# Create rectangle glyphs
p.rect(x="x", y="y", width=1, height=1, source=source,
       line_color=None, fill_color={'field': 'value', 'transform': mapper})

# Add text labels
p.text(x="x", y="y", text="text", source=source,
       text_align="center", text_baseline="middle",
       text_font_size="8pt", text_color="black")

# Set up the color bar
color_bar = ColorBar(color_mapper=mapper, ticker=BasicTicker(desired_num_ticks=11),
                     formatter=PrintfTickFormatter(format="%.2f"),
                     label_standoff=12, border_line_color=None, location=(0, 0))

# Add color bar to the figure
p.add_layout(color_bar, 'right')

# Set up the axes
p.xaxis.axis_label = 'Variables'
p.yaxis.axis_label = 'Variables'
p.xaxis.major_label_orientation = np.pi/4
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None
p.axis.major_label_text_font_size = "8pt"

# Set the output file
output_file("../data/processed/forest_fires_correlation_heatmap_with_labels.html")

# Show the plot
show(p)