In [1]:
import numpy as np
import pandas as pd

In [2]:
# (i) Load the sample dataset: read forestfires.csv into the DataFrame
# that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer="forestfires.csv")

In [5]:
# (ii) Brief explanation of the dataset columns.
# The forest fires dataset has the following structure:
#   X     : X-axis spatial coordinate within the Montesinho park map: 1 to 9
#   Y     : Y-axis spatial coordinate within the Montesinho park map: 2 to 9
#   month : Month of the year: 'jan' to 'dec'
#   day   : Day of the week: 'mon' to 'sun'
#   FFMC  : FFMC index from the FWI system: 18.7 to 96.20
#   DMC   : DMC index from the FWI system: 1.1 to 291.3
#   DC    : DC index from the FWI system: 7.9 to 860.6
#   ISI   : ISI index from the FWI system: 0.0 to 56.10
#   temp  : Temperature in degrees Celsius: 2.2 to 33.30
#   RH    : Relative humidity in %: 15.0 to 100
#   wind  : Wind speed in km/h: 0.40 to 9.40
#   rain  : Outside rain in mm/m2: 0.0 to 6.4
#   area  : The burned area of the forest (in ha): 0.00 to 1090.84
# The bare expression below makes the DataFrame the cell's displayed result.
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [7]:
# (iii) Filtering the dataset
# (a) Build a new dataset holding only the entries whose 'area' value is
#     strictly greater than 0.
has_burned_area = df['area'] > 0
filtered_df = df.loc[has_burned_area]
filtered_df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
138,9,9,jul,tue,85.8,48.3,313.4,3.9,18.0,42,2.7,0.0,0.36
139,1,4,sep,tue,91.0,129.5,692.6,7.0,21.7,38,2.2,0.0,0.43
140,2,5,sep,mon,90.9,126.5,686.5,7.0,21.9,39,1.8,0.0,0.47
141,1,2,aug,wed,95.5,99.9,513.3,13.2,23.3,31,4.5,0.0,0.55
142,8,6,aug,fri,90.1,108.0,529.8,12.5,21.2,51,8.9,0.0,0.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,5,4,aug,fri,91.0,166.9,752.6,7.1,21.1,71,7.6,1.4,2.17
510,6,5,aug,fri,91.0,166.9,752.6,7.1,18.2,62,5.4,0.0,0.43
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29


In [8]:
# (b) How many rows does the filtered dataset contain?
num_rows = len(filtered_df)
# Also persist the area > 0 subset to disk, without the index column.
filtered_df.to_csv('filtered_forestfire.csv', index=False)
print(f'The number of rows in the new filtered dataset is: {num_rows}')

The number of rows in the new filtered dataset is: 270


In [11]:
# (iv) Statistical analysis: mean, minimum, maximum and standard deviation of
# the 'area' column in the filtered dataset, followed by a discussion of what
# the large standard deviation says about the distribution.
burned_area = filtered_df['area']
mean_area = burned_area.mean()
min_area = burned_area.min()
max_area = burned_area.max()
std_area = burned_area.std()

# Report each statistic on its own line.
for label, value in (
    ("Mean of area", mean_area),
    ("Minimum area", min_area),
    ("Maximum area", max_area),
    ("Standard Deviation of area", std_area),
):
    print(f"{label}: {value}")

# Discussion:
# A large standard deviation of 'area' means the burned-area sizes are spread
# over a wide range, i.e. the data is highly variable: some fires affect only
# small areas while others cover very large ones.
# That variability may be attributed to factors such as differences in
# vegetation density, weather conditions, fire suppression efforts and other
# environmental influences on how fires spread and grow.
# Understanding the reasons behind it matters for fire prevention and
# management, because it highlights the diverse conditions under which fires
# occur and the very different scales of impact they can have.

Mean of area: 24.600185185185182
Minimum area: 0.09
Maximum area: 1090.84
Standard Deviation of area: 86.50163460412126


In [12]:
# (v) Explore the distribution: order the filtered dataset by 'area'
# (ascending), show its last 20 entries, and count the entries whose area is
# larger than 100.
sorted_df = filtered_df.sort_values(by='area', ascending=True)

# After an ascending sort, the final 20 rows are the 20 largest areas.
last_20_entries = sorted_df.iloc[-20:]
print(last_20_entries)

# Each True in the mask is one entry with area > 100, so summing counts them.
exceeds_100 = filtered_df['area'] > 100
count_larger_than_100 = int(exceeds_100.sum())
print(f'Number of entries with area larger than 100: {count_larger_than_100}')

     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain     area
469  6  3   apr  sun  91.0   14.6   25.6  12.3  13.7  33   9.4   0.0    61.13
228  4  6   sep  sun  93.5  149.3  728.6   8.1  28.3  26   3.1   0.0    64.10
473  9  4   jun  sat  90.5   61.1  252.6   9.4  24.5  50   3.1   0.0    70.32
392  1  3   sep  sun  91.0  276.3  825.1   7.1  21.9  43   4.0   0.0    70.76
229  8  6   aug  sat  92.2   81.8  480.8  11.9  16.4  43   4.0   0.0    71.30
457  1  4   aug  wed  91.7  191.4  635.9   7.8  19.9  50   4.0   0.0    82.75
293  7  6   jul  tue  93.1  180.4  430.8  11.0  26.9  28   5.4   0.0    86.45
230  4  4   sep  wed  92.9  133.3  699.6   9.2  26.4  21   4.5   0.0    88.49
231  1  5   sep  sun  93.5  149.3  728.6   8.1  27.8  27   3.1   0.0    95.18
232  6  4   sep  tue  91.0  129.5  692.6   7.0  18.7  43   2.7   0.0   103.39
233  9  4   sep  tue  84.4   73.4  671.9   3.2  24.3  36   3.1   0.0   105.66
234  4  5   sep  sat  92.5  121.1  674.4   8.6  17.7  25   3.1  

In [13]:
# (vi) Calculate the median value for the area and state why the median might
# be a better measure than the mean.
median_area = filtered_df['area'].median()
print(f"Median of area: {median_area}")

# Why the median is often the better measure of central tendency when the data
# is skewed or contains outliers:
# - Robustness to outliers: the median is not pulled by extremely large or
#   small values, whereas the mean can be significantly influenced by them.
#   For example, one forest fire covering an exceptionally large area shifts
#   the mean upwards while leaving the median essentially unchanged.
# - Skewed distributions: when the data is not symmetrically distributed, the
#   mean can give a misleading impression of the centre. The median, being the
#   middle value, better represents the typical value.
# In the forest fire dataset, a few large fires (very high 'area' values) skew
# the mean, making it higher than the area most fires actually affect. The
# median is therefore a better central value for the typical fire size.

Median of area: 6.37


In [14]:
# Analysis
# (a) Which month has the highest number of forest fires, and what is the mean
#     temperature during that month?

# One entry per month label, holding the number of rows recorded for it.
fire_counts_per_month = df['month'].value_counts()

# Label and size of the busiest month.
most_fires_month = fire_counts_per_month.idxmax()
num_fires = fire_counts_per_month.max()
print(f"Month with the highest number of forest fires: {most_fires_month} ({num_fires} fires)")

# Average 'temp' over the rows belonging to that month only.
in_peak_month = df['month'] == most_fires_month
mean_temp_during_most_fires = df.loc[in_peak_month, 'temp'].mean()
print(f"Mean temperature during {most_fires_month}: {mean_temp_during_most_fires:.2f}°C")

Month with the highest number of forest fires: aug (184 fires)
Mean temperature during aug: 21.63°C


In [16]:
# (b) Find the correlation between the number of forest fires and the mean
#     temperature for each month.

# Mean temperature for each month (indexed by month label).
mean_temp_per_month = df.groupby('month')['temp'].mean()

# Combine the per-month fire counts (computed in the previous analysis cell)
# with the per-month mean temperatures. Both Series are indexed by month
# label, so the DataFrame constructor aligns them on that label rather than
# on position.
monthly_stats = pd.DataFrame({
    'fire_counts': fire_counts_per_month,
    'mean_temperature': mean_temp_per_month
})

# Correlation coefficient between the two monthly series (Series.corr uses
# the Pearson method unless told otherwise).
correlation = monthly_stats['fire_counts'].corr(monthly_stats['mean_temperature'])

print(f"Correlation between number of forest fires and mean temperature: {correlation:.2f}")

Correlation between number of forest fires and mean temperature: 0.55


In [17]:
# Interpretation of the monthly correlation printed by the previous cell.
# The string is this cell's only expression, so it is echoed as the cell's output.
"""The value 0.55 indicates that there is a moderate strength of association between the variables. In this case, if one variable increases, the other tends to increase as well, but not perfectly."""

'The value 0.55 indicates that there is a moderate strength of association between the variables. In this case, if one variable increases, the other tends to increase as well, but not perfectly.'

In [19]:
# (i) Indexing
# (a) Turn the DataFrame into a NumPy array, then fetch and print its first
#     5 rows with array indexing.
np_array = df.to_numpy()

# Rows 0-4, every column.
first_5_rows = np_array[0:5, :]
print(first_5_rows)

[[7 5 'mar' 'fri' 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0 0.0]
 [7 4 'oct' 'tue' 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0 0.0]
 [7 4 'oct' 'sat' 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0 0.0]
 [8 6 'mar' 'fri' 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2 0.0]
 [8 6 'mar' 'sun' 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0 0.0]]


In [21]:
# (b) Fetch and print the 'temp' (temperature) column using column indexing.

# The array carries no column labels, so look up the position of 'temp'
# in the DataFrame's column index first.
temp_column_index = df.columns.get_loc('temp')

# Every row, only the column at that position.
temperature_column = np_array[:, temp_column_index]

print(temperature_column)

[8.2 18.0 14.6 8.3 11.4 22.2 24.1 8.0 13.1 22.8 17.8 19.3 17.0 21.3 26.4
 22.9 15.1 16.7 15.9 9.3 18.3 19.1 21.0 19.5 23.7 16.3 19.0 19.4 30.2 22.8
 25.4 11.2 20.6 17.7 21.2 18.2 21.7 11.3 17.8 14.1 23.3 18.4 16.6 19.6
 12.9 25.9 14.7 23.0 11.8 11.0 20.8 21.5 20.4 20.4 17.6 27.7 17.8 13.8
 13.9 12.3 11.5 5.5 18.8 20.8 23.1 18.6 23.0 19.6 19.6 17.2 15.8 17.7 15.6
 17.3 27.6 6.7 15.7 8.3 14.7 21.6 19.5 17.9 18.6 16.6 20.2 21.5 25.4 22.4
 25.3 17.4 14.7 17.4 20.8 18.2 23.4 17.8 12.7 17.4 11.6 19.8 19.8 14.4
 20.1 24.1 5.3 12.7 18.2 21.4 20.3 17.4 13.7 18.8 22.8 18.9 15.8 15.5 11.6
 15.2 10.6 19.6 10.3 17.1 22.5 17.9 19.8 20.6 9.0 17.2 15.9 15.4 15.4 14.0
 10.6 17.6 14.9 17.6 17.2 15.6 18.0 21.7 21.9 23.3 21.2 16.6 23.8 27.4
 13.2 24.2 17.4 23.7 23.2 24.8 24.6 20.1 29.6 16.4 28.6 18.4 20.5 19.0
 16.1 20.3 15.2 17.8 17.8 5.3 16.6 23.4 14.6 20.7 21.9 17.4 20.1 17.7 14.2
 20.3 5.8 19.2 18.3 14.4 23.9 19.1 12.4 16.8 20.8 17.6 11.5 21.0 13.3 11.5
 11.7 24.2 24.6 24.3 24.6 23.5 5.8 21.5 13.9 22.

In [22]:
# (ii) Slicing
# (a) All rows, restricted to the 'month', 'temp' and 'area' columns.
columns_of_interest = ['month', 'temp', 'area']
sliced_data = df.loc[:, columns_of_interest]
print(sliced_data)

    month  temp   area
0     mar   8.2   0.00
1     oct  18.0   0.00
2     oct  14.6   0.00
3     mar   8.3   0.00
4     mar  11.4   0.00
..    ...   ...    ...
512   aug  27.8   6.44
513   aug  21.9  54.29
514   aug  21.2  11.16
515   aug  25.6   0.00
516   nov  11.8   0.00

[517 rows x 3 columns]


In [23]:
# (b) All entries from position 100 through 200, both ends included.
# iloc excludes the stop position, hence the + 1.
first_pos, last_pos = 100, 200
sliced_entries = df.iloc[first_pos:last_pos + 1]
print(sliced_entries)

     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain   area
100  3  4   aug  sun  91.4  142.4  601.4  10.6  19.8  39   5.4   0.0   0.00
101  3  4   aug  tue  88.8  147.3  614.5   9.0  14.4  66   5.4   0.0   0.00
102  2  4   aug  tue  94.8  108.3  647.1  17.0  20.1  40   4.0   0.0   0.00
103  2  4   sep  sat  92.5  121.1  674.4   8.6  24.1  29   4.5   0.0   0.00
104  2  4   jan  sat  82.1    3.7    9.3   2.9   5.3  78   3.1   0.0   0.00
..  .. ..   ...  ...   ...    ...    ...   ...   ...  ..   ...   ...    ...
196  6  5   apr  thu  81.5    9.1   55.2   2.7   5.8  54   5.8   0.0  10.93
197  4  5   sep  thu  92.9  137.0  706.4   9.2  21.5  15   0.9   0.0  11.06
198  3  4   sep  tue  91.0  129.5  692.6   7.0  13.9  59   6.3   0.0  11.24
199  2  4   sep  mon  63.5   70.8  665.3   0.8  22.6  38   3.6   0.0  11.32
200  1  5   sep  tue  91.0  129.5  692.6   7.0  21.6  33   2.2   0.0  11.53

[101 rows x 13 columns]


In [24]:
# (iii) Splitting
# (a) Cut the dataset into two halves along the row axis and show the first
#     5 rows of each. The cut point uses floor division, so with an odd row
#     count the second half receives the extra row.
midpoint = df.shape[0] // 2

# Rows before the cut point, and rows from the cut point onwards.
first_half, second_half = df.iloc[:midpoint], df.iloc[midpoint:]

# First 5 rows of each part.
print("First half - First 5 rows:")
print(first_half.iloc[:5])

print("\nSecond half - First 5 rows:")
print(second_half.iloc[:5])

First half - First 5 rows:
   X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain  area
0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0   0.0
1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0   0.0
2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0   0.0
3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2   0.0
4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0   0.0

Second half - First 5 rows:
     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain  area
258  3  4   aug  sat  91.8  170.9  692.3  13.7  20.6  59   0.9   0.0  0.00
259  7  4   aug  sat  91.8  170.9  692.3  13.7  23.7  40   1.8   0.0  1.38
260  2  4   aug  mon  93.6   97.9  542.0  14.4  28.3  32   4.0   0.0  8.85
261  3  4   aug  fri  91.6  112.4  573.0   8.9  11.2  84   7.6   0.0  3.30
262  2  4   aug  fri  91.6  112.4  573.0   8.9  21.4  42   3.1   0.0  4.25


In [25]:
# (iv) Iteration
# (a) Calculate the sum of the 'temp' column using iteration and verify it
#     against NumPy's sum function.
# The accumulator is deliberately NOT named `sum`: binding that name in a
# notebook shadows Python's built-in sum() for every cell executed afterwards.
temp_sum_iter = 0
# Walk the column values directly; iterrows() would build a whole row Series
# per record only to read a single field from it.
for temp_value in df['temp']:
    temp_sum_iter += temp_value
sum_np = np.sum(df['temp'])
print(f"Sum of 'temp' column using iteration: {temp_sum_iter}")
print(f"Sum of 'temp' column using NumPy's sum function: {sum_np}")
# Explicit verification. A tolerance is used because the two summations may
# add the floats in a different order; equal_nan keeps a NaN total on both
# sides from being reported as a mismatch.
assert np.isclose(temp_sum_iter, sum_np, equal_nan=True), "iterative and NumPy sums disagree"


Sum of 'temp' column using iteration: 9765.699999999999
Sum of 'temp' column using NumPy's sum function: 9765.699999999999


In [28]:
# (v) Masking
# (a) Use boolean indexing to filter and print every row whose 'temp' value
#     is greater than 20°C. The second part of the task, a boolean mask for
#     'area' greater than 10, is handled in the following cell.
is_above_20 = df['temp'] > 20
temp = df.loc[is_above_20]
print("Rows where 'temp' > 20°C:")
print(temp)

Rows where 'temp' > 20°C:
     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain   area
5    8  6   aug  sun  92.3   85.3  488.0  14.7  22.2  29   5.4   0.0   0.00
6    8  6   aug  mon  92.3   88.9  495.6   8.5  24.1  27   3.1   0.0   0.00
9    7  5   sep  sat  92.5   88.0  698.6   7.1  22.8  40   4.0   0.0   0.00
13   6  5   sep  mon  90.9  126.5  686.5   7.0  21.3  42   2.2   0.0   0.00
14   6  5   sep  wed  92.9  133.3  699.6   9.2  26.4  21   4.5   0.0   0.00
..  .. ..   ...  ...   ...    ...    ...   ...   ...  ..   ...   ...    ...
511  8  6   aug  sun  81.6   56.7  665.6   1.9  27.8  35   2.7   0.0   0.00
512  4  3   aug  sun  81.6   56.7  665.6   1.9  27.8  32   2.7   0.0   6.44
513  2  4   aug  sun  81.6   56.7  665.6   1.9  21.9  71   5.8   0.0  54.29
514  7  4   aug  sun  81.6   56.7  665.6   1.9  21.2  70   6.7   0.0  11.16
515  1  4   aug  sat  94.4  146.0  614.7  11.3  25.6  42   4.0   0.0   0.00

[231 rows x 13 columns]


In [32]:
# Boolean mask: True for entries whose 'area' is greater than 10.
a = df['area'] > 10
# Apply the mask to keep only the matching rows, then print them.
a_rows = df.loc[a]
print("\nRows where 'area' > 10:")
print(a_rows)


Rows where 'area' > 10:
     X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain   area
194  2  2   aug  tue  94.8  108.3  647.1  17.0  24.6  22   4.5   0.0  10.01
195  2  5   aug  fri  93.9  135.7  586.7  15.1  23.5  36   5.4   0.0  10.02
196  6  5   apr  thu  81.5    9.1   55.2   2.7   5.8  54   5.8   0.0  10.93
197  4  5   sep  thu  92.9  137.0  706.4   9.2  21.5  15   0.9   0.0  11.06
198  3  4   sep  tue  91.0  129.5  692.6   7.0  13.9  59   6.3   0.0  11.24
..  .. ..   ...  ...   ...    ...    ...   ...   ...  ..   ...   ...    ...
498  6  5   aug  tue  96.1  181.1  671.2  14.3  33.3  26   2.7   0.0  40.54
499  7  5   aug  tue  96.1  181.1  671.2  14.3  27.3  63   4.9   6.4  10.82
504  4  3   aug  wed  94.5  139.4  689.1  20.0  28.9  29   4.9   0.0  49.59
513  2  4   aug  sun  81.6   56.7  665.6   1.9  21.9  71   5.8   0.0  54.29
514  7  4   aug  sun  81.6   56.7  665.6   1.9  21.2  70   6.7   0.0  11.16

[95 rows x 13 columns]
