In [21]:
!pip install wget



In [22]:
import os
import wget
import zipfile

# Download the challenge archive only when it is not already on disk.
# NOTE(review): the `wget` package appears to save a repeat download under a
# suffixed name instead of overwriting, which would leave "Nov_10.zip" stale on
# re-run -- confirm against the package source.
if not os.path.exists('Nov_10.zip'):
    wget.download('https://raw.githubusercontent.com/Call-for-Code/Spot-Challenge-Wildfires/main/data/Nov_10.zip', 'Nov_10.zip')

# Extract into the current working directory. The context manager closes the
# archive handle even when extraction raises.
with zipfile.ZipFile("Nov_10.zip") as zip_handle:
    zip_handle.extractall()

In [23]:

import pandas as pd
import numpy as np

In [30]:

# Load the historical wildfire records into `df` and preview the first rows.
# NOTE(review): the absolute "/content/..." path presumably matches a Colab
# working directory -- confirm before running elsewhere.
file_wildfires = "/content/Nov_10/Historical_Wildfires.csv"
print(f"Reading file: '{file_wildfires}'")
df = pd.read_csv(filepath_or_buffer=file_wildfires)
print("Loaded...")

df.head()

Reading file: '/content/Nov_10/Historical_Wildfires.csv'
Loaded...


Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced
0,NSW,1/4/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R
1,NSW,1/5/2005,16.61125,322.475,62.3625,85.5,8.088793,65.428571,8,R
2,NSW,1/6/2005,5.52,325.266667,38.4,78.333333,3.21455,10.333333,3,R
3,NSW,1/7/2005,6.264,313.87,33.8,92.2,7.52994,56.7,5,R
4,NSW,1/8/2005,5.4,337.383333,122.533333,91.0,7.937254,63.0,3,R


In [31]:
# Report how many records and columns were loaded, then list each column name
# on its own line.
num_rows, num_cols = len(df.index), len(df.columns)
print(f"There are total {num_rows} records in the following {num_cols} columns:\n")
print(*df.columns, sep="\n")

There are total 26406 records in the following 10 columns:

Region
Date
Estimated_fire_area
Mean_estimated_fire_brightness
Mean_estimated_fire_radiative_power
Mean_confidence
Std_confidence
Var_confidence
Count
Replaced


In [32]:
# Inspect the data type of each column.
df.dtypes

Region                                  object
Date                                    object
Estimated_fire_area                    float64
Mean_estimated_fire_brightness         float64
Mean_estimated_fire_radiative_power    float64
Mean_confidence                        float64
Std_confidence                         float64
Var_confidence                         float64
Count                                    int64
Replaced                                object
dtype: object

In [33]:
# Convert the month/day/year strings (e.g. "1/4/2005") to datetimes using an
# explicit format, so parsing does not depend on pandas' format inference.
# NOTE(review): assumes every row uses this layout -- only the previewed rows
# were checked.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')

# Report the time span covered by the data set.
print("Minimum Date: {}".format(df['Date'].min()))
print("Maximum Date: {}".format(df['Date'].max()))

Minimum Date: 2005-01-01 00:00:00
Maximum Date: 2020-10-31 00:00:00


In [34]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Region                                    0
Date                                      0
Estimated_fire_area                       0
Mean_estimated_fire_brightness            0
Mean_estimated_fire_radiative_power       0
Mean_confidence                           0
Std_confidence                         2207
Var_confidence                         2207
Count                                     0
Replaced                                  0
dtype: int64

In [35]:
# Show every record whose Std_confidence is missing.
df[df['Std_confidence'].isna()]

Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced
48,NSW,2005-02-26,1.00,303.15,8.0,79.0,,,1,R
149,NSW,2005-06-12,1.00,302.55,17.9,79.0,,,1,R
154,NSW,2005-06-18,5.27,301.30,71.9,77.0,,,1,R
157,NSW,2005-06-25,9.60,300.70,145.9,76.0,,,1,R
163,NSW,2005-07-09,2.80,294.65,37.8,79.0,,,1,R
...,...,...,...,...,...,...,...,...,...,...
26327,WA,2020-08-09,2.34,300.15,30.2,85.0,,,1,N
26331,WA,2020-08-13,1.10,320.35,27.1,83.0,,,1,N
26332,WA,2020-08-14,1.00,302.15,15.8,77.0,,,1,N
26335,WA,2020-08-20,1.92,326.85,86.2,92.0,,,1,N


In [36]:
# Show every record whose Var_confidence is missing.
df[df['Var_confidence'].isna()]

Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced
48,NSW,2005-02-26,1.00,303.15,8.0,79.0,,,1,R
149,NSW,2005-06-12,1.00,302.55,17.9,79.0,,,1,R
154,NSW,2005-06-18,5.27,301.30,71.9,77.0,,,1,R
157,NSW,2005-06-25,9.60,300.70,145.9,76.0,,,1,R
163,NSW,2005-07-09,2.80,294.65,37.8,79.0,,,1,R
...,...,...,...,...,...,...,...,...,...,...
26327,WA,2020-08-09,2.34,300.15,30.2,85.0,,,1,N
26331,WA,2020-08-13,1.10,320.35,27.1,83.0,,,1,N
26332,WA,2020-08-14,1.00,302.15,15.8,77.0,,,1,N
26335,WA,2020-08-20,1.92,326.85,86.2,92.0,,,1,N


In [37]:
# Distinct 'Count' values among the rows where both Std_confidence and
# Var_confidence are missing.
(
    df.loc[df[['Std_confidence', 'Var_confidence']].isna().all(axis=1), ['Count']]
    .drop_duplicates()
    .reset_index(drop=True)
)

Unnamed: 0,Count
0,1


In [38]:
# Print the 'Count' values (all of them, then the unique ones) for the rows
# where both Std_confidence and Var_confidence are null.
import numpy as np

print("Distinct 'Count' column values when Std_confidence and Var_confidence are NULL.\n")
Count_values = df.loc[
    df[['Std_confidence', 'Var_confidence']].isna().all(axis=1),
    'Count',
].values
print(f"'Count' Column Values: {Count_values}")
print(f"'Count' Column Unique Values: {np.unique(Count_values)}")

Distinct 'Count' column values when Std_confidence and Var_confidence are NULL.

'Count' Column Values: [1 1 1 ... 1 1 1]
'Count' Column Unique Values: [1]


In [39]:
# Replace the missing Std_confidence / Var_confidence entries with 0; every
# other value in those columns is left as it is.
for spread_col in ('Std_confidence', 'Var_confidence'):
    df[spread_col] = df[spread_col].fillna(0)

In [41]:
# Collect the distinct region codes and print them one per line, preceded by
# how many there are.
region = df['Region'].unique()
num_regions = len(region)
all_regions = "\n".join(region)
print(f"Following are {num_regions} unique regions for Historical Wildfires:\n\n{all_regions}")

Following are 7 unique regions for Historical Wildfires:

NSW
NT
QL
SA
TA
VI
WA


In [42]:
# Remove fully duplicated rows and renumber the index from 0 in one chained
# expression. This replaces the two `inplace=True` mutations, so the cell is a
# single assignment from the previous `df`.
df = df.drop_duplicates().reset_index(drop=True)

In [43]:
# Report how many records the frame currently holds.
num_rows = df.shape[0]
num_cols = df.shape[1]
print(f"Total Records:\t{num_rows}")

Total Records:	26406


In [44]:
# Preview the first five records of the frame.
print('First five rows of the Dataset')
df.head(n=5)

First five rows of the Dataset


Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced
0,NSW,2005-01-04,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R
1,NSW,2005-01-05,16.61125,322.475,62.3625,85.5,8.088793,65.428571,8,R
2,NSW,2005-01-06,5.52,325.266667,38.4,78.333333,3.21455,10.333333,3,R
3,NSW,2005-01-07,6.264,313.87,33.8,92.2,7.52994,56.7,5,R
4,NSW,2005-01-08,5.4,337.383333,122.533333,91.0,7.937254,63.0,3,R


In [45]:
# Preview the last five records of the frame.
print('Last five rows of the Dataset')
df.tail(n=5)

Last five rows of the Dataset


Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced
26401,WA,2020-10-27,32.97,324.068182,103.318182,84.363636,5.277741,27.854545,11,N
26402,WA,2020-10-28,20.840625,314.425,37.25625,88.3125,8.12173,65.9625,16,N
26403,WA,2020-10-29,136.083077,323.588461,294.102564,92.974359,8.282789,68.604588,39,N
26404,WA,2020-10-30,42.397895,324.205263,54.994737,89.631579,8.461107,71.590327,38,N
26405,WA,2020-10-31,107.370851,330.057447,201.625532,92.978723,7.266628,52.803885,47,N
