In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, date
from sweref99 import projections

In [7]:
path='./data/Brandriskdata 2000-2020.csv'
#'PunktID': str, 'E': str, 'N': str,
types={ 'PunktID': str}

In [8]:
#Reading csv file from path with dtypes
df = pd.read_csv(path, sep=';', dtype=types, )


In [9]:
#Number of rows in fire riskdataset
df.shape[0]

19188859

In [4]:
#tm is used for the conversions of easting and northing to longitude and latitude
tm = projections.make_transverse_mercator("SWEREF_99_TM")

In [5]:
#For now sample is only 50 values
sample = df.head(10)
sample[['E','N']] = df[['E','N']].fillna(0)
sample['N'] = sample['N'].astype(int)
sample['E'] = sample['E'].astype(int)
sample.head(10)


Unnamed: 0,PunktID,E,N,Kommun,Datum,Temp,Tmedel,Nederbord,RH,Vindhastighet,...,DC,ISI,BUI,FWI,FWI_index,HBV_o,HBV_u,HBV,HBV_index,Gras
7732768,3169,622491,6995342,2282.0,2013-06-08,156,13,26,38,29,...,1393.0,13.0,338.0,28.0,2.0,56.0,54.0,65.0,2.0,2.0
5500156,2288,758768,7492962,2523.0,2011-03-03,-15,0,0,575,87,...,,,,,,,,,,1.0
2416893,1004,709601,6578167,120.0,2006-06-05,139,114,4,54,9,...,1586.0,93.0,244.0,147.0,3.0,56.0,51.0,54.0,3.0,
17359444,7137,440695,7079020,2309.0,2015-10-04,68,4,1,641,68,...,654.0,1.0,11.0,3.0,1.0,98.0,87.0,100.0,1.0,2.0
18459521,7590,358100,6476782,1487.0,2001-04-06,76,53,38,79,17,...,121.0,1.0,26.0,0.0,1.0,81.0,88.0,95.0,1.0,
1723795,722,499087,6864256,2361.0,2007-07-05,215,174,3,57,42,...,2986.0,34.0,442.0,9.0,3.0,56.0,38.0,48.0,3.0,
6220052,2561,642134,7347013,2506.0,2008-05-17,24,4,1,41,15,...,0.0,0.0,0.0,0.0,1.0,96.0,97.0,100.0,1.0,1.0
3045147,1272,479880,7327307,2421.0,2019-05-12,7,0,88,908,10,...,0.0,0.0,0.0,0.0,-1.0,100.0,100.0,100.0,1.0,1.0
9816488,4039,560931,6504816,581.0,2007-06-26,214,166,0,49,43,...,1918.0,57.0,236.0,97.0,3.0,66.0,42.0,57.0,3.0,
5632202,2340,376209,6218437,1260.0,2016-07-31,208,16,4,516,326324,...,2243.0,13.0,357.0,29.0,2.0,53.0,44.0,59.0,3.0,2.0


In [6]:
#Function that calculates number of missing data in column of dataframe and prints result.
def missing(df,column):
       x = len(df)
       if df[column].isnull().any():
           print('{0} has total of {1} null values'.format(column, df[column].isnull().sum()))
           print ('In the column {0}'.format(column), round(df[column].count()-1/x * 100, 3), '% of the cells have missing values')
 

In [7]:
#Missing values of column 'Gras'
missing(sample,'Gras')
missing(sample,'Vindriktning')
sample.isnull().sum(axis = 0)

Gras has total of 21 null values
In the column Gras 27.0 % of the cells have missing values
Vindriktning has total of 41 null values
In the column Vindriktning 7.0 % of the cells have missing values


PunktID           0
E                 0
N                 0
Kommun            0
Datum             0
Temp              0
Tmedel            0
Nederbord         0
RH                0
Vindhastighet     0
Vindriktning     41
FFMC              2
DMC               2
DC                2
ISI               2
BUI               2
FWI               2
FWI_index         2
HBV_o             2
HBV_u             2
HBV               2
HBV_index         2
Gras             21
dtype: int64

In [8]:
#Data info
sample.describe()


#Columns to drop from the dataframe: FFMC, DMC, DC, ISI, BUI, Gras, Vindhastighet

Unnamed: 0,E,N,Kommun,FWI_index,HBV_o,HBV_u,HBV,HBV_index,Gras
count,50.0,50.0,50.0,48.0,48.0,48.0,48.0,48.0,29.0
mean,542643.2,6793925.0,1771.94,1.770833,68.75,63.604167,73.375,1.875,1.689655
std,150584.882608,395248.6,716.307522,1.308557,21.800351,20.540297,20.399859,1.023656,1.198316
min,281328.0,6206228.0,120.0,-1.0,28.0,31.0,37.0,1.0,-1.0
25%,429931.75,6505194.0,1437.75,1.0,49.75,45.0,56.75,1.0,1.0
50%,499391.5,6680458.0,1957.0,2.0,69.0,59.5,73.5,1.5,2.0
75%,684115.25,7160384.0,2403.0,3.0,85.25,81.0,91.0,3.0,2.0
max,837151.0,7541826.0,2584.0,5.0,100.0,100.0,100.0,4.0,4.0


In [9]:
#Functions for converting easting and northing to latitudes and longitudes.
def toLat(E,N):
    lat, lon = tm.grid_to_geodetic(N,E)
    return lat
def toLon(E,N):
    lat, lon = tm.grid_to_geodetic(N,E)
    return lon

In [10]:
#Applying functions to create to new columns, Longitude and Latitude for all measuring occurances in dataframe.
sample['Latitude'] = sample.apply(lambda row: toLat(row['E'],row['N']),axis=1)
sample['Longitude'] = sample.apply(lambda row: toLon(row['E'],row['N']),axis=1)
sample

Unnamed: 0,PunktID,E,N,Kommun,Datum,Temp,Tmedel,Nederbord,RH,Vindhastighet,...,BUI,FWI,FWI_index,HBV_o,HBV_u,HBV,HBV_index,Gras,Latitude,Longitude
7732768,3169,622491,6995342,2282.0,2013-06-08,156,13,26,38,29,...,338.0,28.0,2.0,56.0,54.0,65.0,2.0,2.0,63.066794,17.424225
5500156,2288,758768,7492962,2523.0,2011-03-03,-15,0,0,575,87,...,,,,,,,,1.0,67.439059,21.051839
2416893,1004,709601,6578167,120.0,2006-06-05,139,114,4,54,9,...,244.0,147.0,3.0,56.0,51.0,54.0,3.0,,59.290284,18.680487
17359444,7137,440695,7079020,2309.0,2015-10-04,68,4,1,641,68,...,11.0,3.0,1.0,98.0,87.0,100.0,1.0,2.0,63.833493,13.794633
18459521,7590,358100,6476782,1487.0,2001-04-06,76,53,38,79,17,...,26.0,0.0,1.0,81.0,88.0,95.0,1.0,,58.408755,12.571296
1723795,722,499087,6864256,2361.0,2007-07-05,215,174,3,57,42,...,442.0,9.0,3.0,56.0,38.0,48.0,3.0,,61.910913,14.98262
6220052,2561,642134,7347013,2506.0,2008-05-17,24,4,1,41,15,...,0.0,0.0,1.0,96.0,97.0,100.0,1.0,1.0,66.210979,18.15882
3045147,1272,479880,7327307,2421.0,2019-05-12,7,0,88,908,10,...,0.0,0.0,-1.0,100.0,100.0,100.0,1.0,1.0,66.065722,14.55555
9816488,4039,560931,6504816,581.0,2007-06-26,214,166,0,49,43,...,236.0,97.0,3.0,66.0,42.0,57.0,3.0,,58.679263,16.050821
5632202,2340,376209,6218437,1260.0,2016-07-31,208,16,4,516,326324,...,357.0,29.0,2.0,53.0,44.0,59.0,3.0,2.0,56.094997,13.010112
