# Using NumPy to Compute the Mean, Median, Variance, and Standard Deviation of a Dataset

<b> Import NumPy </b>

In [1]:
import numpy as np

<b> Load the normal_distribution.csv dataset </b>

In [2]:
# Read the comma-separated values into a NumPy array; ending the cell on the
# bare name displays the loaded data.
csv_path = 'Datasets/normal_distribution.csv'
dataset = np.genfromtxt(csv_path, delimiter=',')
dataset

array([[ 99.14931546, 104.03852715, 107.43534677,  97.85230675,
         98.74986914,  98.80833412,  96.81964892,  98.56783189],
       [ 92.02628776,  97.10439252,  99.32066924,  97.24584816,
         92.9267508 ,  92.65657752, 105.7197853 , 101.23162942],
       [ 95.66253664,  95.17750125,  90.93318132, 110.18889465,
         98.80084371, 105.95297652,  98.37481387, 106.54654286],
       [ 91.37294597, 100.96781394, 100.40118279, 113.42090475,
        105.48508838,  91.6604946 , 106.1472841 ,  95.08715803],
       [101.20862522, 103.5730309 , 100.28690912, 105.85269352,
         93.37126331, 108.57980357, 100.79478953,  94.20019732],
       [102.80387079,  98.29687616,  93.24376389,  97.24130034,
         89.03452725,  96.2832753 , 104.60344836, 101.13442416],
       [106.71751618, 102.97585605,  98.45723272, 100.72418901,
        106.39798503,  95.46493436,  94.35373179, 106.83273763],
       [ 96.02548256, 102.82360856, 106.47551845, 101.34745901,
        102.45651798,  98.7476749

<b> Print the first two rows of the dataset </b>

In [3]:
# Slicing only the first axis keeps every column of the first two rows.
dataset[:2]

array([[ 99.14931546, 104.03852715, 107.43534677,  97.85230675,
         98.74986914,  98.80833412,  96.81964892,  98.56783189],
       [ 92.02628776,  97.10439252,  99.32066924,  97.24584816,
         92.9267508 ,  92.65657752, 105.7197853 , 101.23162942]])

<b> Calculate the mean of the third row </b>

In [4]:
# Indices are zero-based, so index 2 selects the third row.
third_row = dataset[2]
np.mean(third_row)

100.20466135250001

<b> Calculate the mean of the last column (index -1 selects the last element along an axis) </b>

In [5]:
# ':' keeps every row; -1 picks the last column.
last_column = dataset[:, -1]
np.mean(last_column)

100.4404927375

<b> Calculate the mean of the submatrix formed by the first three rows and the first three columns </b>

In [6]:
# Rows 0-2 crossed with columns 0-2; the mean is taken over all nine values.
top_left_block = dataset[:3, :3]
np.mean(top_left_block)

97.87197312333333

<b> Calculate the mean for the last row of the dataset </b>

In [7]:
# -1 on the first axis selects the last row; this cell computes its mean.
last_row = dataset[-1]
np.mean(last_row)

99.34233100624999

<b> Calculate the mean of the last three columns, using reverse indexing to select them </b>

In [8]:
# The slice -3: runs from the third-to-last column through the end, for every row.
last_three_columns = dataset[:, -3:]
np.mean(last_three_columns)

99.96791031722222

<b> Calculate the median of each row by aggregating the values along axis 1 </b>

In [9]:
# axis=1 collapses the columns, leaving one median per row.
row_medians = np.median(dataset, axis=1)
row_medians

array([ 98.77910163,  97.17512034,  98.58782879, 100.68449836,
       101.00170737,  97.76908825, 101.85002253, 100.04756697,
       102.24292555,  99.59514997, 100.4955753 ,  99.8860714 ,
        99.00647994,  98.67276177, 102.44376222,  96.61933565,
       104.0968893 , 100.72023043,  98.70877396,  99.75008654,
       104.89344428, 101.00634942,  98.30543801,  99.18748092])

<b> Calculate the variance for each column </b>

In [10]:
# axis=0 collapses the rows, leaving one variance per column.
# np.var uses ddof=0 by default, i.e. the population variance.
column_variances = np.var(dataset, axis=0)
column_variances

array([23.64757465, 29.78886109, 20.50542011, 26.03204443, 28.38853175,
       19.09960817, 17.67291174, 16.17923204])

<b> Calculate the variance of the intersection of the last two rows and the first two columns </b>

In [11]:
# Last two rows crossed with the first two columns; variance over those values.
bottom_left_block = dataset[-2:, :2]
np.var(bottom_left_block)

4.674691991769191

<b> Calculate the standard deviation for the dataset </b>

In [12]:
# With no axis given, the standard deviation is computed over every value in the array.
dataset_std = np.std(dataset)
dataset_std

4.838197554269257

# Forest Fire Size and Temperature Analysis

We will use pandas features to derive some insights from a forest fire dataset. We will get the mean size of forest fires, what the largest recorded fire in our dataset is, and whether the number of forest fires grows proportionally to the temperature in each month.

Our forest fires dataset has the following structure:

    X: X-axis spatial coordinate within the Montesinho park map: 1 to 9
    Y: Y-axis spatial coordinate within the Montesinho park map: 2 to 9
    month: Month of the year: 'jan' to 'dec'
    day: Day of the week: 'mon' to 'sun'
    FFMC: FFMC index from the FWI system: 18.7 to 96.20
    DMC: DMC index from the FWI system: 1.1 to 291.3
    DC: DC index from the FWI system: 7.9 to 860.6
    ISI: ISI index from the FWI system: 0.0 to 56.10
    temp: Temperature in degrees Celsius: 2.2 to 33.30
    RH: Relative humidity in %: 15.0 to 100
    wind: Wind speed in km/h: 0.40 to 9.40
    rain: Outside rain in mm/m2: 0.0 to 6.4
    area: The burned area of the forest (in ha): 0.00 to 1090.84

<b> Import pandas </b>

In [13]:
import pandas as pd

<b> Load the forestfires.csv dataset </b>

In [14]:
# Read the forest fire records into a DataFrame. This rebinds `dataset`, which
# referred to the NumPy array in the first part of the notebook.
forestfires_csv = 'Datasets/forestfires.csv'
dataset = pd.read_csv(forestfires_csv)

<b> Print the first two rows of the dataset to get a feeling for its structure </b>

In [15]:
# Positional slice of the first two rows, to get a feel for the columns.
dataset.iloc[:2]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0


<b> Filter the dataset so that it only contains entries that have an area larger than 0 </b>

In [16]:
# Keep only the rows whose burned area is strictly greater than zero.
has_burned_area = dataset['area'] > 0
area_dataset = dataset.loc[has_burned_area]
area_dataset.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
138,9,9,jul,tue,85.8,48.3,313.4,3.9,18.0,42,2.7,0.0,0.36
139,1,4,sep,tue,91.0,129.5,692.6,7.0,21.7,38,2.2,0.0,0.43
140,2,5,sep,mon,90.9,126.5,686.5,7.0,21.9,39,1.8,0.0,0.47
141,1,2,aug,wed,95.5,99.9,513.3,13.2,23.3,31,4.5,0.0,0.55
142,8,6,aug,fri,90.1,108.0,529.8,12.5,21.2,51,8.9,0.0,0.61


<b> Get the mean, min, max, and std of the area column </b>

In [17]:
# Mean burned area over the filtered rows.
burned_area = area_dataset['area']
burned_area.mean()

24.60018518518518

In [18]:
# Smallest burned area among the filtered rows.
burned_area = area_dataset['area']
burned_area.min()

0.09

In [19]:
# Largest burned area among the filtered rows.
burned_area = area_dataset['area']
burned_area.max()

1090.84

In [20]:
# Standard deviation of the burned area. pandas' Series.std defaults to ddof=1
# (sample standard deviation), unlike np.std, which defaults to ddof=0.
burned_area = area_dataset['area']
burned_area.std()

86.50163460412125

<b> Sort the filtered dataset using the area column and print the last 20 entries </b>

In [21]:
# The default sort order is ascending, so the largest burned areas end up last
# and tail(20) returns up to 20 of the biggest fires.
sorted_by_area = area_dataset.sort_values(by='area')
sorted_by_area.tail(20)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
469,6,3,apr,sun,91.0,14.6,25.6,12.3,13.7,33,9.4,0.0,61.13
228,4,6,sep,sun,93.5,149.3,728.6,8.1,28.3,26,3.1,0.0,64.1
473,9,4,jun,sat,90.5,61.1,252.6,9.4,24.5,50,3.1,0.0,70.32
392,1,3,sep,sun,91.0,276.3,825.1,7.1,21.9,43,4.0,0.0,70.76
229,8,6,aug,sat,92.2,81.8,480.8,11.9,16.4,43,4.0,0.0,71.3
457,1,4,aug,wed,91.7,191.4,635.9,7.8,19.9,50,4.0,0.0,82.75
293,7,6,jul,tue,93.1,180.4,430.8,11.0,26.9,28,5.4,0.0,86.45
230,4,4,sep,wed,92.9,133.3,699.6,9.2,26.4,21,4.5,0.0,88.49
231,1,5,sep,sun,93.5,149.3,728.6,8.1,27.8,27,3.1,0.0,95.18
232,6,4,sep,tue,91.0,129.5,692.6,7.0,18.7,43,2.7,0.0,103.39


<b> Get the median of the area column and visually compare it to the mean value</b>

In [22]:
# Median burned area over the filtered rows.
burned_area = area_dataset['area']
burned_area.median()

6.37

In [23]:
# Compute both statistics once, then print them side by side for comparison.
area_median = area_dataset["area"].median()
area_mean = area_dataset["area"].mean()
print(f'The median of area column is {area_median} and the mean of area columns is {area_mean}')

The median of area column is 6.37 and the mean of area columns is 24.60018518518518


<b> Get a list of unique values from the month column of the dataset </b>

In [24]:
# Distinct month labels, taken from the full (unfiltered) dataset.
month_column = dataset['month']
months = month_column.unique()
months

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

<b> Get the number of entries for the month of March </b>

In [25]:
# Number of rows whose month is March.
is_march = dataset['month'] == 'mar'
len(dataset[is_march])

54

<b> Iterate over all the months, filter our dataset for the rows containing the given month, and calculate the mean temperature. Print a statement with the number of fires, the mean temperature, and the month </b>

In [26]:
# For every month present in the data, report the mean temperature of its rows
# (rounded to one decimal) and how many rows, i.e. recorded fires, it has.
for month in months:
    df = dataset.loc[dataset['month'] == month]
    mean_temp = round(df["temp"].mean(), 1)
    fire_count = len(df)
    # The trailing newline on the last field leaves a blank line between months.
    print(
        month,
        f'Mean temperature: {mean_temp}',
        f'Fires in {month}: {fire_count}\n',
        sep='\n',
    )

mar
Mean temperature: 13.1
Fires in mar: 54

oct
Mean temperature: 17.1
Fires in oct: 15

aug
Mean temperature: 21.6
Fires in aug: 184

sep
Mean temperature: 19.6
Fires in sep: 172

apr
Mean temperature: 12.0
Fires in apr: 9

jun
Mean temperature: 20.5
Fires in jun: 17

jul
Mean temperature: 22.1
Fires in jul: 32

feb
Mean temperature: 9.6
Fires in feb: 20

jan
Mean temperature: 5.2
Fires in jan: 2

dec
Mean temperature: 4.5
Fires in dec: 9

may
Mean temperature: 14.7
Fires in may: 2

nov
Mean temperature: 11.8
Fires in nov: 1

