# STORMS PROJECT

# =========================================

# This notebook contains an exploratory data analysis (EDA) of NOAA NHC data on Atlantic basin storms from 1975 to 2021. There are two main sections:

## A. Clean your data. Conduct some basic data cleaning and consistency checks in Jupyter to ensure your data is ready for further analysis.

## B. Understand your data. Develop a basic understanding of your data set by reviewing the variables and performing basic descriptive statistical analysis. 

# =========================================

# A. Clean your data. Conduct some basic data cleaning and consistency checks in Jupyter to ensure your data is ready for further analysis.

# =========================================

# 1. Import libraries and data

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [3]:
# Define path

# Root folder of the project; later cells join the '02 Data' sub-folders onto it.
# NOTE(review): absolute, user-specific location — adjust before running on another machine.
path = r'/Users/matthewschleigh/Desktop/CareerFoundry/Achievement 6/Storms Project'

In [4]:
# Import data

# Build the location of the original CSV first, then load it into a DataFrame
raw_data_file = os.path.join(path, '02 Data', 'Original Data', 'storms.csv')
storms = pd.read_csv(raw_data_file)

In [5]:
# Check output

storms.shape

(12230, 13)

In [6]:
storms.head()

Unnamed: 0,name,year,month,day,hour,lat,long,status,category,wind,pressure,tropicalstorm_force_diameter,hurricane_force_diameter
0,Amy,1975,6,27,0,27.5,-79.0,tropical depression,-1,25,1013,,
1,Amy,1975,6,27,6,28.5,-79.0,tropical depression,-1,25,1013,,
2,Amy,1975,6,27,12,29.5,-79.0,tropical depression,-1,25,1013,,
3,Amy,1975,6,27,18,30.5,-79.0,tropical depression,-1,25,1013,,
4,Amy,1975,6,28,0,31.5,-78.8,tropical depression,-1,25,1012,,


# 2. Rename columns

In [13]:
# Map the original column names to more descriptive ones, then rename in place
column_renames = {
    'wind': 'wind_knots',
    'pressure': 'pressure_hpa',
    'tropicalstorm_force_diameter': 'trop_storm_force_wind_diameter',
    'hurricane_force_diameter': 'hurricane_force_wind_diameter',
    'lat': 'latitude',
    'long': 'longitude',
}
storms.rename(columns=column_renames, inplace=True)
              

In [14]:
# Check code

storms.head()

Unnamed: 0,name,year,month,day,hour,latitude,longitude,status,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter
0,Amy,1975,6,27,0,27.5,-79.0,tropical depression,-1,25,1013,,
1,Amy,1975,6,27,6,28.5,-79.0,tropical depression,-1,25,1013,,
2,Amy,1975,6,27,12,29.5,-79.0,tropical depression,-1,25,1013,,
3,Amy,1975,6,27,18,30.5,-79.0,tropical depression,-1,25,1013,,
4,Amy,1975,6,28,0,31.5,-78.8,tropical depression,-1,25,1012,,


# 3. Check missing values

In [15]:
# Frequency of every distinct value; dropna=False keeps NaN in the tally so
# missing entries show up as their own row.
storms['name'].value_counts(dropna=False)

Emily        217
Bonnie       209
Claudette    194
Alberto      184
Felix        178
            ... 
AL022003       4
AL092003       4
AL012000       4
Nestor         3
Five           2
Name: name, Length: 220, dtype: int64

In [19]:
storms['trop_storm_force_wind_diameter'].value_counts(dropna=False)

NaN      6509
0.0      1106
90.0      273
120.0     220
100.0     217
         ... 
780.0       1
335.0       1
395.0       1
10.0        1
525.0       1
Name: trop_storm_force_wind_diameter, Length: 118, dtype: int64

In [20]:
storms['hurricane_force_wind_diameter'].value_counts(dropna=False)

NaN      6509
0.0      4067
40.0      161
50.0      140
30.0      131
20.0      127
60.0      112
25.0       83
70.0       80
90.0       72
100.0      70
35.0       69
45.0       66
65.0       59
15.0       47
80.0       42
85.0       42
55.0       39
75.0       38
150.0      36
120.0      32
135.0      28
110.0      26
105.0      25
10.0       25
130.0      22
160.0      17
165.0      13
125.0      12
180.0       8
140.0       8
95.0        8
115.0       5
170.0       4
190.0       3
230.0       2
300.0       1
200.0       1
Name: hurricane_force_wind_diameter, dtype: int64

In [21]:
storms['year'].value_counts(dropna=False)

1995    652
2020    570
2005    498
2012    454
2003    422
1998    410
2004    410
2010    402
2016    396
2001    371
2021    371
1990    355
1989    354
2008    335
2019    330
2011    323
2000    318
1996    315
2017    306
1979    301
2002    285
2018    266
1985    263
1988    252
1984    236
2015    220
2007    213
1999    210
1994    206
2013    202
2006    190
1992    185
1981    164
1980    161
1997    154
2009    153
2014    139
1991    131
1993    128
1982    105
1975     86
1987     80
1983     79
1986     70
1978     54
1977     53
1976     52
Name: year, dtype: int64

In [22]:
storms['month'].value_counts(dropna=False)

9     5034
8     2812
10    2024
7      962
11     734
6      403
12     125
5       93
1       30
4       13
Name: month, dtype: int64

In [23]:
storms['day'].value_counts(dropna=False)

1     449
29    427
30    427
2     426
25    421
6     419
28    417
15    415
8     413
7     413
24    404
3     402
26    401
16    400
22    399
23    396
5     396
17    395
9     395
18    392
14    391
4     386
27    385
21    379
11    378
19    373
20    371
13    369
12    369
10    367
31    255
Name: day, dtype: int64

In [24]:
storms['hour'].value_counts(dropna=False)

12    2999
18    2987
0     2955
6     2944
3       37
21      32
15      27
16      21
14      20
9       18
2       17
8       17
22      17
10      16
11      16
13      15
5       14
4       14
7       13
23      13
17      12
1        9
19       9
20       8
Name: hour, dtype: int64

In [25]:
storms['latitude'].value_counts(dropna=False)

31.5    74
20.0    72
20.5    63
29.0    63
18.0    61
        ..
49.5     1
48.2     1
51.9     1
8.8      1
8.7      1
Name: latitude, Length: 410, dtype: int64

In [26]:
storms['longitude'].value_counts(dropna=False)

-84.0     37
-62.8     35
-58.0     35
-40.0     33
-72.0     33
          ..
-19.4      1
-18.2      1
-17.4      1
-107.7     1
-20.6      1
Name: longitude, Length: 865, dtype: int64

In [27]:
storms['status'].value_counts(dropna=False)

tropical storm         5537
hurricane              3720
tropical depression    2973
Name: status, dtype: int64

In [28]:
storms['category'].value_counts(dropna=False)

 0    5536
-1    2973
 1    1974
 2     766
 3     455
 4     440
 5      86
Name: category, dtype: int64

In [29]:
storms['wind_knots'].value_counts(dropna=False)

30     1859
35     1209
45     1085
40     1032
50      871
25      824
55      743
65      708
60      596
70      501
75      449
80      316
90      297
85      278
20      231
95      191
100     184
110     142
115     142
105     129
120      99
125      98
130      69
15       50
140      34
135      32
145      24
150      16
10        9
155       8
160       4
Name: wind_knots, dtype: int64

In [30]:
storms['pressure_hpa'].value_counts(dropna=False)

1005    663
1006    644
1007    596
1008    575
1004    487
       ... 
902       1
888       1
889       1
900       1
907       1
Name: pressure_hpa, Length: 124, dtype: int64

## Missing values (6,509 rows each) were found in 'trop_storm_force_wind_diameter' and 'hurricane_force_wind_diameter'. NaN is kept for the moment, since replacing it with "0" would imply the storm had no size, whereas NaN signals that the data was not collected.

# 4. Check duplicates

In [31]:
# Boolean mask marking rows that repeat an earlier row in every column
duplicate_mask = storms.duplicated()
storms_dups = storms.loc[duplicate_mask]

In [32]:
storms_dups

Unnamed: 0,name,year,month,day,hour,latitude,longitude,status,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter


## The empty result (column headers only, no rows) means there are no duplicate rows in the df.

# 5. Check data types

In [34]:
storms.dtypes

name                               object
year                                int64
month                               int64
day                                 int64
hour                                int64
latitude                          float64
longitude                         float64
status                             object
category                            int64
wind_knots                          int64
pressure_hpa                        int64
trop_storm_force_wind_diameter    float64
hurricane_force_wind_diameter     float64
dtype: object

## All data types match what is expected for each variable. 'category' could arguably be changed to object, since the number represents a storm classification rather than a measured quantity, but I'll leave it as an integer for the time being.

# 6. Check for mixed types

In [35]:
# Print the name of every column that holds values of more than one Python type.
# Series.map(type) replaces DataFrame.applymap, which is deprecated since
# pandas 2.1; counting distinct types also avoids indexing the first row, so an
# empty DataFrame no longer raises IndexError.
for col in storms.columns:
    value_types = storms[col].map(type)
    # More than one distinct type means at least one value differs from the rest.
    if value_types.nunique() > 1:
        print(col)

## No output means that no column in the df contains mixed data types.

# 7. Additional missing values check

In [36]:
storms.isnull().sum()

name                                 0
year                                 0
month                                0
day                                  0
hour                                 0
latitude                             0
longitude                            0
status                               0
category                             0
wind_knots                           0
pressure_hpa                         0
trop_storm_force_wind_diameter    6509
hurricane_force_wind_diameter     6509
dtype: int64

# 8. Check output again 

In [37]:
storms.head()

Unnamed: 0,name,year,month,day,hour,latitude,longitude,status,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter
0,Amy,1975,6,27,0,27.5,-79.0,tropical depression,-1,25,1013,,
1,Amy,1975,6,27,6,28.5,-79.0,tropical depression,-1,25,1013,,
2,Amy,1975,6,27,12,29.5,-79.0,tropical depression,-1,25,1013,,
3,Amy,1975,6,27,18,30.5,-79.0,tropical depression,-1,25,1013,,
4,Amy,1975,6,28,0,31.5,-78.8,tropical depression,-1,25,1012,,


In [38]:
storms.tail()

Unnamed: 0,name,year,month,day,hour,latitude,longitude,status,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter
12225,Wanda,2021,11,6,6,37.6,-38.3,tropical storm,0,45,1000,110.0,0.0
12226,Wanda,2021,11,6,12,37.2,-38.4,tropical storm,0,40,1001,100.0,0.0
12227,Wanda,2021,11,6,18,37.1,-38.0,tropical storm,0,35,1002,100.0,0.0
12228,Wanda,2021,11,7,0,37.4,-37.4,tropical storm,0,35,1003,60.0,0.0
12229,Wanda,2021,11,7,6,38.1,-36.4,tropical storm,0,35,1004,60.0,0.0


In [39]:
storms.shape

(12230, 13)

# =========================================

# B. Understand your data. Develop a basic understanding of your data set by reviewing the variables and performing basic descriptive statistical analysis. 

In [40]:
# Print head for reference

storms.head(15)

Unnamed: 0,name,year,month,day,hour,latitude,longitude,status,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter
0,Amy,1975,6,27,0,27.5,-79.0,tropical depression,-1,25,1013,,
1,Amy,1975,6,27,6,28.5,-79.0,tropical depression,-1,25,1013,,
2,Amy,1975,6,27,12,29.5,-79.0,tropical depression,-1,25,1013,,
3,Amy,1975,6,27,18,30.5,-79.0,tropical depression,-1,25,1013,,
4,Amy,1975,6,28,0,31.5,-78.8,tropical depression,-1,25,1012,,
5,Amy,1975,6,28,6,32.4,-78.7,tropical depression,-1,25,1012,,
6,Amy,1975,6,28,12,33.3,-78.0,tropical depression,-1,25,1011,,
7,Amy,1975,6,28,18,34.0,-77.0,tropical depression,-1,30,1006,,
8,Amy,1975,6,29,0,34.4,-75.8,tropical storm,0,35,1004,,
9,Amy,1975,6,29,6,34.0,-74.8,tropical storm,0,40,1002,,


# 1. Check mean, min, and max values for continuous variables.

In [42]:
storms.describe()

Unnamed: 0,year,month,day,hour,latitude,longitude,category,wind_knots,pressure_hpa,trop_storm_force_wind_diameter,hurricane_force_wind_diameter
count,12230.0,12230.0,12230.0,12230.0,12230.0,12230.0,12230.0,12230.0,12230.0,5721.0,5721.0
mean,2001.880458,8.774734,15.812428,9.118479,24.782739,-64.006778,0.33426,53.704824,991.962388,144.103304,18.129698
std,12.291926,1.262258,8.974545,6.727572,8.547812,19.718927,1.276595,26.272792,19.597726,125.121619,35.341584
min,1975.0,1.0,1.0,0.0,7.2,-109.3,-1.0,10.0,882.0,0.0,0.0
25%,1993.0,8.0,8.0,6.0,17.5,-80.6,0.0,35.0,985.0,50.0,0.0
50%,2003.0,9.0,16.0,12.0,24.6,-64.2,0.0,45.0,999.0,120.0,0.0
75%,2012.0,9.0,24.0,18.0,31.3,-48.4,1.0,65.0,1006.0,210.0,25.0
max,2021.0,12.0,31.0,23.0,51.9,-6.0,5.0,160.0,1022.0,870.0,300.0


## The aggregate values for the time variables look plausible given the expected minimums and maximums (no value greater than 12 for month, 31 for day, or 23 for hour). The 'hurricane_force_wind_diameter' values of '0' (as opposed to 'NaN') should be investigated further.

In [43]:
# Export descriptive statistics as csv

# Resolve the destination file first, then write the summary table to it
stats_file = os.path.join(path, '02 Data', 'Prepared Data', 'storms_stats.csv')
storms.describe().to_csv(stats_file)

# 2. Export cleaned dataset as csv.

In [44]:
# Write the cleaned DataFrame to the prepared-data folder
# NOTE(review): to_csv also writes the index as an unnamed first column by default — confirm downstream notebooks expect it.
cleaned_file = os.path.join(path, '02 Data', 'Prepared Data', 'storms_cleaned.csv')
storms.to_csv(cleaned_file)