In [1]:
# Report the version of the interpreter the kernel itself is running.
# A shell call to `python3` resolves through PATH and can point at a
# different installation than the one executing this notebook.
import platform

platform.python_version()

Python 3.8.12


In [2]:
import functools
import os
import re
import sys

import numpy as np
import pandas as pd

pd.__version__, np.__version__

('1.4.1', '1.21.2')

In [3]:
# Absolute locations of the project checkout and of the raw weather export.
project_dir = '/Users/gurdeep/Documents/tb2/DSMP/G32-Butterfly-Data'
dataset_filepath = '/Users/gurdeep/Documents/tb2/DSMP/meteorological_data_bristol_lulsgate.csv'
# The helper modules live in a sub-folder of the project.
module_dir = f'{project_dir}/modules'

# Work from the project root and keep the resolved path for later use.
os.chdir(project_dir)
current_working_dir = os.getcwd()
print("Current working dir:", current_working_dir)

# True only when both directories and the dataset file exist.
paths_ok = all((
    os.path.isdir(project_dir),
    os.path.isdir(module_dir),
    os.path.isfile(dataset_filepath),
))
print("All paths correct:", paths_ok)

Current working dir: /Users/gurdeep/Documents/tb2/DSMP/G32-Butterfly-Data
All paths correct: True


In [130]:
# Import the project's Preprocessing helpers by putting the modules folder at
# the front of the import path. This avoids switching the working directory
# back and forth, which would leave the notebook stranded in module_dir if the
# import raised.
if module_dir not in sys.path:
    sys.path.insert(0, module_dir)
import Preprocessing

## Loading File

In [131]:
# Parse the semicolon-separated weather export into a DataFrame.
df = pd.read_csv(dataset_filepath, sep=";")

# Schema / non-null summary first, then a preview of the leading rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Date Time                 210489 non-null  object 
 1   Temperature               210452 non-null  float64
 2   Dewpoint Temperature      210453 non-null  float64
 3   Wind Speed                210074 non-null  float64
 4   Wind Direction            205007 non-null  float64
 5   Atmospheric Pressure      210457 non-null  float64
 6   Visibility                210463 non-null  float64
 7   Wind Chill Temperature    204768 non-null  float64
 8   Relative Humidity         209967 non-null  float64
 9   Wind Chill Temperature.1  204768 non-null  float64
dtypes: float64(9), object(1)
memory usage: 16.1+ MB


Unnamed: 0,Date Time,Temperature,Dewpoint Temperature,Wind Speed,Wind Direction,Atmospheric Pressure,Visibility,Wind Chill Temperature,Relative Humidity,Wind Chill Temperature.1
0,2022-02-14T23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,-1.0
1,2022-02-14T23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,-1.0
2,2022-02-14T22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,-1.0
3,2022-02-14T22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,0.0
4,2022-02-14T21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,1.0


## Renaming columns

In [132]:
# Scratch helper: every current column name mapped to an empty string
# (presumably a copy-paste starting point for the rename mapping).
dict.fromkeys(df.columns, '')

{'Date Time': '',
 'Temperature': '',
 'Dewpoint Temperature': '',
 'Wind Speed': '',
 'Wind Direction': '',
 'Atmospheric Pressure': '',
 'Visibility': '',
 'Wind Chill Temperature': '',
 'Relative Humidity': '',
 'Wind Chill Temperature.1': ''}

In [133]:
# Raw CSV header -> short snake_case column name, in original column order.
mapper = dict([
    ('Date Time', 'datetime'),
    ('Temperature', 'tempt'),
    ('Dewpoint Temperature', 'dewpoint_tempt'),
    ('Wind Speed', 'wind_speed'),
    ('Wind Direction', 'wind_dir'),
    ('Atmospheric Pressure', 'atm_pressure'),
    ('Visibility', 'visibility'),
    ('Wind Chill Temperature', 'wind_chill_tempt'),
    ('Relative Humidity', 'relative_humidity'),
    ('Wind Chill Temperature.1', 'wind_chill_tempt.1'),
])

In [134]:
# Apply the header -> snake_case mapping; headers not present in `mapper`
# are left as they are.
df = df.rename(columns=mapper)

In [135]:
# Re-inspect the frame after renaming: the column list should now show the
# short snake_case names from `mapper`.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 10 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   datetime            210489 non-null  object 
 1   tempt               210452 non-null  float64
 2   dewpoint_tempt      210453 non-null  float64
 3   wind_speed          210074 non-null  float64
 4   wind_dir            205007 non-null  float64
 5   atm_pressure        210457 non-null  float64
 6   visibility          210463 non-null  float64
 7   wind_chill_tempt    204768 non-null  float64
 8   relative_humidity   209967 non-null  float64
 9   wind_chill_tempt.1  204768 non-null  float64
dtypes: float64(9), object(1)
memory usage: 16.1+ MB


Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1
0,2022-02-14T23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,-1.0
1,2022-02-14T23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,-1.0
2,2022-02-14T22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,-1.0
3,2022-02-14T22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,0.0
4,2022-02-14T21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,1.0


## Creating columns for holding units

In [136]:
# Add one constant "<measurement>_unit" column per measurement, holding the
# unit label for that measurement's values.
units_by_measurement = {
    'tempt': 'celsius',
    'dewpoint_tempt': 'celsius',
    'wind_speed': 'm/s',
    'wind_dir': 'degrees from north',
    'atm_pressure': 'mbar',
    'visibility': 'km',
    'wind_chill_tempt': 'celsius',
    'relative_humidity': 'percentage (%)',
}
for measurement, unit in units_by_measurement.items():
    df[f'{measurement}_unit'] = unit

In [137]:
# Check that the eight *_unit columns were appended (18 columns expected in
# total) and preview their values.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   datetime                210489 non-null  object 
 1   tempt                   210452 non-null  float64
 2   dewpoint_tempt          210453 non-null  float64
 3   wind_speed              210074 non-null  float64
 4   wind_dir                205007 non-null  float64
 5   atm_pressure            210457 non-null  float64
 6   visibility              210463 non-null  float64
 7   wind_chill_tempt        204768 non-null  float64
 8   relative_humidity       209967 non-null  float64
 9   wind_chill_tempt.1      204768 non-null  float64
 10  tempt_unit              210489 non-null  object 
 11  dewpoint_tempt_unit     210489 non-null  object 
 12  wind_speed_unit         210489 non-null  object 
 13  wind_dir_unit           210489 non-null  object 
 14  atm_pressure_unit   

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,2022-02-14T23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
1,2022-02-14T23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
2,2022-02-14T22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
3,2022-02-14T22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,0.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
4,2022-02-14T21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


## NaN Values

In [138]:
# Summarise missing values with the project's NanAnalyzer helper. The printed
# report groups columns into all-NaN, partially-NaN and NaN-free, and gives
# per-column NaN counts.
nan = Preprocessing.NanAnalyzer(df)
nan.printNaNInfo()


NAN INFO IN THE DATASET:

No of columns with only NaN values: 0
Column list with only NaN values:
[]

No of columns with partial NaN values: 9
Column list with partial NaN values:
['tempt', 'dewpoint_tempt', 'wind_speed', 'wind_dir', 'atm_pressure', 'visibility', 'wind_chill_tempt', 'relative_humidity', 'wind_chill_tempt.1']

No of columns with no NaN values: 9
Column list with no NaN values:
['datetime', 'tempt_unit', 'dewpoint_tempt_unit', 'wind_speed_unit', 'wind_dir_unit', 'atm_pressure_unit', 'visibility_unit', 'wind_chill_tempt_unit', 'relative_humidity_unit']

Total Values in dataset: 3788802

Total NaN Values in the dataset: 17992

Total Partial NaN Values in the dataset: 17992

Partial NaN Values Info:
              Columns  Partial_NaN_Count  NaN_Vals_By_Total_Vals
0               tempt                 37                   0.000
1      dewpoint_tempt                 36                   0.000
2          wind_speed                415                   0.002
3            wind_

#### Imputing null values from the previous values (forward fill). This is reasonable because the readings are only about 30 minutes apart, so neighbouring values should be close.

In [139]:
# Fill each gap from the most recent *earlier* reading.
# The file appears to be ordered newest-first (the previews show row 0 as the
# latest timestamp), so a forward fill along the index would copy each missing
# value from a later observation. Order the rows chronologically, forward-fill,
# then restore the original row order.
# Gaps at the very start of the record have no earlier reading and stay NaN;
# the NaN report in the next cell will show them if any exist.
df = (
    df.sort_values(
        'datetime',
        key=lambda col: pd.to_datetime(col, utc=True),
        kind='mergesort',  # stable, so rows with equal timestamps keep their order
    )
    .ffill()
    .sort_index()
)

In [140]:
# Re-run the NaN report to confirm that imputation left no missing values,
# then drop the reference to the analyzer object.
nan = Preprocessing.NanAnalyzer(df)
nan.printNaNInfo()
del nan


NAN INFO IN THE DATASET:

No of columns with only NaN values: 0
Column list with only NaN values:
[]

No of columns with partial NaN values: 0
Column list with partial NaN values:
[]

No of columns with no NaN values: 18
Column list with no NaN values:
['datetime', 'tempt', 'dewpoint_tempt', 'wind_speed', 'wind_dir', 'atm_pressure', 'visibility', 'wind_chill_tempt', 'relative_humidity', 'wind_chill_tempt.1', 'tempt_unit', 'dewpoint_tempt_unit', 'wind_speed_unit', 'wind_dir_unit', 'atm_pressure_unit', 'visibility_unit', 'wind_chill_tempt_unit', 'relative_humidity_unit']

Total Values in dataset: 3788802

Total NaN Values in the dataset: 0

Total Partial NaN Values in the dataset: 0

Partial NaN Values Info:
Empty DataFrame
Columns: [Columns, Partial_NaN_Count, NaN_Vals_By_Total_Vals]
Index: []

Total NaN Values/Total Datset Values: 0.0 %


In [141]:
# After imputation every column should report the full row count as non-null.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   datetime                210489 non-null  object 
 1   tempt                   210489 non-null  float64
 2   dewpoint_tempt          210489 non-null  float64
 3   wind_speed              210489 non-null  float64
 4   wind_dir                210489 non-null  float64
 5   atm_pressure            210489 non-null  float64
 6   visibility              210489 non-null  float64
 7   wind_chill_tempt        210489 non-null  float64
 8   relative_humidity       210489 non-null  float64
 9   wind_chill_tempt.1      210489 non-null  float64
 10  tempt_unit              210489 non-null  object 
 11  dewpoint_tempt_unit     210489 non-null  object 
 12  wind_speed_unit         210489 non-null  object 
 13  wind_dir_unit           210489 non-null  object 
 14  atm_pressure_unit   

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,2022-02-14T23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
1,2022-02-14T23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
2,2022-02-14T22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
3,2022-02-14T22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,0.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
4,2022-02-14T21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


## DataType Check

In [142]:
# Scratch helper: every current column name mapped to an empty string
# (presumably a copy-paste starting point for the expected-dtype mapping).
dict.fromkeys(df.columns, '')

{'datetime': '',
 'tempt': '',
 'dewpoint_tempt': '',
 'wind_speed': '',
 'wind_dir': '',
 'atm_pressure': '',
 'visibility': '',
 'wind_chill_tempt': '',
 'relative_humidity': '',
 'wind_chill_tempt.1': '',
 'tempt_unit': '',
 'dewpoint_tempt_unit': '',
 'wind_speed_unit': '',
 'wind_dir_unit': '',
 'atm_pressure_unit': '',
 'visibility_unit': '',
 'wind_chill_tempt_unit': '',
 'relative_humidity_unit': ''}

In [143]:
# Instantiate the project's DataTypeAnalyzer helper; the following cells use it
# to inspect, validate and convert column dtypes.
dta = Preprocessing.DataTypeAnalyzer()

In [144]:
# Per-column dtype overview (rows: int64 / float64 / datetime / object, with
# '-' where a dtype does not apply).
# NOTE(review): the exact meaning of each row is defined in Preprocessing - confirm there.
dta.dtypeInfo(df)

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,-,int64,int64,int64,int64,int64,int64,int64,int64,int64,-,-,-,-,-,-,-,-
1,-,float64,float64,float64,float64,float64,float64,float64,float64,float64,-,-,-,-,-,-,-,-
2,datetime,datetime,datetime,datetime,datetime,datetime,datetime,datetime,datetime,datetime,-,-,-,-,-,-,-,-
3,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object,object


In [145]:
# Expected dtype per column: the timestamp, then the numeric measurements,
# then the unit-label columns. Key order is kept exactly as before.
_float_columns = [
    'tempt',
    'dewpoint_tempt',
    'wind_speed',
    'wind_dir',
    'atm_pressure',
    'visibility',
    'wind_chill_tempt',
    'relative_humidity',
    'wind_chill_tempt.1',
]
_unit_columns = [
    'tempt_unit',
    'visibility_unit',
    'atm_pressure_unit',
    'dewpoint_tempt_unit',
    'wind_speed_unit',
    'wind_dir_unit',
    'wind_chill_tempt_unit',
    'relative_humidity_unit',
]
dtype_mapper = {
    'datetime': 'datetime',
    **dict.fromkeys(_float_columns, 'float64'),
    **dict.fromkeys(_unit_columns, 'str'),
}

In [146]:
# List, per column, the values that do not match the dtype expected in
# dtype_mapper; an empty MismatchedValues list means the column is clean.
# NOTE(review): matching rules live in Preprocessing - confirm there.
dta.findUnExpectedDtypeValues(df, dtype_mapper)

Unnamed: 0,Columns,ExpectedDtype,MismatchedValues
0,datetime,datetime,[]
1,tempt,float64,[]
2,dewpoint_tempt,float64,[]
3,wind_speed,float64,[]
4,wind_dir,float64,[]
5,atm_pressure,float64,[]
6,visibility,float64,[]
7,wind_chill_tempt,float64,[]
8,relative_humidity,float64,[]
9,wind_chill_tempt.1,float64,[]


In [147]:
# convertDtypes is given 'object' wherever the mapper says 'str'; every other
# target dtype is passed through unchanged.
target_dtypes = {
    column: ('object' if dtype == 'str' else dtype)
    for column, dtype in dtype_mapper.items()
}
df = dta.convertDtypes(df, target_dtypes)
# Renumber the rows 0..n-1 after the conversion.
df = df.reset_index(drop=True)

In [148]:
# Confirm the conversion: `datetime` should now be a timezone-aware datetime
# column and the measurements float64.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   datetime                210489 non-null  datetime64[ns, UTC]
 1   tempt                   210489 non-null  float64            
 2   dewpoint_tempt          210489 non-null  float64            
 3   wind_speed              210489 non-null  float64            
 4   wind_dir                210489 non-null  float64            
 5   atm_pressure            210489 non-null  float64            
 6   visibility              210489 non-null  float64            
 7   wind_chill_tempt        210489 non-null  float64            
 8   relative_humidity       210489 non-null  float64            
 9   wind_chill_tempt.1      210489 non-null  float64            
 10  tempt_unit              210489 non-null  object             
 11  dewpoint_tempt_unit     21

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,2022-02-14 23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
1,2022-02-14 23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
2,2022-02-14 22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,-1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
3,2022-02-14 22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,0.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
4,2022-02-14 21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,1.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


## Duplicates

In [149]:
# Report duplicated content using all columns: the output lists columns whose
# values match another column, and any fully duplicated rows.
Preprocessing.DuplicateAnalyzer(df, df.columns.tolist()).printDuplicateInfo()


Matching columns count: 5

Matching columns dict:
{'wind_chill_tempt': ['wind_chill_tempt.1'], 'wind_chill_tempt.1': ['wind_chill_tempt'], 'tempt_unit': ['dewpoint_tempt_unit', 'wind_chill_tempt_unit'], 'dewpoint_tempt_unit': ['tempt_unit', 'wind_chill_tempt_unit'], 'wind_chill_tempt_unit': ['tempt_unit', 'dewpoint_tempt_unit']}

Duplicate Row count: 0

DUplicate Rows:


Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,wind_chill_tempt.1,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit


#### The matching unit columns are ignored here: although several of them hold identical values, each one exists to describe the unit of a different measurement rather than to carry data.

In [151]:
# 'wind_chill_tempt.1' is reported above as matching 'wind_chill_tempt', so
# only one of the two is kept. The membership check makes the cell safe to
# re-run: a second run is a no-op instead of raising KeyError.
if 'wind_chill_tempt.1' in df.columns:
    df = df.drop(columns=['wind_chill_tempt.1'])

In [152]:
# Verify the duplicate column is gone (17 columns expected).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   datetime                210489 non-null  datetime64[ns, UTC]
 1   tempt                   210489 non-null  float64            
 2   dewpoint_tempt          210489 non-null  float64            
 3   wind_speed              210489 non-null  float64            
 4   wind_dir                210489 non-null  float64            
 5   atm_pressure            210489 non-null  float64            
 6   visibility              210489 non-null  float64            
 7   wind_chill_tempt        210489 non-null  float64            
 8   relative_humidity       210489 non-null  float64            
 9   tempt_unit              210489 non-null  object             
 10  dewpoint_tempt_unit     210489 non-null  object             
 11  wind_speed_unit         21

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,2022-02-14 23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
1,2022-02-14 23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
2,2022-02-14 22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
3,2022-02-14 22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
4,2022-02-14 21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


## Finalizing and Saving

In [153]:
# Final visual check of the first ten rows before saving.
df.head(10)

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
0,2022-02-14 23:50:00+00:00,3.0,2.0,5.0,250.0,1010.0,10.0,-1.0,93.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
1,2022-02-14 23:20:00+00:00,3.0,2.0,5.0,260.0,1010.0,10.0,-1.0,93.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
2,2022-02-14 22:50:00+00:00,4.0,2.0,7.0,270.0,1010.0,10.0,-1.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
3,2022-02-14 22:20:00+00:00,4.0,2.0,6.0,270.0,1010.0,10.0,0.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
4,2022-02-14 21:50:00+00:00,4.0,2.0,4.0,270.0,1010.0,10.0,1.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
5,2022-02-14 21:20:00+00:00,4.0,2.0,5.0,280.0,1009.0,10.0,0.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
6,2022-02-14 20:50:00+00:00,4.0,2.0,7.0,280.0,1009.0,10.0,-1.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
7,2022-02-14 20:20:00+00:00,5.0,2.0,7.0,280.0,1009.0,10.0,0.0,81.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
8,2022-02-14 19:50:00+00:00,5.0,3.0,7.0,260.0,1008.0,10.0,0.0,87.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
9,2022-02-14 19:20:00+00:00,5.0,2.0,6.0,260.0,1008.0,10.0,1.0,81.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


In [154]:
# Final schema check: dtypes and non-null counts of the frame to be saved.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210489 entries, 0 to 210488
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype              
---  ------                  --------------   -----              
 0   datetime                210489 non-null  datetime64[ns, UTC]
 1   tempt                   210489 non-null  float64            
 2   dewpoint_tempt          210489 non-null  float64            
 3   wind_speed              210489 non-null  float64            
 4   wind_dir                210489 non-null  float64            
 5   atm_pressure            210489 non-null  float64            
 6   visibility              210489 non-null  float64            
 7   wind_chill_tempt        210489 non-null  float64            
 8   relative_humidity       210489 non-null  float64            
 9   tempt_unit              210489 non-null  object             
 10  dewpoint_tempt_unit     210489 non-null  object             
 11  wind_speed_unit         21

In [155]:
# Row count and value range of every column as a last sanity check.
# NOTE(review): the output shows minimums of 0.0 for atm_pressure, visibility
# and relative_humidity; a pressure of 0 mbar is not physically plausible, so
# these may be placeholder/missing readings - confirm before using the data.
df.agg(['count', 'min', 'max'])

Unnamed: 0,datetime,tempt,dewpoint_tempt,wind_speed,wind_dir,atm_pressure,visibility,wind_chill_tempt,relative_humidity,tempt_unit,dewpoint_tempt_unit,wind_speed_unit,wind_dir_unit,atm_pressure_unit,visibility_unit,wind_chill_tempt_unit,relative_humidity_unit
count,210489,210489.0,210489.0,210489.0,210489.0,210489.0,210489.0,210489.0,210489.0,210489,210489,210489,210489,210489,210489,210489,210489
min,2010-01-01 00:20:00+00:00,-18.0,-18.0,0.0,0.0,0.0,0.0,-18.0,0.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)
max,2022-02-14 23:50:00+00:00,32.0,21.0,19.547,360.0,1049.103,11.265,33.017,100.0,celsius,celsius,m/s,degrees from north,mbar,km,celsius,percentage (%)


In [156]:
# Preview the output location: the folder that contains the project directory,
# plus the cleaned dataset's file name.
os.path.dirname(project_dir) + "/meteorological_data_bristol_lulsgate_airport.csv"

'/Users/gurdeep/Documents/tb2/DSMP/meteorological_data_bristol_lulsgate_airport.csv'

In [157]:
# Write the cleaned frame next to the project directory, without the index column.
output_path = os.path.dirname(project_dir) + "/meteorological_data_bristol_lulsgate_airport.csv"
df.to_csv(output_path, index=False)