## Actions
- Convert Timestamp to datetime instead of object **Done**
- There are 2 Unknown values in the RadioNetworkGeneration column **Leave to Modeling Phase**
- There are 30 duplicate rows that should be dropped **Done**

In [1]:
# Setup Libraries
import pandas as pd
import numpy as np
import re

In [2]:
# Location of the raw traffic-volume CSV, relative to this notebook
DATA_PATH = "../data/raw/TrafficVolume.csv"

In [3]:
# Read the raw CSV into the working DataFrame used by every cell below
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

#### Convert Timestamp to datetime instead of object

In [4]:
# These rows carry only a minutes/seconds fragment in Timestamp (no date part).
# The regex parser applied in the following cells finds no match for them, so
# they turn into missing values and are dropped in the missing-values step.
df_00 = df[df['Timestamp'] == '00:00.0']
df_15 = df[df['Timestamp'] == '15:00.0']
df_30 = df[df['Timestamp'] == '30:00.0']
df_45 = df[df['Timestamp'] == '45:00.0']

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# a single pd.concat stacks the four subsets in the same order
# (00 -> 15 -> 30 -> 45) and renumbers the index from 0.
df_converted = pd.concat([df_00, df_15, df_30, df_45], ignore_index=True)
print(df_converted.shape)
df_converted.head(2)

(80, 10)


Unnamed: 0,Timestamp,LocationLatitude,LocationLongitude,RadioConnectionType,Country,RadioNetworkGeneration,RadioOperatorName,TrafficDirection,TrafficVolume,RadioMobileDataEnabled
0,00:00.0,24.58759,46.606747,Mobile,Saudi Arabia,4G,Operator A,Downlink,21.853164,Enabled
1,00:00.0,24.58759,46.606747,Mobile,Saudi Arabia,4G,Operator A,Uplink,0.158825,Enabled


In [5]:
# Compiled once so the pattern is not looked up again for every row.
_TIMESTAMP_PATTERN = re.compile(
    r'\d{4}-\d?\d-\d?\d (?:2[0-3]|[01]?[0-9]):[0-5]?[0-9]:[0-5]?[0-9]'
)


def TimeStamp_Parser(TimeStampValue):
    """Find every 'YYYY-M-D H:M:S'-shaped substring in a raw timestamp string.

    Parameters
    ----------
    TimeStampValue : str
        Raw text to scan.

    Returns
    -------
    list of str
        All non-overlapping matches in order of appearance; an empty list
        when the text contains no full date-and-time stamp.
    """
    matches = _TIMESTAMP_PATTERN.findall(TimeStampValue)
    return matches

In [6]:
# Keep the first regex match of each raw value and parse it as a datetime.
# Values for which the parser returns an empty list have no first element,
# so they come out as missing timestamps.
df['Timestamp'] = pd.to_datetime(
    df['Timestamp'].apply(TimeStamp_Parser).str[0]
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138469 entries, 0 to 138468
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Timestamp               138389 non-null  datetime64[ns]
 1   LocationLatitude        138469 non-null  float64       
 2   LocationLongitude       138469 non-null  float64       
 3   RadioConnectionType     138469 non-null  object        
 4   Country                 138469 non-null  object        
 5   RadioNetworkGeneration  138469 non-null  object        
 6   RadioOperatorName       138469 non-null  object        
 7   TrafficDirection        138469 non-null  object        
 8   TrafficVolume           138469 non-null  float64       
 9   RadioMobileDataEnabled  138469 non-null  object        
dtypes: datetime64[ns](1), float64(3), object(6)
memory usage: 10.6+ MB


In [12]:
# Missing-value summary: null count and null fraction per column, largest first
null_mask = df.isnull()
null_counts = null_mask.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(25)

Unnamed: 0,Total,Percent
Timestamp,80,0.000578
LocationLatitude,0,0.0
LocationLongitude,0,0.0
RadioConnectionType,0,0.0
Country,0,0.0
RadioNetworkGeneration,0,0.0
RadioOperatorName,0,0.0
TrafficDirection,0,0.0
TrafficVolume,0,0.0
RadioMobileDataEnabled,0,0.0


In [13]:
# Discard every row that still contains a missing value
df = df.dropna()

#### There are 1812 duplicate rows that should be dropped

In [16]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

#### Save the processed data to a Parquet file

In [17]:
# Write the cleaned frame to the processed-data folder
PROCESSED_PATH = "../data/processed/TrafficVolume_processed.parquet"
df.to_parquet(PROCESSED_PATH)