# **General imports**

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
from config.path_config import ROOT_DIR
import connectors
import downloader as kaggle_downloader

# **Global configuration**

In [4]:
# Switch the working directory to ROOT_DIR (presumably the project root —
# confirm in config.path_config) so that the relative "config/..." paths
# used by later cells resolve regardless of where the kernel was started.
os.chdir(ROOT_DIR)

In [5]:
# Notebook-wide display and plotting defaults.

# Use the fully qualified option name: pandas resolves abbreviated names such
# as 'max_colwidth' by pattern matching, which can start failing if a future
# release adds another option with a similar name.
pd.set_option('display.max_colwidth', 400)

# Seaborn "dark" theme: dark axes background without a grid.
sns.set_style("dark")

# Default figure size, in inches (width, height).
# NOTE(review): 50x30 inches is very large for inline rendering — confirm this
# is intentional before relying on it for every plot.
plt.rcParams["figure.figsize"] = (50, 30)

# **Download data from Kaggle**

In [6]:
# Run the project's Kaggle downloader with its config and credentials files.
# Both paths are relative to the current working directory.
kaggle_downloader.download(
    config_file="./config/kaggle_config.yaml",
    credentials_file="./config/credentials_map.yaml",
)

# **Connect to the S3 bucket and read in the data**

In [8]:
# Build the S3 connector from the storage map and credentials YAML files
# (paths are relative to the current working directory).
container = connectors.build_s3_connector(
    storage_config="config/storage_map.yaml",
    s3_credentials="config/credentials_map.yaml",
)

In [9]:
# Read the traffic-violations CSV from the "RAW" layer through the S3
# connector held in `container`.
# NOTE(review): the file name is spelled "violaions"; the load succeeds, so it
# presumably matches the stored object name — confirm before "correcting" it.
df = connectors.read_data(
    container=container,
    layer="RAW",
    prefix="traffic-violations-dataset",
    name="traffic_violaions.csv",
)

In [10]:
# Preview the first rows to sanity-check that the load produced the expected columns.
df.head()

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [11]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(52966, 15)

In [12]:
# Per-column non-null counts and dtypes: used to spot missing values and
# columns whose dtype may need conversion before analysis.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52966 entries, 0 to 52965
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   stop_date           52966 non-null  object 
 1   stop_time           52965 non-null  object 
 2   country_name        0 non-null      float64
 3   driver_gender       49580 non-null  object 
 4   driver_age_raw      49588 non-null  float64
 5   driver_age          49346 non-null  float64
 6   driver_race         49581 non-null  object 
 7   violation_raw       49581 non-null  object 
 8   violation           49581 non-null  object 
 9   search_conducted    52965 non-null  object 
 10  search_type         2085 non-null   object 
 11  stop_outcome        49581 non-null  object 
 12  is_arrested         49581 non-null  object 
 13  stop_duration       49581 non-null  object 
 14  drugs_related_stop  52965 non-null  object 
dtypes: float64(3), object(12)
memory usage: 6.1+ MB
