In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

### Import, explore, and clean Crash Data

In [2]:
#import Crash DataFrame 
crash_df = pd.read_csv('data/Traffic_Crashes_-_Crashes.csv')

In [3]:
crash_df

Unnamed: 0,CRASH_RECORD_ID,RD_NO,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,0001dc2c34878baec9b7223e7ead101e0487e2e994c977...,JF221668,,04/27/2022 09:30:00 AM,20,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,1.0,0.0,2.0,0.0,9,4,4,41.926951,-87.661559,POINT (-87.661558949813 41.926951230142)
1,00554edcbf68c6eb4d438e92ce71a593e858971fd885a4...,JF228356,,05/03/2022 06:40:00 AM,15,OTHER REG. SIGN,FUNCTIONING PROPERLY,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,...,0.0,0.0,5.0,0.0,6,3,5,41.927526,-87.765423,POINT (-87.765422741103 41.927525861297)
2,01143c127253f877ec850422012fae34b6b0e58bf678ae...,JD337690,,08/19/2020 09:30:00 AM,25,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,ANGLE,...,0.0,0.0,2.0,0.0,9,4,8,41.932352,-87.688045,POINT (-87.688044974908 41.932351848527)
3,00bf78dfa54ff84306859dc5d220341f1891eaf5fcf6a6...,JF213955,,04/20/2022 04:32:00 PM,15,OTHER,OTHER,CLEAR,DAYLIGHT,REAR TO FRONT,...,0.0,0.0,2.0,0.0,16,4,4,41.794998,-87.622620,POINT (-87.622620128003 41.794997895717)
4,0161c604b1fd2e187d5f4239be87a8b2d8be36b193f01b...,JF221562,Y,04/27/2022 07:00:00 AM,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,...,0.0,0.0,1.0,0.0,7,4,4,41.752961,-87.550746,POINT (-87.550746027529 41.752960600041)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617341,8f081a009f98c6fe3ffa5968b81d24607b38ef78da4838...,JF224470,,04/29/2022 04:25:00 PM,30,TRAFFIC SIGNAL,UNKNOWN,CLEAR,DAYLIGHT,REAR END,...,0.0,0.0,3.0,0.0,16,6,4,41.876198,-87.686155,POINT (-87.686155352745 41.876198079481)
617342,a990787c46a181e9611488f24a7e8453931c9d9057206f...,JF230804,,05/01/2022 09:00:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLOUDY/OVERCAST,DAYLIGHT,REAR END,...,0.0,0.0,2.0,0.0,9,1,5,41.980961,-87.839166,POINT (-87.839165640216 41.980961196676)
617343,cf0c350753ba62e99a06f77de5947b775b9e55917bd66b...,JF230794,,05/05/2022 07:27:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,3.0,0.0,7,5,5,41.814837,-87.743501,POINT (-87.743501480634 41.81483675036)
617344,d33181def1c0c3a57ecabef00a109440b361c01fb392bf...,JF230806,,05/05/2022 07:40:00 AM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,...,0.0,0.0,2.0,0.0,7,5,5,41.707680,-87.584816,POINT (-87.584816108862 41.707680407853)


In [4]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617346 entries, 0 to 617345
Data columns (total 49 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   CRASH_RECORD_ID                617346 non-null  object 
 1   RD_NO                          613078 non-null  object 
 2   CRASH_DATE_EST_I               46685 non-null   object 
 3   CRASH_DATE                     617346 non-null  object 
 4   POSTED_SPEED_LIMIT             617346 non-null  int64  
 5   TRAFFIC_CONTROL_DEVICE         617346 non-null  object 
 6   DEVICE_CONDITION               617346 non-null  object 
 7   WEATHER_CONDITION              617346 non-null  object 
 8   LIGHTING_CONDITION             617346 non-null  object 
 9   FIRST_CRASH_TYPE               617346 non-null  object 
 10  TRAFFICWAY_TYPE                617346 non-null  object 
 11  LANE_CNT                       198984 non-null  float64
 12  ALIGNMENT                     

In [5]:
crash_df.describe()

Unnamed: 0,POSTED_SPEED_LIMIT,LANE_CNT,STREET_NO,BEAT_OF_OCCURRENCE,NUM_UNITS,INJURIES_TOTAL,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
count,617346.0,198984.0,617346.0,617341.0,617346.0,616067.0,616067.0,616067.0,616067.0,616067.0,616067.0,616067.0,617346.0,617346.0,617346.0,613604.0,613604.0
mean,28.346645,13.33148,3678.675046,1238.055031,2.034235,0.182629,0.001156,0.019678,0.102456,0.059339,2.012385,0.0,13.223039,4.127933,6.596502,41.854468,-87.673255
std,6.329037,2961.787,2903.917903,705.722836,0.450624,0.554216,0.036907,0.163796,0.412342,0.312482,1.161061,0.0,5.537447,1.979825,3.450357,0.331937,0.674144
min,0.0,0.0,0.0,111.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-87.936193
25%,30.0,2.0,1230.0,712.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,2.0,4.0,41.78096,-87.721366
50%,30.0,2.0,3200.0,1135.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,14.0,4.0,7.0,41.874571,-87.673567
75%,30.0,4.0,5600.0,1822.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,17.0,6.0,10.0,41.923961,-87.632862
max,99.0,1191625.0,451100.0,6100.0,18.0,21.0,4.0,7.0,21.0,15.0,61.0,0.0,23.0,7.0,12.0,42.02278,0.0


In [6]:
#Drop Irrelevant columns 
crash_df.drop(['RD_NO', 'LANE_CNT','TRAFFIC_CONTROL_DEVICE','DEVICE_CONDITION', 'SEC_CONTRIBUTORY_CAUSE', 'CRASH_DATE_EST_I','TRAFFICWAY_TYPE','ALIGNMENT','ROAD_DEFECT','REPORT_TYPE','DATE_POLICE_NOTIFIED','STREET_NO','STREET_DIRECTION','STREET_NAME','PHOTOS_TAKEN_I','STATEMENTS_TAKEN_I','DOORING_I','WORK_ZONE_I','BEAT_OF_OCCURRENCE','WORK_ZONE_TYPE','WORKERS_PRESENT_I','INJURIES_TOTAL','INJURIES_FATAL','INJURIES_REPORTED_NOT_EVIDENT','INJURIES_NON_INCAPACITATING','INJURIES_NO_INDICATION','INJURIES_UNKNOWN','LATITUDE','LONGITUDE','LOCATION'], axis=1, inplace=True)

In [7]:
crash_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617346 entries, 0 to 617345
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CRASH_RECORD_ID          617346 non-null  object 
 1   CRASH_DATE               617346 non-null  object 
 2   POSTED_SPEED_LIMIT       617346 non-null  int64  
 3   WEATHER_CONDITION        617346 non-null  object 
 4   LIGHTING_CONDITION       617346 non-null  object 
 5   FIRST_CRASH_TYPE         617346 non-null  object 
 6   ROADWAY_SURFACE_COND     617346 non-null  object 
 7   CRASH_TYPE               617346 non-null  object 
 8   INTERSECTION_RELATED_I   141110 non-null  object 
 9   NOT_RIGHT_OF_WAY_I       29078 non-null   object 
 10  HIT_AND_RUN_I            189199 non-null  object 
 11  DAMAGE                   617346 non-null  object 
 12  PRIM_CONTRIBUTORY_CAUSE  617346 non-null  object 
 13  NUM_UNITS                617346 non-null  int64  
 14  MOST

In [8]:
#Fill/Drop relevant nulls 
crash_df["INTERSECTION_RELATED_I"].fillna("Unknown", inplace=True)
crash_df["NOT_RIGHT_OF_WAY_I"].fillna("Unknown", inplace=True)
crash_df["HIT_AND_RUN_I"].fillna("Unknown", inplace=True)
crash_df["MOST_SEVERE_INJURY"].fillna("Unknown", inplace=True)
crash_df.dropna(subset=["INJURIES_INCAPACITATING"], inplace=True)

### Import, explore, and clean People DataFrame

In [9]:
#import People DataFrame 
people_df = pd.read_csv('data/Traffic_Crashes_-_People.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [10]:
#people_df

In [11]:
#people_df.info()

In [12]:
#Drop irrelevant columns
people_df.drop(['RD_NO', 'CRASH_DATE', 'SEAT_NO','CITY','STATE','ZIPCODE','DRIVERS_LICENSE_STATE','DRIVERS_LICENSE_CLASS','EJECTION','INJURY_CLASSIFICATION','HOSPITAL','EMS_AGENCY','EMS_RUN_NO','PEDPEDAL_ACTION','PEDPEDAL_VISIBILITY','PEDPEDAL_LOCATION','BAC_RESULT','BAC_RESULT VALUE','CELL_PHONE_USE'], axis=1, inplace=True)

In [13]:
#Remove nulls from relevant rows 
people_df.dropna(subset=["VEHICLE_ID"], inplace=True)
people_df.dropna(subset=["SEX"], inplace=True)
people_df.dropna(subset=["SAFETY_EQUIPMENT"], inplace=True)
people_df.dropna(subset=["AIRBAG_DEPLOYED"], inplace=True)
people_df.dropna(subset=["DRIVER_ACTION"], inplace=True)
people_df.dropna(subset=["DRIVER_VISION"], inplace=True)
people_df.dropna(subset=["PHYSICAL_CONDITION"], inplace=True)
people_df.dropna(subset=["AGE"], inplace=True)

In [14]:
#people_df.info()

### Import, explore, and clean Car DataFrame

In [15]:
car_df = pd.read_csv('data/Traffic_Crashes_-_Vehicles.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [16]:
#car_df

In [17]:
#car_df.info()

In [18]:
#Create new Car DataFrame with relevant columns 
clean_car_df = car_df.filter(['CRASH_RECORD_ID','UNIT_TYPE','MAKE','MODEL','VEHICLE_YEAR','VEHICLE_DEFECT','VEHICLE_TYPE','VEHICLE_USE','MANEUVER', 'TOWED_I','EXCEED_SPEED_LIMIT_I'], axis=1)

In [19]:
#clean_car_df

In [20]:
#clean_car_df.info()

In [21]:
#Drop nulls 
clean_car_df.dropna(subset=["UNIT_TYPE"], inplace=True)
clean_car_df.dropna(subset=["MAKE"], inplace=True)
clean_car_df.dropna(subset=["MODEL"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_YEAR"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_DEFECT"], inplace=True)
clean_car_df.dropna(subset=["VEHICLE_USE"], inplace=True)
clean_car_df.dropna(subset=["MANEUVER"], inplace=True)
clean_car_df["TOWED_I"].fillna("Unknown", inplace=True)
clean_car_df["EXCEED_SPEED_LIMIT_I"].fillna("Unknown", inplace=True)

In [22]:
clean_car_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1035864 entries, 0 to 1266485
Data columns (total 11 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   CRASH_RECORD_ID       1035864 non-null  object 
 1   UNIT_TYPE             1035864 non-null  object 
 2   MAKE                  1035864 non-null  object 
 3   MODEL                 1035864 non-null  object 
 4   VEHICLE_YEAR          1035864 non-null  float64
 5   VEHICLE_DEFECT        1035864 non-null  object 
 6   VEHICLE_TYPE          1035864 non-null  object 
 7   VEHICLE_USE           1035864 non-null  object 
 8   MANEUVER              1035864 non-null  object 
 9   TOWED_I               1035864 non-null  object 
 10  EXCEED_SPEED_LIMIT_I  1035864 non-null  object 
dtypes: float64(1), object(10)
memory usage: 94.8+ MB


### Merge Crash, People, and Car DataFrames, then explore and clean new DataFrame

In [23]:
#merge crash data and people data 
crash_people_df = pd.merge(crash_df,people_df, how='left',left_on = 'CRASH_RECORD_ID', right_on = "CRASH_RECORD_ID", indicator=True)

#remove duplicates 
crash_people_df.drop_duplicates(subset = 'CRASH_RECORD_ID', inplace = True)

In [24]:
#rename '_merge' column to 'Check', necessary for second merge 
crash_people_df.rename(columns = {'_merge':'Check'}, inplace = True)

In [25]:
#Merge crash, people, and car DataFrames together(CPC) 
cpc_df = pd.merge(crash_people_df, clean_car_df, how='left',left_on = 'CRASH_RECORD_ID', right_on = "CRASH_RECORD_ID", indicator=True)

#Drop duplicates 
cpc_df.drop_duplicates(subset = 'CRASH_RECORD_ID', inplace = True)

In [26]:
pd.set_option('display.max_columns', None)

In [27]:
cpc_df

Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE,POSTED_SPEED_LIMIT,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,ROADWAY_SURFACE_COND,CRASH_TYPE,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,DAMAGE,PRIM_CONTRIBUTORY_CAUSE,NUM_UNITS,MOST_SEVERE_INJURY,INJURIES_INCAPACITATING,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,PERSON_ID,PERSON_TYPE,VEHICLE_ID,SEX,AGE,SAFETY_EQUIPMENT,AIRBAG_DEPLOYED,DRIVER_ACTION,DRIVER_VISION,PHYSICAL_CONDITION,Check,UNIT_TYPE,MAKE,MODEL,VEHICLE_YEAR,VEHICLE_DEFECT,VEHICLE_TYPE,VEHICLE_USE,MANEUVER,TOWED_I,EXCEED_SPEED_LIMIT_I,_merge
0,0001dc2c34878baec9b7223e7ead101e0487e2e994c977...,04/27/2022 09:30:00 AM,20,CLEAR,DAYLIGHT,ANGLE,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NONINCAPACITATING INJURY,0.0,9,4,4,O1324770,DRIVER,1258370.0,M,24.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN,UNKNOWN,NORMAL,both,DRIVER,TOYOTA,RAV4,2007.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Unknown,Unknown,both
2,00554edcbf68c6eb4d438e92ce71a593e858971fd885a4...,05/03/2022 06:40:00 AM,15,RAIN,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,WET,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",FAILING TO YIELD RIGHT-OF-WAY,3,NO INDICATION OF INJURY,0.0,6,3,5,O1328703,DRIVER,1262176.0,F,20.0,SAFETY BELT USED,DID NOT DEPLOY,FAILED TO YIELD,UNKNOWN,NORMAL,both,DRIVER,NISSAN,VERSA,2007.0,NONE,PASSENGER,PERSONAL,ENTERING TRAFFIC LANE FROM PARKING,Y,Unknown,both
5,01143c127253f877ec850422012fae34b6b0e58bf678ae...,08/19/2020 09:30:00 AM,25,CLEAR,DAYLIGHT,ANGLE,DRY,NO INJURY / DRIVE AWAY,Unknown,Y,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,9,4,8,O941437,DRIVER,892450.0,M,26.0,USAGE UNKNOWN,DID NOT DEPLOY,UNKNOWN,UNKNOWN,UNKNOWN,both,,,,,,,,,,,left_only
6,00bf78dfa54ff84306859dc5d220341f1891eaf5fcf6a6...,04/20/2022 04:32:00 PM,15,CLEAR,DAYLIGHT,REAR TO FRONT,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,16,4,4,O1320543,DRIVER,1254277.0,M,30.0,USAGE UNKNOWN,DID NOT DEPLOY,IMPROPER BACKING,UNKNOWN,UNKNOWN,both,DRIVER,JEEP,COMPASS,2012.0,WINDOWS,SPORT UTILITY VEHICLE (SUV),PERSONAL,BACKING,Unknown,Unknown,both
8,0161c604b1fd2e187d5f4239be87a8b2d8be36b193f01b...,04/27/2022 07:00:00 AM,30,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Y,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,7,4,4,,,,,,,,,,,left_only,,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1059017,8f081a009f98c6fe3ffa5968b81d24607b38ef78da4838...,04/29/2022 04:25:00 PM,30,CLEAR,DAYLIGHT,REAR END,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Y,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,16,6,4,O1326355,DRIVER,1260025.0,M,34.0,USAGE UNKNOWN,NOT APPLICABLE,NONE,NOT OBSCURED,NORMAL,both,DRIVER,CHEVROLET,EQUINOX,2006.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Y,Unknown,both
1059019,a990787c46a181e9611488f24a7e8453931c9d9057206f...,05/01/2022 09:00:00 AM,30,CLOUDY/OVERCAST,DAYLIGHT,REAR END,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",UNABLE TO DETERMINE,2,NO INDICATION OF INJURY,0.0,9,1,5,O1330192,DRIVER,1263607.0,M,36.0,USAGE UNKNOWN,DEPLOYMENT UNKNOWN,OTHER,UNKNOWN,NORMAL,both,DRIVER,MERCEDES-BENZ,OTHER (EXPLAIN IN NARRATIVE),2012.0,UNKNOWN,PASSENGER,PERSONAL,SLOW/STOP IN TRAFFIC,Unknown,Unknown,both
1059021,cf0c350753ba62e99a06f77de5947b775b9e55917bd66b...,05/05/2022 07:27:00 AM,30,CLEAR,DAYLIGHT,TURNING,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,"OVER $1,500",IMPROPER TURNING/NO SIGNAL,2,NO INDICATION OF INJURY,0.0,7,5,5,O1330248,DRIVER,1263658.0,F,40.0,SAFETY BELT USED,NOT APPLICABLE,IMPROPER TURN,NOT OBSCURED,NORMAL,both,DRIVER,HONDA,CR-V,2013.0,NONE,PASSENGER,PERSONAL,TURNING RIGHT,Unknown,Unknown,both
1059023,d33181def1c0c3a57ecabef00a109440b361c01fb392bf...,05/05/2022 07:40:00 AM,30,CLEAR,DAYLIGHT,TURNING,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,"OVER $1,500",IMPROPER TURNING/NO SIGNAL,2,NO INDICATION OF INJURY,0.0,7,5,5,O1330195,DRIVER,1263606.0,M,38.0,USAGE UNKNOWN,NOT APPLICABLE,UNKNOWN,UNKNOWN,NORMAL,both,DRIVER,CHEVROLET,CAMARO,2018.0,UNKNOWN,PASSENGER,PERSONAL,STRAIGHT AHEAD,Unknown,Unknown,both


In [28]:
#cpc_df.info()

In [29]:
#create a new column with only the top 100 makes, and an 'Other' category for all the others 
#chanaged from 100 to 150
TOP_MAKES = cpc_df['MAKE'].value_counts()
threshold = 150
cpc_df['TOP_MAKES'] = np.where(cpc_df['MAKE'].isin(TOP_MAKES.index[TOP_MAKES >= threshold ]), cpc_df['MAKE'], 'other')

In [30]:
#Use map function to create a binary target column 
#helps to create more balanced dataset 
map = {"OVER $1,500":1,"$501 - $1,500": 0, "$500 OR LESS": 0}

cpc_df["Target"] = cpc_df["DAMAGE"].map(map)

In [31]:
#check for balanced dataset 
cpc_df["Target"].value_counts(normalize=True)

1    0.597682
0    0.402318
Name: Target, dtype: float64

In [32]:
#cpc_df.info()

In [33]:
#drop irrelevant columns 
#removed 'maneuvers' from this list (so it is now included in the data)
cpc_df.drop(['PERSON_ID','CRASH_RECORD_ID','DAMAGE','CRASH_DATE','PERSON_TYPE', 'VEHICLE_ID','SAFETY_EQUIPMENT','DRIVER_VISION','Check','_merge','MODEL','MAKE','VEHICLE_DEFECT','VEHICLE_USE','EXCEED_SPEED_LIMIT_I'], axis=1, inplace=True)

In [34]:
#drop nulls 
cpc_df.dropna(subset=["SEX"], inplace=True)
cpc_df.dropna(subset=["VEHICLE_YEAR"], inplace=True)

In [35]:
cpc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481160 entries, 0 to 1059025
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   POSTED_SPEED_LIMIT       481160 non-null  int64  
 1   WEATHER_CONDITION        481160 non-null  object 
 2   LIGHTING_CONDITION       481160 non-null  object 
 3   FIRST_CRASH_TYPE         481160 non-null  object 
 4   ROADWAY_SURFACE_COND     481160 non-null  object 
 5   CRASH_TYPE               481160 non-null  object 
 6   INTERSECTION_RELATED_I   481160 non-null  object 
 7   NOT_RIGHT_OF_WAY_I       481160 non-null  object 
 8   HIT_AND_RUN_I            481160 non-null  object 
 9   PRIM_CONTRIBUTORY_CAUSE  481160 non-null  object 
 10  NUM_UNITS                481160 non-null  int64  
 11  MOST_SEVERE_INJURY       481160 non-null  object 
 12  INJURIES_INCAPACITATING  481160 non-null  float64
 13  CRASH_HOUR               481160 non-null  int64  
 14  CRA

### Create numeric feature DF for DecisionTreeClassifier 

In [36]:
#create new DF with just our numeric values
numeric_df = cpc_df[['POSTED_SPEED_LIMIT','NUM_UNITS','INJURIES_INCAPACITATING','CRASH_HOUR','CRASH_DAY_OF_WEEK','CRASH_MONTH','AGE','VEHICLE_YEAR', 'Target']]

In [37]:
numeric_df

Unnamed: 0,POSTED_SPEED_LIMIT,NUM_UNITS,INJURIES_INCAPACITATING,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,AGE,VEHICLE_YEAR,Target
0,20,2,0.0,9,4,4,24.0,2007.0,1
2,15,3,0.0,6,3,5,20.0,2007.0,1
6,15,2,0.0,16,4,4,30.0,2012.0,1
10,30,2,0.0,2,3,5,41.0,2013.0,1
11,30,2,0.0,15,4,4,57.0,2012.0,1
...,...,...,...,...,...,...,...,...,...
1059017,30,2,0.0,16,6,4,34.0,2006.0,1
1059019,30,2,0.0,9,1,5,36.0,2012.0,1
1059021,30,2,0.0,7,5,5,40.0,2013.0,1
1059023,30,2,0.0,7,5,5,38.0,2018.0,1


In [38]:
cpc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 481160 entries, 0 to 1059025
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   POSTED_SPEED_LIMIT       481160 non-null  int64  
 1   WEATHER_CONDITION        481160 non-null  object 
 2   LIGHTING_CONDITION       481160 non-null  object 
 3   FIRST_CRASH_TYPE         481160 non-null  object 
 4   ROADWAY_SURFACE_COND     481160 non-null  object 
 5   CRASH_TYPE               481160 non-null  object 
 6   INTERSECTION_RELATED_I   481160 non-null  object 
 7   NOT_RIGHT_OF_WAY_I       481160 non-null  object 
 8   HIT_AND_RUN_I            481160 non-null  object 
 9   PRIM_CONTRIBUTORY_CAUSE  481160 non-null  object 
 10  NUM_UNITS                481160 non-null  int64  
 11  MOST_SEVERE_INJURY       481160 non-null  object 
 12  INJURIES_INCAPACITATING  481160 non-null  float64
 13  CRASH_HOUR               481160 non-null  int64  
 14  CRA

#### Train Test Split for numeric features 

In [39]:
# Split df into X and y
X = numeric_df.drop("Target", axis=1)
y = numeric_df["Target"]

# Perform train-test split with random_state=42 and stratify=y
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [40]:
#assert X and Y have same number of rows and columns 
assert X_train.shape[0] == y_train.shape[0] 

# X and y testing data should have the same number of rows
assert X_test.shape[0] == y_test.shape[0] 

# Both X should have 52 columns
assert X_train.shape[1] == X_test.shape[1] 

# Both y should have 1 column
assert len(y_train.shape) == len(y_test.shape)

#### Decision Tree Classifier 

In [41]:
#Decision Tree Classifier 
numeric_tree_1 = DecisionTreeClassifier(max_depth=8, random_state=42)

numeric_tree_1.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, random_state=42)

In [42]:
#accuracy on training data
numeric_tree_1.score(X_train, y_train)

0.6097846870064012

In [43]:
# Accuracy on test data
numeric_tree_1.score(X_test, y_test)

0.6076897497713858

In [44]:
# Allow it to run the full default hyperparameters
numeric_tree_1 = DecisionTreeClassifier(random_state=42)
numeric_tree_1.fit(X_train, y_train)

# Accuracy on training data & test data
print('Training:', numeric_tree_1.score(X_train, y_train))
print('Testing:', numeric_tree_1.score(X_test, y_test))

Training: 0.9762601490841577
Testing: 0.5350236927425389


In [45]:
# Add in a stop function - min_impurity_decrease
numeric_tree_1 = DecisionTreeClassifier(min_impurity_decrease=0.3, random_state=42)
numeric_tree_1.fit(X_train, y_train)

# Accuracy on training data & test data
print('Training:', numeric_tree_1.score(X_train, y_train))
print('Testing:', numeric_tree_1.score(X_test, y_test))

Training: 0.6061878238700917
Testing: 0.6061850527890931


In [46]:
feature_used = ['POSTED_SPEED_LIMIT','NUM_UNITS','INJURIES_INCAPACITATING','CRASH_HOUR','CRASH_DAY_OF_WEEK','CRASH_MONTH','AGE','VEHICLE_YEAR'] 
X = numeric_df.drop("Target", axis=1)
y = numeric_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

X_train.shape, X_test.shape

((384928, 8), (96232, 8))

In [47]:
numeric_tree_1 = DecisionTreeClassifier(max_depth = 10, random_state=42)

numeric_tree_1.fit(X, y)

for fi, feature in zip(numeric_tree_1.feature_importances_, feature_used):
    print(fi, feature)

0.12438274228250057 POSTED_SPEED_LIMIT
0.5054602398837356 NUM_UNITS
0.04035832003946407 INJURIES_INCAPACITATING
0.16137167324895446 CRASH_HOUR
0.011935469604048803 CRASH_DAY_OF_WEEK
0.01734885785502332 CRASH_MONTH
0.08440392034248823 AGE
0.05473877674378491 VEHICLE_YEAR


### Logistic Regression on numeric data


In [48]:
#create baseline model 
baseline_model = LogisticRegression(random_state=42)

# Use cross_val_score with scoring="neg_log_loss" to evaluate the model
# on X_train and y_train
baseline_neg_log_loss_cv = cross_val_score(baseline_model, X_train, y_train, scoring="neg_log_loss")

baseline_log_loss = -(baseline_neg_log_loss_cv.mean())
baseline_log_loss

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6636212745678178

In [49]:
#see what our log-loss would be if our model choose the majority class 
log_loss(y_train, np.zeros(len(y_train)))

20.938989624794132

##### The lower the Log-Loss the better. So, although log-loss isn't the most translational metric, we can see that our model us definitely better than just guesing the majority every time. 

### Longer way of doing what we did above, by hand 

In [50]:
# Negative log loss doesn't exist as something we can import,
# but we can create it
neg_log_loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
# Instantiate the model (same as previous example)
baseline_model = LogisticRegression(random_state=42)
# Create a list to hold the score from each fold
kfold_scores = np.ndarray(5)

# Instantiate a splitter object and loop over its result
kfold = StratifiedKFold()
for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
    # Extract train and validation subsets using the provided indices
    X_t, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_t, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Clone the baseline model and fit it on the train subset
    train_model_1 = clone(baseline_model)
    train_model_1.fit(X_t, y_t)
    
    # Evaluate the provided model on the validation subset
    neg_log_loss_score = neg_log_loss(train_model_1, X_val, y_val)
    kfold_scores[fold] = neg_log_loss_score
    
-(kfold_scores.mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.6636212745678178

In [51]:
print(baseline_neg_log_loss_cv)
print(kfold_scores)

[-0.66341176 -0.6637553  -0.66417904 -0.66389296 -0.66286732]
[-0.66341176 -0.6637553  -0.66417904 -0.66389296 -0.66286732]


### Writing a Custom Cross Validation Function with StratifiedKFold - Trying Something


In [52]:
# #Import relevant sklearn and imblearn classes
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import SMOTE

# def custom_cross_val_score(estimator, X, y):
#     # Create a list to hold the scores from each fold
#     kfold_train_scores = np.ndarray(5)
#     kfold_val_scores = np.ndarray(5)

#     # Instantiate a splitter object and loop over its result
#     kfold = StratifiedKFold(n_splits=5)
#     for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
#         # Extract train and validation subsets using the provided indices
#         X_t, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_t, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Instantiate StandardScaler
#         scaler = StandardScaler()
#         # Fit and transform X_t
#         X_t_scaled = scaler.fit_transform(X_t)
#         # Transform X_val
#         X_val_scaled = scaler.transform(X_val)
        
#         # Instantiate SMOTE with random_state=42 and sampling_strategy=0.78
#         sm = SMOTE(random_state=42, sampling_strategy=0.78)
#         # Fit and transform X_t_scaled and y_t using sm
#         X_t_oversampled, y_t_oversampled = sm.fit_resample(X_t_scaled, y_t)
        

        
#         # Clone the provided model and fit it on the train subset
#         temp_model = clone(estimator)
#         temp_model.fit(X_t_oversampled, y_t_oversampled)
        
#         # Evaluate the provided model on the train and validation subsets
#         neg_log_loss_score_train = neg_log_loss(temp_model, X_t_oversampled, y_t_oversampled)
#         neg_log_loss_score_val = neg_log_loss(temp_model, X_val_scaled, y_val)
#         kfold_train_scores[fold] = neg_log_loss_score_train
#         kfold_val_scores[fold] = neg_log_loss_score_val
        
#     return kfold_train_scores, kfold_val_scores

# model_with_preprocessing = LogisticRegression(random_state=42, class_weight={1: 0.78})
# preprocessed_train_scores, preprocessed_neg_log_loss_cv = custom_cross_val_score(model_with_preprocessing, X_train, y_train)
# - (preprocessed_neg_log_loss_cv.mean())

In [53]:
# print(-baseline_neg_log_loss_cv.mean())
# print(-preprocessed_neg_log_loss_cv.mean())

In [54]:
# print("Train:     ", -preprocessed_train_scores)
# print("Validation:", -preprocessed_neg_log_loss_cv)

In [55]:
# model_with_preprocessing.get_params()

### Trying something

In [56]:
# Import relevant sklearn and imblearn classes
# from sklearn.preprocessing import StandardScaler
# from imblearn.over_sampling import SMOTE

# def custom_cross_val_score(estimator, X, y):
#     # Create a list to hold the scores from each fold
#     kfold_train_scores = np.ndarray(5)
#     kfold_val_scores = np.ndarray(5)

#     # Instantiate a splitter object and loop over its result
#     kfold = StratifiedKFold(n_splits=5)
#     for fold, (train_index, val_index) in enumerate(kfold.split(X, y)):
#         # Extract train and validation subsets using the provided indices
#         X_t, X_val = X.iloc[train_index], X.iloc[val_index]
#         y_t, y_val = y.iloc[train_index], y.iloc[val_index]
        
#         # Instantiate StandardScaler
#         scaler = StandardScaler()
#         # Fit and transform X_t
#         X_t_scaled = scaler.fit_transform(X_t)
#         # Transform X_val
#         X_val_scaled = scaler.transform(X_val)
        
#         # Instantiate SMOTE with random_state=42 and sampling_strategy=0.28
#         sm = SMOTE(random_state=42, sampling_strategy=0.78)
#         # Fit and transform X_t_scaled and y_t using sm
#         X_t_oversampled, y_t_oversampled = sm.fit_resample(X_t_scaled, y_t)
        

        
#         # Clone the provided model and fit it on the train subset
#         temp_model = clone(estimator)
#         temp_model.fit(X_t_oversampled, y_t_oversampled)
        
#         # Evaluate the provided model on the train and validation subsets
#         neg_log_loss_score_train = neg_log_loss(temp_model, X_t_oversampled, y_t_oversampled)
#         neg_log_loss_score_val = neg_log_loss(temp_model, X_val_scaled, y_val)
#         kfold_train_scores[fold] = neg_log_loss_score_train
#         kfold_val_scores[fold] = neg_log_loss_score_val
        
#     return kfold_train_scores, kfold_val_scores

# model_with_preprocessing = LogisticRegression(random_state=42, class_weight={1: 0.28})
# preprocessed_train_scores, preprocessed_neg_log_loss_cv = custom_cross_val_score(model_with_preprocessing, X_train, y_train)
# - (preprocessed_neg_log_loss_cv.mean())

In [57]:
# model_less_regularization = LogisticRegression(
#     random_state=42,
#     class_weight={1: 0.28},
#     C=1e5
# )

In [58]:
# # Check variable type
# assert type(model_less_regularization) == LogisticRegression

# # Check params
# assert model_less_regularization.get_params()["random_state"] == 42
# assert model_less_regularization.get_params()["class_weight"] == {1: 0.28}
# assert model_less_regularization.get_params()["C"] != 1.0

In [59]:
# less_regularization_train_scores, less_regularization_val_scores = custom_cross_val_score(
#     model_less_regularization,
#     X_train,
#     y_train
# )

# print("Previous Model")
# print("Train average:     ", -preprocessed_train_scores.mean())
# print("Validation average:", -preprocessed_neg_log_loss_cv.mean())
# print("Current Model")
# print("Train average:     ", -less_regularization_train_scores.mean())
# print("Validation average:", -less_regularization_val_scores.mean())

### Train Test Split on full dataset


In [70]:
X = cpc_df.drop(columns='Target')
y = cpc_df["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)

In [71]:
#get categorical columns
cat_cols = [c for c in cpc_df.columns if cpc_df[c].dtype == "O"]
cat_cols

['WEATHER_CONDITION',
 'LIGHTING_CONDITION',
 'FIRST_CRASH_TYPE',
 'ROADWAY_SURFACE_COND',
 'CRASH_TYPE',
 'INTERSECTION_RELATED_I',
 'NOT_RIGHT_OF_WAY_I',
 'HIT_AND_RUN_I',
 'PRIM_CONTRIBUTORY_CAUSE',
 'MOST_SEVERE_INJURY',
 'SEX',
 'AIRBAG_DEPLOYED',
 'DRIVER_ACTION',
 'PHYSICAL_CONDITION',
 'UNIT_TYPE',
 'VEHICLE_TYPE',
 'MANEUVER',
 'TOWED_I',
 'TOP_MAKES']

In [72]:
from sklearn.compose import ColumnTransformer

#create encoder object - to help convert cat. variables to new columns
encoder = OneHotEncoder(handle_unknown = 'error',
                       drop = 'first',
                       categories='auto')

#create columntransformer object - to help merge transformed columns
#with the rest of the dataset

ct = ColumnTransformer(transformers=[('ohe', encoder, cat_cols)],
                      remainder='passthrough')

ct.fit(X_train)
X_train_enc = ct.transform(X_train)
X_test_enc = ct.transform(X_test)

In [77]:
X_train

Unnamed: 0,POSTED_SPEED_LIMIT,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,ROADWAY_SURFACE_COND,CRASH_TYPE,INTERSECTION_RELATED_I,NOT_RIGHT_OF_WAY_I,HIT_AND_RUN_I,PRIM_CONTRIBUTORY_CAUSE,NUM_UNITS,MOST_SEVERE_INJURY,INJURIES_INCAPACITATING,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,SEX,AGE,AIRBAG_DEPLOYED,DRIVER_ACTION,PHYSICAL_CONDITION,UNIT_TYPE,VEHICLE_YEAR,VEHICLE_TYPE,MANEUVER,TOWED_I,TOP_MAKES
221371,30,RAIN,DAYLIGHT,PARKED MOTOR VEHICLE,WET,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,UNDER THE INFLUENCE OF ALCOHOL/DRUGS (USE WHEN...,3,NONINCAPACITATING INJURY,0.0,13,4,4,M,63.0,DEPLOYMENT UNKNOWN,UNKNOWN,IMPAIRED - ALCOHOL,DRIVER,2018.0,SPORT UTILITY VEHICLE (SUV),STRAIGHT AHEAD,Y,JEEP
1040937,30,CLEAR,DAYLIGHT,REAR END,UNKNOWN,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,FOLLOWING TOO CLOSELY,2,NO INDICATION OF INJURY,0.0,16,4,4,F,20.0,DEPLOYMENT UNKNOWN,FOLLOWED TOO CLOSELY,UNKNOWN,DRIVER,2012.0,PASSENGER,STRAIGHT AHEAD,Unknown,CHEVROLET
407456,30,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,DRY,INJURY AND / OR TOW DUE TO CRASH,Unknown,Unknown,Unknown,DISTRACTION - FROM INSIDE VEHICLE,3,NO INDICATION OF INJURY,0.0,16,5,3,F,23.0,"DEPLOYED, FRONT",TOO FAST FOR CONDITIONS,NORMAL,DRIVER,2016.0,PASSENGER,STRAIGHT AHEAD,Y,CHEVROLET
505664,30,CLEAR,DAYLIGHT,TURNING,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,IMPROPER OVERTAKING/PASSING,2,NO INDICATION OF INJURY,0.0,9,6,11,M,34.0,DEPLOYMENT UNKNOWN,IMPROPER PASSING,UNKNOWN,DRIVER,2006.0,UNKNOWN/NA,PASSING/OVERTAKING,Unknown,CADILLAC
455488,30,CLEAR,"DARKNESS, LIGHTED ROAD",OVERTURNED,DRY,INJURY AND / OR TOW DUE TO CRASH,Y,Unknown,Unknown,DISTRACTION - FROM OUTSIDE VEHICLE,1,NO INDICATION OF INJURY,0.0,2,2,1,M,44.0,"DEPLOYED, COMBINATION",OTHER,NORMAL,DRIVER,2015.0,PASSENGER,SKIDDING/CONTROL LOSS,Y,LEXUS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
574527,30,CLEAR,DAYLIGHT,REAR END,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,FOLLOWING TOO CLOSELY,2,NO INDICATION OF INJURY,0.0,16,5,3,F,31.0,DID NOT DEPLOY,FOLLOWED TOO CLOSELY,NORMAL,DRIVER,2007.0,PASSENGER,STRAIGHT AHEAD,Unknown,"TOYOTA MOTOR COMPANY, LTD."
807821,30,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Y,FAILING TO YIELD RIGHT-OF-WAY,2,NO INDICATION OF INJURY,0.0,4,7,4,M,47.0,DID NOT DEPLOY,NONE,NORMAL,DRIVER,2013.0,PASSENGER,STRAIGHT AHEAD,Unknown,"TOYOTA MOTOR COMPANY, LTD."
296290,30,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,FOLLOWING TOO CLOSELY,2,NO INDICATION OF INJURY,0.0,19,7,11,M,32.0,DID NOT DEPLOY,FOLLOWED TOO CLOSELY,NORMAL,DRIVER,2011.0,PASSENGER,STRAIGHT AHEAD,Unknown,BMW
329096,30,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,DRY,NO INJURY / DRIVE AWAY,Unknown,Unknown,Unknown,IMPROPER OVERTAKING/PASSING,2,NO INDICATION OF INJURY,0.0,17,1,8,F,45.0,DID NOT DEPLOY,IMPROPER LANE CHANGE,NORMAL,DRIVER,2007.0,PASSENGER,MERGING,Unknown,"TOYOTA MOTOR COMPANY, LTD."


In [80]:
ct.index.is_unique

AttributeError: 'ColumnTransformer' object has no attribute 'index'

In [79]:
X_train.index.is_unique

True

In [75]:
pd.DataFrame(X_train_enc, columns=ct.get_feature_names_out()).head()

ValueError: Shape of passed values is (384928, 1), indices imply (384928, 257)

In [None]:
cpc_df.info()

In [None]:
cpc_cat_df = cpc_df[['WEATHER_CONDITION','LIGHTING_CONDITION','FIRST_CRASH_TYPE','ROADWAY_SURFACE_COND','ROADWAY_SURFACE_COND','CRASH_TYPE','INTERSECTION_RELATED_I','NOT_RIGHT_OF_WAY_I','HIT_AND_RUN_I','PRIM_CONTRIBUTORY_CAUSE','MOST_SEVERE_INJURY','SEX','AIRBAG_DEPLOYED','DRIVER_ACTION','PHYSICAL_CONDITION','UNIT_TYPE','VEHICLE_TYPE','MANEUVER','TOWED_I','TOP_MAKES']]

In [None]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

transformed_data = onehotencoder.fit_transform(cpc_cat_df)

# the above transformed_data is an array so convert it to dataframe
encoded_data = pd.DataFrame(transformed_data, index=cpc_cat_df.index)

# now concatenate the original data and the encoded data using pandas
concatenated_data = pd.concat([cpc_cat_df, encoded_data], axis=1)

In [None]:
concatenated_data.info()