# テーブル概要
accident:  
event:  
gv(general vehicle):車両一般  
ve(Exterior Vehicle):車両外部  
vi(Interior Vehicle):車両内部  
oa(OCCUPANT ASSESSMENT):乗員の調査  
oi(OCCUPANT INJURY):乗員の傷害(mergeに使用できるkeyの値が同一でも傷害箇所によってレコードが増加)  

# 最終的な作成データ
- Crash year 2010–2015
- Vehicle model year 2001–2015
- Light vehicles (passenger cars, pick-ups and mini-vans) 
- Non-ejected occupants
- Occupant age 15 or higher
- Occupants with known injury status or fatality

# ライブラリのインポート Pandasの表示設定
同一cellに複数テーブルを表示  
全カラムを表示  
最大表示行数:500  
1つのカラムの最大表示文字数:200  
floatの有効桁数:4  
色付き文字の出力:print(pycolor.RED + '文字列' + pycolor.END)  

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import csv
import sys
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import  confusion_matrix, classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (roc_curve, auc, accuracy_score)
from sklearn.linear_model import Lasso
from sklearn import linear_model
from sklearn.feature_selection import SelectFromModel
from IPython import embed
from IPython.core.interactiveshell import InteractiveShell
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
# Notebook display configuration.
# Echo every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Show all columns, at most 500 rows, at most 200 characters per cell,
# and format floats with 4 significant digits.
pd.options.display.max_columns = None
pd.options.display.max_rows = 500
pd.options.display.max_colwidth = 200
pd.set_option('display.float_format', '{:.4g}'.format)
class pycolor:
    """ANSI escape sequences for coloured / styled terminal output.

    Usage: print(pycolor.RED + 'text' + pycolor.END)
    """
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    PURPLE = '\033[35m'
    CYAN = '\033[36m'
    WHITE = '\033[37m'
    END = '\033[0m'  # resets all attributes
    # Was '\038[1m': '8' is not an octal digit, so the literal decoded to
    # chr(3) + '8[1m' instead of an ESC sequence.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    INVISIBLE = '\033[08m'
    REVERCE = '\033[07m'  # misspelled name kept for existing callers
    REVERSE = REVERCE  # correctly spelled alias

# NASS CDSデータの読み込み

In [None]:
# Load every NASS CDS table for 2001-2015 into `cds`, keyed '<table>_<yy>'
# with yy = year - 2000 (e.g. 'gv_7').
# NOTE: '__file__' is a literal string here, so `path` resolves to the
# current working directory.
path = os.path.dirname(os.path.abspath('__file__'))
file_name = ['accident', 'event', 'gv', 'oa', 'oi', 've', 'vi']
cds_key = []
cds = {}
uyear = [str(x) for x in range(1, 16)]
for year in range(2001, 2016):
    # Years from 2009 on are read from 'FormattedData', earlier ones from 'PCSAS'.
    sas_dir = os.path.join(path, str(year), 'FormattedData' if year >= 2009 else 'PCSAS')
    for file in file_name:
        key = '{}_{}'.format(file, year - 2000)
        df = pd.read_sas(os.path.join(sas_dir, '{}.sas7bdat'.format(file)))
        cds_key.append(key)
        cds[key] = df

## 整数の値が入っているはずなのに小数点以下の値が入っているもの，同じ値なのにpython内部で別の値として認識されているものを修正
## 複数のテーブルで重複しているが利用しないカラムの削除

In [None]:
# Cast the key columns to int64 so identical keys compare equal across tables.
for y in uyear:
    for og in ['oa', 'gv', 've', 'vi']:
        table = cds['{}_{}'.format(og, y)]
        table['PSU'] = table['PSU'].astype(np.int64)
        table['VEHNO'] = table['VEHNO'].astype(np.int64)
    # The accident table is cast on PSU and VEHFORMS only.
    accident = cds['accident_{}'.format(y)]
    accident['PSU'] = accident['PSU'].astype(np.int64)
    accident['VEHFORMS'] = accident['VEHFORMS'].astype(np.int64)

# Drop columns that are duplicated across tables and not used afterwards.
for y in uyear:
    for og in ['oa', 'gv', 've', 'vi', 'accident', 'event', 'oi']:
        key = '{}_{}'.format(og, y)
        cds[key] = cds[key].drop(['CASENO', 'RATWGT', 'STRATIF', 'VERSION'], axis = 1)

## テーブル毎の結合keyを宣言

In [None]:
# Join-key columns for each table, grouped by the tables that share them.
_KEYS_BY_TABLE_GROUP = (
    (('accident', 'event'), ['CASEID', 'PSU']),
    (('gv', 've', 'vi'), ['CASEID', 'PSU', 'VEHNO']),
    (('oa', 'oi'), ['CASEID', 'PSU', 'VEHNO', 'OCCNO']),
)
merge_key = {}
for file in file_name:
    for tables, keys in _KEYS_BY_TABLE_GROUP:
        if file in tables:
            # Fresh list per table so entries never alias each other.
            merge_key[file] = list(keys)

## テーブルの結合
oi,eventテーブルを除き,2001~2015年の各年のテーブルを年ごとに結合する(oaを基準に,gvはinner結合,その他はleft結合)

In [None]:
# Build one merged frame per year, starting from the occupant table (oa).
# gv is inner-joined (occupants without a gv row are dropped); every other
# table is left-joined. oi and event are not merged here.
cds_merge = {}
for year in uyear:
    merged = cds['oa_{}'.format(year)]
    for file in file_name:
        if file in ('oa', 'oi', 'event'):
            continue
        how = 'inner' if file == 'gv' else 'left'
        merged = pd.merge(merged, cds['{}_{}'.format(file, year)], on = merge_key[file], how = how)
    cds_merge[year] = merged

## 編集用辞書データの作成

In [None]:
# Working dictionary for preprocessing; each entry initially refers to the
# same frame object as cds_merge (no copy is made).
cds_prepro = {year: cds_merge[year] for year in uyear}

## Car to Car・シートベルト着用・15歳以上・最も高いデルタVを記録した衝突対象がVEHNO1or2・自車と衝突車両のボディタイプが特殊車両やバイクでない 
GVのデータが存在する車両数なので,VEHFORMSのみで事故に関連した車両数が2台とは言い切れない
ベルトは3点ベルトのみ着用しているとみなす

In [None]:
# Row filter applied to every year (see the markdown above for the intent).
occupant_filter = ('MANUSE == 4 & VEHFORMS == 2 & MAIS <= 6 & AGE >= 15 & OBJCONT1 <= 2 '
                   '& otbdytyp < 80 & otbdytyp != 13 & BODYTYPE < 80 & BODYTYPE != 13')
all_sum = 0
for year in uyear:
    cds_prepro[year] = cds_prepro[year].query(occupant_filter)
    n_rows = len(cds_prepro[year])
    print(year, n_rows)  # remaining rows for this year
    all_sum += n_rows
print(all_sum)  # remaining rows over all years

## 自動運転中に取得可能な特徴量を全て抽出
絞り込みに使用したPARUSEは除く  


ACCIDENT:YEAR, MONTH, TIME, DAYWEEK, VEHFORMS  


GV:MODEL, MODELYR, MAKE, BODYTYPE, ALIGNMNT, ANTILOCK, CARGOWGT,  CONDTREE, CURBWGT, FOURWHDR, FRTWHLDR, FUELCODE,  LGTCOND, PROFILE, RELINTER, RESTYPE, SPLIMIT, SURCOND, SURTYPE, TRAFCONT, TRAFFLOW, TRAVELSP, TRCTLFCT, VEHTYPE, VEHUSE, VEHWGT, WGTCDTR, WHLDRWHL, OTBDYTYP, OTVEHWGT  


VE:FUELCAP1, FUELCAP2, FUELLOC1, FUELLOC2, FUELTYP1, FUELTYP2, FUELTNK1, FUELTNK2, ORIGAVTW, PDOF1, SHL1, SVL1, WHEELBAS 


VI:GLTYPWS, GLTYPLF, GLTYPLR, GLTYPRF, GLTYPRR, GLTYPBL, GLTYPRUF, GLTYPOTH, GLPREWS, GLPRELF, GLPRELR, GLPRERF, GLPRERR, GLPREBL, GLPRERUF, GLPREOTH, COLUMTYP, COLMTELE, COLMTILT, ODOMETER, ADAPTEQ  


OA:AGE, BAGAVAIL, BAGAVOTH, BAGMAINT, BAGTYPE, BELTANCH, EYEWEAR, HEIGHT, MANAVAIL, MANUSE, POSTURE, ROLE, SEATPOS, SEATRACK, SEATTYPE, SEX, STORIENT, WEIGHT  


作成した特徴量:otbsp

In [None]:
# Target column and the full list of columns kept for the rest of the notebook.
pcol = 'MAIS'
use_col = [
    pcol,
    'ACCTYPE', 'ROLLOBJ', 'IMPACTSP', 'GAD1', 'EVENTS',
    # join keys
    'CASEID', 'PSU', 'VEHNO',
    'YEAR', 'MONTH', 'TIME', 'DAYWEEK', 'DVTOTAL', 'VEHFORMS',
    'MODEL', 'MODELYR', 'MAKE', 'BODYTYPE', 'ALIGNMNT', 'ANTILOCK', 'CARGOWGT', 'CONDTREE', 'CURBWGT', 'FOURWHDR', 'FRTWHLDR', 'FUELCODE', 'LGTCOND', 'PROFILE', 'RELINTER', 'RESTYPE', 'SPLIMIT', 'SURCOND', 'SURTYPE', 'TRAFCONT', 'TRAFFLOW', 'TRAVELSP', 'TRCTLFCT', 'VEHTYPE', 'VEHUSE', 'VEHWGT', 'WGTCDTR', 'WHLDRWHL', 'otbdytyp', 'otvehwgt', 'MANCOLL', 'PREMOVE',
    'FUELCAP1', 'FUELCAP2', 'FUELLOC1', 'FUELLOC2', 'FUELTYP1', 'FUELTYP2', 'FUELTNK1', 'FUELTNK2', 'ORIGAVTW', 'PDOF1', 'SHL1', 'SVL1', 'WHEELBAS', 'OBJCONT1', 'OBJCONT2', 'DVC1', 'DVC2', 'PDOF2', 'DOF1',
    'GLTYPWS', 'GLTYPLF', 'GLTYPLR', 'GLTYPRF', 'GLTYPRR', 'GLTYPBL', 'GLTYPRUF', 'GLTYPOTH', 'GLPREWS', 'GLPRELF', 'GLPRELR', 'GLPRERF', 'GLPRERR', 'GLPREBL', 'GLPRERUF', 'GLPREOTH', 'COLUMTYP', 'COLMTELE', 'COLMTILT', 'ODOMETER', 'ADAPTEQ',
    'AGE', 'BAGAVAIL', 'BAGAVOTH', 'BAGMAINT', 'BAGTYPE', 'BELTANCH', 'EYEWEAR', 'HEIGHT', 'MANAVAIL', 'MANUSE', 'POSTURE', 'ROLE', 'SEATPOS', 'SEATRACK', 'SEATTYPE', 'SEX', 'STORIENT', 'WEIGHT', 'PARUSE', 'STBACINC',
]
for year in uyear:
    # Later cells add and overwrite columns on these frames in place, so
    # take an explicit copy instead of mutating a selection of the
    # filtered frame (avoids chained-assignment warnings/ambiguity).
    cds_prepro[year] = cds_prepro[year][use_col].copy()

## SURCONDの年毎に違う値を修正
2009年以降がより細分化されて記録されているので、2008年の基準に統一

In [None]:
# Disabled SURCOND recode, kept as an unassigned string literal so it never runs.
# If re-enabled it would rewrite SURCOND for years 9-15 in place, in this
# order: 4 -> 3, 5 -> 4, 7..9 -> 5, then (>87 or 6) -> 8.
'''
for year in range(9, 16):
    year = str(year)
    cds_prepro[year].loc[cds_prepro[year]['SURCOND'] == 4, 'SURCOND'] =  3
    cds_prepro[year].loc[cds_prepro[year]['SURCOND'] == 5, 'SURCOND'] = 4
    cds_prepro[year].loc[(cds_prepro[year]['SURCOND'] >= 7)  & (cds_prepro[year]['SURCOND'] <= 9), 'SURCOND'] = 5
    cds_prepro[year].loc[(cds_prepro[year]['SURCOND'] > 87) | (cds_prepro[year]['SURCOND']  == 6), 'SURCOND'] = 8
'''

## グルーピング

In [None]:
# Declarative binning rules, one entry per derived column:
#   [source column, derived column, operator list, threshold list, output value list]
# Rule i applies operator[i] with threshold[i] and writes output[i]. Operators, as
# interpreted by the loop that consumes this table:
#   '<'   : source <  threshold[i]
#   '>=<' : threshold[i] <= source < threshold[i + 1]
#   '>='  : source >= threshold[i]
#   '=='  : source == threshold[i]
#   '%'   : source % threshold[i][0] == threshold[i][1]
#   '%>'  : source % threshold[i][0] >  threshold[i][1]
# Rules are applied in order, so a later rule overwrites an earlier one on
# overlapping rows. When source and derived names are identical the column is
# recoded in place. Surplus trailing thresholds/outputs are simply unused.
grouping = [['MAIS', 'MAIS3', ['<', '>='], [3, 3], [0, 1]],\
            ['BAGAVAIL', 'BAGAVAIL', ['>='], [2], [0]],\
            ['BAGAVOTH', 'BAGAVOTH', ['>='], [2], [0]],\
            ['SEATPOS', 'SEATROW', ['>=<', '>=<', '>='], [10, 20, 30], [1, 2, 3]],\
            ['SEATPOS', 'SEATROW2', ['>=<', '>='], [10, 20], [1, 2]],\
            ['SEATPOS', 'SEATLR', ['%', '%', '%>', '%'], [[10, 1], [10, 2], [10, 3], [10, 3]], [1, 2, 2, 3]],\
            ['SEATPOS', 'SEATLR2', ['%', '%>'], [[10, 1], [10, 1]], [1, 2]],\
            ['SEX', 'SEX', ['>='], [2], [2]],\
            ['DVTOTAL', 'DVper10', ['<'] + ['>=<' for i in range(9)] + ['>='], [10] + list(range(10, 110, 10)), list(range(11))],\
            ['AGE', 'AGEper5', ['<'] + ['>=<' for i in range(21, 86, 5)] + ['>='], [21] + list(range(21, 92, 5)), list(range(15, 91, 5))],\
            ['AGE', 'AGEG', ['<', '>=<', '>='], [60, 60, 70], [0, 1, 2]],\
            ['AGE', 'AGE55', ['<', '>='], [56, 56], [0, 1]],\
            ['CURBWGT', 'CURBper10', ['<'] + ['>=<' for i in range(46, 407, 10)] + ['>='], [46] + list(range(46, 417, 10)), list(range(45, 426, 10))],\
            ['otvehwgt', 'otvwper10', ['<'] + ['>=<' for i in range(46, 407, 10)] + ['>='], [46] + list(range(46, 417, 10)), list(range(45, 426, 10))],\
            ['MODELYR', 'MODELG', ['<', '>=<', '>=<', '>='], [1990, 1990, 1997, 2001], list(range(4))],\
            #['PDOF1', 'PDOFG', ['<', '>=<', '>=<', '>=<', '>='], [31, 40, 141, 211, 321], [0, 1, 2, 3, 0]],\
            ['PDOF1', 'PDOFG', ['<'] + ['>=<' for i in range(11)] +['>='], [15, 15, 45, 75, 105, 135, 165, 195, 225, 255, 285, 315, 345], list(range(12)) + [0]],\
            ['CURBWGT', 'WGTG', ['<', '>=<', '>='], [101, 101, 201], [0, 1, 2]],\
            #['CURBWGT', 'WGTG', ['<', '>=<', '>=<', '>='], [101, 101, 151, 201], [0, 1, 2, 3]],\
            ['otvehwgt', 'owgtg', ['<', '>=<', '>='], [101, 101, 201], [0, 1, 2]],\
            #['otvehwgt', 'owgtg', ['<', '>=<', '>=<', '>='], [101, 101, 151, 201], [0, 1, 2, 3]],\
            ['BMI', 'BMIG', ['<'] + ['>=<' for i in range(6)] + ['>='], [16, 16, 17, 18.5, 25, 30, 35, 40], list(range(8))],\
            ['BMI', 'BMIGJP', ['<'] + ['>=<' for i in range(4)] + ['>='], [18.5, 18.5, 25, 30, 35, 40], list(range(6))],\
            ['ACCTYPE', 'ACCG', ['<'] + ['>=<' for i in range(15)], [1, 1, 6, 11, 20, 34, 44, 50, 54, 64, 68, 76, 86, 94, 98, 99, 100], list(range(16))], \
            ['ACCTYPE', 'ACCG2',\
            ['==' for i in range(14)] + ['>=<', '==', '=='] + ['>=<', '==', '==', '=='] + ['==' for i in range(9)] + ['==' for i in range(6)] + ['>=<', '==', '>=<', '=='] + ['>=<' if i % 2 == 0 else '==' for i in range(6)] + ['==' for i in range(13)], \
            [0, 11, 32, 33, 42, 43, 74, 75, 84, 85, 90, 91, 98, 99] + [1, 10, 92] + [50, 63, 80, 81] + [20, 24, 28, 34, 36, 38, 40, 76, 78] + [69, 71, 73, 83, 86, 88] + [44, 49, 64, 67] + [21, 23, 25, 27, 29, 31, 35, 37, 39, 41, 68, 70, 72, 77, 79, 82, 87, 89, 93], \
            [0 for i in range(14)] + [1 for i in range(3)] + [2 for i in range(4)] + [3 for i in range(9)] + [4 for i in range(6)] + [5 for i in range(4)] + [6 for i in range(19)]], \
            ['ACCTYPE', 'ACCG3', \
            ['==' for i in range(11)] + ['>=<', '==', '>=<', '=='] + ['==' for i in range(3)] + ['==' for i in range(4)] + ['>=<', '==', '==', '=='] + ['>=<'] + ['==' for i in range(13)] + ['>=<'] + ['==' for i in range(5)] + ['==' for i in range(4)] + ['==' for i in range(4)] + ['==' for i in range(8)] + ['==' for i in range(4)] + ['==' for i in range(4)],\
            [0, 32, 33, 74, 75, 84, 85, 90, 91, 98, 99] + [50, 53, 64, 67] + [11, 42, 43] + [45, 48, 49, 68] + [1, 5, 73, 76] + [54, 63, 20, 24, 28, 34, 36, 38, 40, 69, 83, 86, 88, 93] + [6, 10, 71, 78, 80, 81] + [47, 72, 82, 89] + [23, 27, 31, 77] + [21, 25, 29, 35, 37, 39, 41, 92] + [22, 26, 30, 79] + [44, 46, 70, 87], \
            [0 for i in range(11)] + [1 for i in range(4)] + [5 for i in range(3)] + [7 for i in range(4)] + [3 for i in range(4)] + [4 for i in range(14)] + [2 for i in range(6)] + [6 for i in range(4)] + [9 for i in range(4)] + [11 for i in range(8)] + [10 for i in range(4)] + [8 for i in range(4)]]]
            
def _body_type_rules(code):
    """Return (row mask, group id) pairs that bin a body-type code series."""
    return [
        ((code <= 9) | (code == 11) | (code == 12), 0),  # Sedan
        ((code >= 14) & (code <= 19), 1),  # SUV
        (((code >= 20) & (code <= 29)) | (code == 60), 2),  # Van
        ((code == 10) | ((code >= 30) & (code <= 39)) | (code == 74), 3),  # Pickup
        (((code >= 40) & (code <= 49)) | ((code >= 61) & (code <= 70)) | ((code >= 78) & (code <= 79)), 4),  # Truck
        ((code >= 50) & (code <= 59), 5),  # Bus
    ]


for year in uyear:
    # Alias only: every assignment below mutates cds_prepro[year] itself.
    df = cds_prepro[year]

    # BMI = WEIGHT / (HEIGHT / 100) ** 2; left missing when either input is missing.
    df['BMI'] = df['WEIGHT'] / ((df['HEIGHT'] / 100) ** 2)
    df.loc[df['WEIGHT'].isnull(), 'BMI'] = None
    df.loc[df['HEIGHT'].isnull(), 'BMI'] = None

    # Apply the declarative binning rules from `grouping`, in order.
    for glist in grouping:
        col, ncol, h2u, thr, gnum = glist
        for i, usage in enumerate(h2u):
            # Re-read the source each time: in-place rules (ncol == col)
            # change it between iterations.
            values = df[col]
            if usage == '<':
                mask = values < thr[i]
            elif usage == '>=<':
                mask = (values >= thr[i]) & (values < thr[i + 1])
            elif usage == '>=':
                mask = values >= thr[i]
            elif usage == '==':
                mask = values == thr[i]
            elif usage == '%':
                mask = values % thr[i][0] == thr[i][1]
            elif usage == '%>':
                mask = values % thr[i][0] > thr[i][1]
            else:
                continue  # unknown operator: no assignment
            df.loc[mask, ncol] = gnum[i]

    # Body-type groups for the own vehicle (BODYG) and the other vehicle (obodyg).
    for source, target in (('BODYTYPE', 'BODYG'), ('otbdytyp', 'obodyg')):
        for mask, group in _body_type_rules(df[source]):
            df.loc[mask, target] = group

    # POS_DOF = 1 when PDOF1 lies in the angular window paired with the
    # occupant's seat (SEATLR2, SEATROW2); 0 otherwise.
    df['POS_DOF'] = 0
    pdof = df['PDOF1']
    seat_lr = df['SEATLR2']
    seat_row = df['SEATROW2']
    pos_dof_rules = [
        (((pdof >= 0) & (pdof <= 110)) | ((pdof >= 340) & (pdof <= 360)), 2, 1),
        ((pdof >= 70) & (pdof <= 200), 2, 2),
        ((pdof >= 160) & (pdof <= 290), 1, 2),
        (((pdof >= 250) & (pdof <= 360)) | ((pdof >= 0) & (pdof <= 20)), 1, 1),
    ]
    for pdof_mask, lr, row in pos_dof_rules:
        df.loc[pdof_mask & ((seat_lr == lr) & (seat_row == row)), 'POS_DOF'] = 1

In [None]:
#for year in  uyear:
#    cds_prepro[year] = cds_prepro[year].query('BODYG <=  3 and MANCOLL != 0')

## 車両毎にMAISの最大値のみ残す
MAISでソートし, VEHNOが重複しているレコードを上にあるものを残して削除  
MAIS3+のカラムは作成済みだが,MAISのカラムは後段の出力・集計でも使用するため,ここではドロップしない

In [None]:
# Keep one row per vehicle: sort by the target (pcol) descending so that
# drop_duplicates, which keeps the first occurrence, retains the row with
# the highest value for each (CASEID, PSU, VEHNO).
vehicle_key = ['CASEID', 'PSU', 'VEHNO']
for year in uyear:
    cds_prepro[year] = (
        cds_prepro[year]
        .sort_values(by = pcol, ascending = False)
        .drop_duplicates(subset = vehicle_key)
    )

## 1~15年を結合

In [None]:
# Stack years 1..15 (2001-2015) into a single frame, in year order.
cds_all = pd.concat([cds_prepro[str(y)] for y in range(1, 16)])
len(cds_all)

## MANCOLLを整数値に変換

In [None]:
# Re-derive MANCOLL from its non-missing values cast to int64 and append
# it back as the last column.
cds_all.reset_index(drop = True, inplace = True)
mc = pd.DataFrame(cds_all['MANCOLL'].dropna())
mc = mc.astype(np.int64)
len(mc)
len(cds_all)
cds_all.drop('MANCOLL', axis = 1, inplace = True)
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# result on the original index is the documented replacement.
# NOTE(review): rows whose MANCOLL was missing come back as NaN, which
# presumably makes the column float again - confirm this is intended.
cds_all = pd.concat([cds_all, mc], axis = 1).reindex(cds_all.index)

## EVENTテーブルで車以外と衝突のあったデータを削除

In [None]:
#絞り込み用(もっと綺麗に書けそう)
cds_all['CASEID'] = cds_all['CASEID'].astype(str)
plist = list(cds_all['PSU'])
ylist = list(cds_all['YEAR'].astype(int)-2000)
vehno = list(cds_all['VEHNO'])
ixlist = list(cds_all.index)
for i, num in enumerate(ylist):
    cid = cds_all.at[ixlist[i], 'CASEID']
    cds['event_{}'.format(num)]['CASEID'] = cds['event_{}'.format(num)]['CASEID'].astype(str)
    if len(cds['event_{}'.format(num)].query('PSU == {} & CASEID == "{}" & VEHNUM == {} & OBJCONT > 40'.format(plist[i], cid, vehno[i]))) > 0:
        #if len(cds_all.query('(PSU == {} & CASEID == "{}" & VEHNO == {} & YEAR == {})'.format(plist[i], cid, vehno[i], num + 2000))) > 1:
            #print(cds_all.query('(PSU == {} & CASEID == "{}" & VEHNO == {} & YEAR == {})'.format(plist[i], cid, vehno[i], num + 2000)))
        cds_all = cds_all.query('~(PSU == {} & CASEID == "{}" & VEHNO == {} & YEAR == {})'.format(plist[i], cid, vehno[i], num + 2000))
len(cds_all)

## EVENTテーブルから最初に衝突した際の衝突箇所を抽出しGADに格納

In [None]:
# GAD / ogad hold the GADEV value of this vehicle / the other vehicle taken
# from the first vehicle-1-vs-vehicle-2 event row; both start as 'nan'.
cds_all['GAD'] = np.nan
cds_all['GAD'] = cds_all['GAD'].astype(str)
cds_all['ogad'] = np.nan
cds_all['ogad'] = cds_all['ogad'].astype(str)
plist = list(cds_all['PSU'])
ylist = list(cds_all['YEAR'].astype(int)-2000)
vehno = list(cds_all['VEHNO'])
ixlist = list(cds_all.index)
for i, num in enumerate(ylist):
    cid = cds_all.at[ixlist[i], 'CASEID']
    tempdf = cds['event_{}'.format(num)].query('PSU == {} & CASEID == "{}" & ((VEHNUM == 1 & OBJCONT == 2) | (VEHNUM == 2 & OBJCONT == 1))'.format(plist[i], cid))
    # Keep only the first matching event row of the case.
    tempdf = tempdf.drop_duplicates(subset = ['PSU', 'CASEID'])
    # An empty match used to raise IndexError on .iloc[0]; it is now handled
    # like "not the first event" and the row is removed.
    if len(tempdf) > 0 and tempdf['ACCSEQ'].iloc[0] == 1:
        # v1 / v2: GADEV belonging to vehicle 1 / vehicle 2 (the columns
        # are swapped when the event row is recorded from vehicle 2).
        if tempdf['VEHNUM'].iloc[0] == 1:
            v1 = str(tempdf['GADEV1'].iloc[0])
            v2 = str(tempdf['GADEV2'].iloc[0])
        else:
            v1 = str(tempdf['GADEV2'].iloc[0])
            v2 = str(tempdf['GADEV1'].iloc[0])
        if vehno[i] == 1:
            cds_all.at[ixlist[i], 'GAD'] = v1
            cds_all.at[ixlist[i], 'ogad'] = v2
        elif vehno[i] == 2:
            cds_all.at[ixlist[i], 'GAD'] = v2
            cds_all.at[ixlist[i], 'ogad'] = v1
    else:
        cds_all = cds_all.query('~(PSU == {} & CASEID == "{}" & VEHNO == {} & YEAR == {})'.format(plist[i], cid, vehno[i], num + 2000))

# Encode each distinct string as an integer in order of first appearance.
# An exact-value map replaces the former chain of str.replace calls, which
# matched substrings (regex in older pandas) and could re-replace codes
# that had already been written.
for gad_column in ('GAD', 'ogad'):
    gadval = list(cds_all[gad_column].unique())
    for i, val in enumerate(gadval):
        print(val, ':', i)
    gad_codes = {val: i for i, val in enumerate(gadval)}
    cds_all[gad_column] = cds_all[gad_column].map(gad_codes).astype(int)

# NOTE(review): the codes depend on order of appearance, so which values
# pass the `< 4` filter is data dependent - check the mapping printed above.
cds_all = cds_all.query('GAD < 4 and ogad < 4')
len(cds_all)
cds_all['SHL1'] = cds_all['SHL1'].map({b'Y': 0, b'D': 1, b'Z': 2, b'L': 3, b'P': 4, b'F': 5, b'R': 6, b'9': 7, b'B': 8, b'C': 9})

In [None]:
# Sanity counts: count2 = rows with exactly one vehicle-1-vs-vehicle-2 event
# at ACCSEQ == 1; count = those of them that also have a DVTOTAL value.
plist = list(cds_all['PSU'])
ylist = list(cds_all['YEAR'].astype(int)-2000)
vehno = list(cds_all['VEHNO'])
ixlist = list(cds_all.index)
first_v2v_query = 'PSU == {} & CASEID == "{}" & ((VEHNUM == 1 & OBJCONT == 2) | (VEHNUM == 2 & OBJCONT == 1)) & ACCSEQ == 1'
count = 0
count2 = 0
for i, num in enumerate(ylist):
    cid = cds_all.at[ixlist[i], 'CASEID']
    matches = cds['event_{}'.format(num)].query(first_v2v_query.format(plist[i], cid))
    if len(matches) != 1:
        continue
    count2 += 1
    if not np.isnan(cds_all.at[ixlist[i], 'DVTOTAL']):
        count += 1
len(cds_all)
count2
count

## 出力用DF
リサンプリング用数値データで出力

In [None]:
# Export the ungrouped (numeric) columns.
# NOTE(review): the grouped export cell writes to the same
# 'grouped_data.csv' path, so whichever cell runs last wins.
col = [
    'YEAR', 'MAIS', 'MAIS3', 'MANCOLL', 'DVTOTAL', 'BODYG', 'obodyg',
    'BAGAVAIL', 'BAGAVOTH', 'SEATROW2', 'SEATLR2', 'SEX', 'MODELYR',
    'CURBWGT', 'otvehwgt', 'AGE', 'BMI', 'PDOF1',
]
option = ['SEATROW', 'PDOF1', 'SEATLR', 'BMIG']  # not used in this cell
output_df = cds_all[col]
output_df.to_csv(os.path.join(path, 'grouped_data.csv'), sep = ',', index = False, encoding = 'utf-8')

## 出力用DF
グルーピング後を出力

In [None]:
# Export the grouped (binned) columns.
col = [
    'YEAR', 'MAIS', 'MAIS3', 'ACCG', 'ACCG2', 'ACCG3', 'GAD', 'ogad',
    'DVTOTAL', 'DVper10', 'BODYG', 'obodyg', 'BAGAVAIL', 'BAGAVOTH',
    'SEATROW2', 'SEATLR2', 'SEX', 'MODELG', 'WGTG', 'owgtg', 'AGEG',
    'BMIGJP', 'PDOFG',
]
option = ['SEATROW', 'PDOF1', 'SEATLR', 'BMIG']  # not used in this cell
output_df = cds_all[col]
output_df.to_csv(os.path.join(path, 'grouped_data.csv'), sep = ',', index = False, encoding = 'utf-8')

In [None]:
# Years 1..14 form the training frame; year 15 is held out as the test frame.
train_frames = [cds_prepro['1']]
for year in range(2, 15):
    year = str(year)
    train_frames.append(cds_prepro[year])
X_train = pd.concat(train_frames)
X_test = cds_prepro['15']

In [None]:
# Restrict the training frame to the targets plus grouped features.
train_columns = [
    'MAIS', 'MAIS3', 'MANCOLL', 'DVper10', 'MODELG', 'WGTG', 'owgtg',
    'PDOFG', 'BAGAVAIL', 'BAGAVOTH', 'SEATROW2', 'SEATLR2', 'SEX', 'AGEG',
    'BMIG', 'BMIGJP', 'BODYG', 'obodyg', 'PREMOVE',
]
X_train = X_train[train_columns]

In [None]:
# Narrow the training frame to the feature columns only, in this order.
feature_columns = [
    'MODELG', 'BODYG', 'obodyg', 'WGTG', 'owgtg', 'BAGAVOTH', 'BAGAVAIL',
    'SEX', 'AGEG', 'SEATLR2', 'SEATROW2', 'BMIGJP', 'MANCOLL', 'DVper10',
    'PDOFG',
]
X_train = X_train[feature_columns]

## 特殊なACCTYPEに対応するEVENTデータの確認

In [None]:
# Peek at the event table for whichever year `num` was left on by the most
# recently executed loop, then show the current row count of cds_all.
cds['event_{}'.format(num)].head()
len(cds_all)

In [None]:
# Frequency of each SHL1 value in cds_all.
cds_all['SHL1'].value_counts()

In [None]:
# Exploratory check: for rows with a given ACCTYPE, show the event rows of
# the first 30 cases and count how many rows have any event row at all.
from IPython.display import display

actype = 11
hoge = cds_all.query('ACCTYPE == {}'.format(actype))
len(hoge)
len(cds_all)
cds_all.EVENTS.value_counts()
hoge.head(10)
fuga = list(hoge['PSU'])
hogera = list(hoge['YEAR'].astype(int)-2000)
vehno = list(hoge['VEHNO'])
hogepiyo = 0
count = 0
# Convert the event key columns once per referenced year; this used to be
# repeated for every row inside the loop below.
for event_year in sorted(set(hogera)):
    event = cds['event_{}'.format(event_year)]
    event['CASEID'] = event['CASEID'].astype(str)
    event['VEHNUM'] = event['VEHNUM'].astype(int)
for i, num in enumerate(hogera):
    piyo = hoge.at[hoge.index[i], 'CASEID']
    case_events = cds['event_{}'.format(num)].query('PSU == {} and CASEID == "{}"'.format(fuga[i], piyo))
    if len(case_events) > 0:
        if count < 30:
            count += 1
            # A bare expression inside a loop body is discarded, so the
            # frame is displayed explicitly (capped at 30 by `count`).
            display(case_events)
        hogepiyo += 1
hogepiyo

In [None]:
# Inspect the index labels of cds_all.
len(cds_all)
hoge = list(cds_all.index)
hoge[1000]  # label at position 1000; raises IndexError with fewer than 1001 rows
ixlist = list(cds_all.index)

In [None]:
# Column dtypes of the combined frame.
cds_all.dtypes

In [None]:
# Number of rows that repeat an earlier (YEAR, CASEID, PSU, VEHNO) combination.
len(cds_all[cds_all.duplicated(subset=['YEAR', 'CASEID', 'PSU', 'VEHNO'])])

In [None]:
# Show DOF1 next to the binned PDOFG column.
cds_all[['DOF1', 'PDOFG']]

## 相関係数の出力

In [None]:
# Correlation matrix of the columns currently listed in `col`: show it,
# save it as CSV, and plot it as a heatmap.
X_train = cds_all[col]
corr = X_train.corr().round(3)
corr.head(len(corr))
corr.to_csv('/Users/kei/Desktop/table.csv')
# Upper-triangle mask (diagonal included); only needed if the `mask`
# argument of the heatmap is enabled.
n_features = corr.shape[1]
triang_mask = np.triu(np.ones((n_features, n_features)))
sns.heatmap(corr, center = 0, square = True, vmax = 1, vmin = -1)
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Distribution of one column: value counts, a count plot, and the ten most
# frequent values kept in count_df.
col = 'BMIGJP'
value_freq = cds_all[col].value_counts()
value_freq
sns.countplot(x = col, data = cds_all)
count_df = value_freq.rename_axis('unique_values').reset_index(name='counts')
count_df = count_df.iloc[:10]
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Same distribution check, additionally drawing the ten most frequent
# values as a bar chart.
col = 'BMIGJP'
value_freq = cds_all[col].value_counts()
value_freq
sns.countplot(x = col, data = cds_all)
count_df = value_freq.rename_axis('unique_values').reset_index(name='counts')
count_df = count_df.iloc[:10]
count_df.plot.bar(y = 'counts', x = 'unique_values')
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# 2x2 table of row counts: outer index SEATROW2 (1, 2), inner index SEATLR2 (1, 2).
pos = [
    [len(cds_all.query('SEATLR2 == {} and SEATROW2 == {}'.format(lr, row))) for lr in (1, 2)]
    for row in (1, 2)
]
pos
sns.heatmap(pos)

In [None]:
# Histogram of (other vehicle weight - own curb weight), plus a 50-wide
# binned version stored in 'diffper50'.
vect = pd.DataFrame(cds_all[['CURBWGT', 'otvehwgt', 'MAIS', 'MAIS3']].dropna(how = 'any'))
vect['diff'] = vect['otvehwgt'] - vect['CURBWGT']
vect.describe()
sns.distplot(vect['diff'], kde = False, bins = 100)
weight_diff = vect['diff']
# Everything at or below -200 is labelled -200, everything above 200 is
# labelled 200; each bin (i, i + 50] in between is labelled by its midpoint.
vect.loc[weight_diff <= -200, 'diffper50'] = -200
for i in range(-200, 200, 50):
    in_bin = (weight_diff > i) & (weight_diff <= i + 50)
    vect.loc[in_bin, 'diffper50'] = i + 25
vect.loc[weight_diff > 200, 'diffper50'] = 200
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# BMI histogram over rows that have both HEIGHT and WEIGHT.
vect = pd.DataFrame(cds_all[['HEIGHT', 'WEIGHT', 'MAIS', 'MAIS3']].dropna(how = 'any'))
len(cds_all)
len(vect)
height_m = vect['HEIGHT'] / 100
vect['BMI'] = vect['WEIGHT'] / (height_m * height_m)
# Collapse the tails: values below 16 become 15, values above 60 become 60.
vect['BMI'] = vect['BMI'].mask(vect['BMI'] < 16, 15)
vect['BMI'] = vect['BMI'].mask(vect['BMI'] > 60, 60)

sns.distplot(vect['BMI'], kde = False, bins = 36)
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Histogram of the precomputed BMI column, tail-collapsed and rounded to integers.
hoge = pd.DataFrame(cds_all['BMI'].dropna())
# Values below 16 become 15, values above 60 become 60.
hoge['BMI'] = hoge['BMI'].mask(hoge['BMI'] < 16, 15)
hoge['BMI'] = hoge['BMI'].mask(hoge['BMI'] > 60, 60)
hoge = hoge.round().astype(np.int64)
len(hoge)
sns.distplot(hoge, kde = False, bins = 36)

In [None]:
# Lift of each `col` value within MAIS3 == 0 / 1: the share of the value
# inside the class divided by its overall share in `vect`.
col_s = vect[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
col_s['counts'] = col_s['counts'] / (col_s['counts'].sum())
# Overall share looked up by category value. Dividing by col_s['counts']
# directly aligned on the value_counts rank index, which pairs different
# categories whenever the frequency order differs between class and total.
overall_share = col_s.set_index('unique_values')['counts']
count = {}
for i in range(0, 2):
    hoge = vect.query('MAIS3 == {}'.format(i))
    per_class = hoge[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
    cs = per_class['counts'].sum()
    per_class['MAIS3'] = i
    per_class['counts'] = per_class['counts'] / cs  # share within the class
    per_class['counts'] = per_class['counts'] / per_class['unique_values'].map(overall_share)
    count['{}'.format(i)] = per_class

vect_b = count['0']
vect_b = pd.concat([vect_b, count['1']])
vect_b.head(len(vect_b))
sns.heatmap(vect_b.pivot(index = 'unique_values', columns = 'MAIS3', values = 'counts'))
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Complete-case subset of the current `col` plus MAIS / MAIS3, cast to int64.
vect = pd.DataFrame(cds_all[[col, 'MAIS', 'MAIS3']]).dropna(how  = 'any').astype(np.int64)

In [None]:
# Histogram of MODELYR over rows where it and both MAIS columns are present.
col = 'MODELYR'
complete_rows = cds_all[[col, 'MAIS', 'MAIS3']].dropna(how = 'any')
vect = pd.DataFrame(complete_rows)
sns.distplot(vect[col], hist = True, kde = False)
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Lift of each `col` value within every MAIS level 0..6: the share of the
# value inside the level divided by its overall share in cds_all.
col = 'POS_DOF'
col_s = cds_all[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
csum = col_s['counts'].sum()
col_s['counts'] = col_s['counts'] / (csum)
# Overall share looked up by category value. Dividing by col_s['counts']
# directly aligned on the value_counts rank index, which pairs different
# categories whenever the frequency order differs between level and total.
overall_share = col_s.set_index('unique_values')['counts']
count = {}
for i in range(0, 7):
    hoge = cds_all.query('MAIS == {}'.format(i))
    per_level = hoge[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
    per_level = per_level.sort_values('unique_values', ascending = False)
    cs = per_level['counts'].sum()
    per_level['MAIS'] = i
    per_level['counts'] = per_level['counts'] / cs  # share within the MAIS level
    per_level['counts'] = per_level['counts'] / per_level['unique_values'].map(overall_share)
    count['{}'.format(i)] = per_level

vect = count['0']
for i in range(1, 7):
    vect = pd.concat([vect, count['{}'.format(i)]])

sns.heatmap(vect.pivot(index = 'unique_values', columns = 'MAIS', values = 'counts'))
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Count plot of the current `col` in the current `vect`; both are whatever an
# earlier cell left behind, so `vect` must contain a column named `col`.
sns.countplot(x=col, data=vect)
plt.savefig('/Users/kei/Desktop/fig.png')

In [None]:
# Lift of each `col` value within MAIS3 == 0 / 1: the share of the value
# inside the class divided by its overall share in cds_all.
col = 'obodyg'
col_s = cds_all[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
csum = col_s['counts'].sum()
col_s['counts'] = col_s['counts'] / (csum)
# Overall share looked up by category value. Dividing by col_s['counts']
# directly aligned on the value_counts rank index, which pairs different
# categories whenever the frequency order differs between class and total.
overall_share = col_s.set_index('unique_values')['counts']

count = {}
for i in range(0, 2):
    hoge = cds_all.query('MAIS3 == {}'.format(i))
    per_class = hoge[col].value_counts().rename_axis('unique_values').reset_index(name='counts')
    cs = per_class['counts'].sum()
    per_class['MAIS3'] = i
    per_class['counts'] = per_class['counts'] / cs  # share within the class
    per_class['counts'] = per_class['counts'] / per_class['unique_values'].map(overall_share)
    count['{}'.format(i)] = per_class

vect = count['0']
vect = pd.concat([vect, count['1']])
vect.head(len(vect))
sns.heatmap(vect.pivot(index = 'unique_values', columns = 'MAIS3', values = 'counts'))
# Leave the per-class tables ordered by category value for later inspection.
count['1'] = count['1'].sort_values('unique_values')
count['0'] = count['0'].sort_values('unique_values')
plt.savefig('/Users/kei/Desktop/fig.png')