In [121]:
import pandas as pd
import numpy as np
import random
import os
print(os.getcwd())


/Users/solomonzhang/Downloads


In [122]:
df = pd.read_csv("filtered_data.csv")
print(df.shape)
print(df.info())
print(df.head())
#(6129757, 9)

(6129757, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6129757 entries, 0 to 6129756
Data columns (total 9 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Registration State  object
 1   Plate Type          object
 2   Violation Code      int64 
 3   Vehicle Body Type   object
 4   Vehicle Make        object
 5   Violation Precinct  int64 
 6   Violation Time      object
 7   Violation County    object
 8   Vehicle Color       object
dtypes: int64(2), object(7)
memory usage: 420.9+ MB
None
  Registration State Plate Type  Violation Code Vehicle Body Type  \
0                 NY        PAS              67               SDN   
1                 NY        PAS              51              SUBN   
2                 NJ        PAS              63               SDN   
3                 NY        PAS              63              SUBN   
4                 NJ        PAS              63               SDN   

  Vehicle Make  Violation Precinct Violation Time V

In [123]:
# question we are answering
"""
We will be building a model to see if we could predict violation 
code based on vehicle descriptions and location.
Hence, this brings the awareness where people could try to be more careful if they match description
"""

'\nWe will be building a model to see if we could predict violation \ncode based on vehicle descriptions and location.\nHence, this brings the awareness where people could try to be more careful if they match description\n'

In [124]:
# observe data
print(df.isnull().sum())
# there are only null values in the following columns
# Vehicle Body Type
# Vehicle Make
# Violation Time
# Violation County
# Vehicle Color

Registration State         0
Plate Type                 0
Violation Code             0
Vehicle Body Type      15645
Vehicle Make            5485
Violation Precinct         0
Violation Time            66
Violation County       16403
Vehicle Color         542695
dtype: int64


In [125]:
# violation code -> clean this first 
#     find lower threshold and remove violation tickets that dont lower threshold
# time -> format HHMM(A/P) -> use this to categorize time
# Vehicle Maker 
# Registration State
# Plate Type
# Vehicle Body Type
# Vehicle Color
#     find lower threshold by count and remove violation tickets that dont lower threshold
#     do by count 

# reason why we decide threshold by count
# the more people that has this description, the more relevant it is
# the less people that has this description, the less relevant it is 
# in terms of reflecting general population

# Hence
# reasoning to remove lower threshold
# this is because these descriptions do not reflect the overall population as the descriptions are not common
# lower threshold does not reflect 

# reason to keep upper threshold
# the more people that follows these descriptions, the more it reflects the general population
# hence, if we to remove upper threshold, it will essentially remove a huge chunk of our data that reflects general
# population

# for example
# if a violation code 'X' has only 1 violation 
# then it does not make sense to include 'X' since it does not reflect overall population
# and if a violation code 'Y' has 1000000
# then it does make sense to include Y

# in comparison to the total number of data
# a small portion of data has None values
# also in this context, we cannot use stuff we dont know as filling the none values will introduce inaccurate values
# so we could first drop Null values as it would not have a significant impact on the data

df = df.dropna()
print(df.isnull().sum())



Registration State    0
Plate Type            0
Violation Code        0
Vehicle Body Type     0
Vehicle Make          0
Violation Precinct    0
Violation Time        0
Violation County      0
Vehicle Color         0
dtype: int64


In [126]:
# approximately 500000 rows are removed mostly from color
print(df.shape)

(5562715, 9)


In [127]:
# inspect each column values to what needs further processing
print(df['Registration State'].unique())
# '99' do not look like a state so see how many vehicles are under this state
df = df[df['Registration State'] != '99']
print(df['Registration State'].unique())


['NY' 'NJ' 'PA' 'FL' 'OH' 'MA' 'NC' 'UT' '99' 'GA' 'TN' 'CA' 'CT' 'MS'
 'RI' 'TX' 'MN' 'AL' 'WA' 'ME' 'AR' 'IA' 'GV' 'OK' 'MI' 'DC' 'VA' 'VT'
 'ID' 'SC' 'AK' 'ON' 'MO' 'AZ' 'MD' 'LA' 'IN' 'NH' 'DE' 'OR' 'ND' 'NV'
 'QB' 'BC' 'NM' 'HI' 'IL' 'WI' 'MT' 'CO' 'MB' 'KY' 'PR' 'NS' 'WV' 'KS'
 'SD' 'NB' 'WY' 'NE' 'DP' 'AB' 'PE' 'SK' 'FO']
['NY' 'NJ' 'PA' 'FL' 'OH' 'MA' 'NC' 'UT' 'GA' 'TN' 'CA' 'CT' 'MS' 'RI'
 'TX' 'MN' 'AL' 'WA' 'ME' 'AR' 'IA' 'GV' 'OK' 'MI' 'DC' 'VA' 'VT' 'ID'
 'SC' 'AK' 'ON' 'MO' 'AZ' 'MD' 'LA' 'IN' 'NH' 'DE' 'OR' 'ND' 'NV' 'QB'
 'BC' 'NM' 'HI' 'IL' 'WI' 'MT' 'CO' 'MB' 'KY' 'PR' 'NS' 'WV' 'KS' 'SD'
 'NB' 'WY' 'NE' 'DP' 'AB' 'PE' 'SK' 'FO']


In [128]:
print(df['Plate Type'].unique())
# '999' do not look like a plate type so see how many under this and remove
df = df[df['Plate Type'] != '999']
print(df['Plate Type'].unique())
print(df['Plate Type'].value_counts())

['PAS' 'OMS' '999' 'OMT' 'SRF' 'COM' 'MED' 'ORG' 'MOT' 'TRL' 'HIS' 'SPO'
 'MCL' 'NYS' 'RGL' 'VPL' 'OMV' 'OML' 'VAS' 'HIR' 'PSD' 'HAM' 'TOW' 'SOS'
 'PHS' 'JSC' 'ITP' 'CMB' 'AGC' 'NLM' 'NYC' 'CSP' 'TRC' 'OMR' 'SEM' 'APP'
 'SRN' 'LMB' 'SCL' 'IRP' 'DLR' 'NYA' 'CHC' 'STA' 'TRA' 'LTR' 'RGC' 'ORC'
 'ARG' 'SPC' 'LMA' 'SUP' 'ATV' 'AYG' 'AMB' 'BOB' 'AGR' 'STG' 'CLG' 'CME'
 'LMC' 'CBS' 'JCA' 'MCD' 'CCK' 'CMH' 'PPH' 'OMF' 'THC' 'HSM' 'GSM' 'WUG'
 'USC' 'FAR' 'JWV' 'FPW' 'HOU' 'LUA' 'OMO']
['PAS' 'OMS' 'OMT' 'SRF' 'COM' 'MED' 'ORG' 'MOT' 'TRL' 'HIS' 'SPO' 'MCL'
 'NYS' 'RGL' 'VPL' 'OMV' 'OML' 'VAS' 'HIR' 'PSD' 'HAM' 'TOW' 'SOS' 'PHS'
 'JSC' 'ITP' 'CMB' 'AGC' 'NLM' 'NYC' 'CSP' 'TRC' 'OMR' 'SEM' 'APP' 'SRN'
 'LMB' 'SCL' 'IRP' 'DLR' 'NYA' 'CHC' 'STA' 'TRA' 'LTR' 'RGC' 'ORC' 'ARG'
 'SPC' 'LMA' 'SUP' 'ATV' 'AYG' 'AMB' 'BOB' 'AGR' 'STG' 'CLG' 'CME' 'LMC'
 'CBS' 'JCA' 'MCD' 'CCK' 'CMH' 'PPH' 'OMF' 'THC' 'HSM' 'GSM' 'WUG' 'USC'
 'FAR' 'JWV' 'FPW' 'HOU' 'LUA' 'OMO']
PAS    4516130
COM     551819
OMT     2093

In [129]:
print(df['Violation Code'].unique())
# all violation codes in data
# choose violations that most reflect general population
print(df['Violation Code'].value_counts())
# df violtion code where code not in code_set
# create a dataframe
# violation code : count
# 



[67 51 63 98 10 45 14 20 68 27 53 19 40 74 16 17 48 62 70 46 50 71 21 95
 24 41 99 75 83 18 37 38 78 97 61 96 52 13 11 39 54 49  5 31 60 90 91  6
 92 85 55 56  9 66  0 80 36 72 22 26  7 69 12 87 33 73 76 79 82 64 25  4
 47 42 35 84 32  8 77 58 23 65 34 89 57 81 44 59  1 29  3  2 30 93 94]
36    1954915
21     716783
38     343279
14     290820
20     244402
       ...   
34          3
57          3
81          1
93          1
94          1
Name: Violation Code, Length: 95, dtype: int64


In [130]:
print(df['Vehicle Body Type'].unique())
# choose body type that most reflect general population
print(df['Vehicle Body Type'].value_counts())


['SDN' 'SUBN' 'VAN' 'FOUR' 'P-U' 'LIMO' 'TWOD' 'CONV' 'SADO' 'BLAC' 'OE'
 'DELV' 'DUMP' 'TAXI' 'MERC' 'BMW' 'OUE' '4DSD' 'OLE' 'SPOR' 'ONE' 'TRUC'
 'OLNE' 'OLME' 'OIE' 'MCY' 'SUV' 'SEDN' 'TT' 'OTSE' 'ONL' 'CNV' 'PICK'
 'MOTO' 'ON' 'OME' 'BLK' 'NE' 'BUS' 'TRL' 'OTE' '2DSD' 'BUCK' 'OTLE'
 'TRLR' 'TRAC' 'I' 'HTCH' 'SEMI' 'OLSE' '4S' 'UT' 'COUP' 'H/WH' 'ATV'
 'UTIL' 'AMBU' 'LIM' 'CUST' 'SW' 'WAGN' '5D' '4D' 'SEDA' '2D' 'LL' 'Seda'
 '4H' 'SD' 'VN' 'HB' '2S' 'TK' 'PK' 'UV' 'Hatc' 'JP' 'SU' 'MP' 'MC' '4W'
 'HRSE' 'CV' '2DOO' 'UTPC' 'SV' 'LI' 'APUR' 'S' 'Coup' '2H' 'CG' 'CP'
 'H/IN' 'LM' '43' 'PV' 'WAGO' 'WG' '3D' 'ROVE' 'CN' 'PAS' 'TOW' 'EC'
 'MINI' 'Spor' 'PE' 'SULB' 'PKPC' 'BOAT' 'TR/C' 'FLAT' 'REFG' 'MOBL' 'N/S'
 'STAK' 'T/CR' 'TRAV' 'SWT' 'P/SH' 'MOPD' 'POLE' 'TRAI' 'TR' 'TRA' 'ACCO'
 'OLL' 'TLR' 'TOWT' 'FRTC' 'TRH' 'TRC' 'TRK' 'TRAT' 'TF' 'ALFA' 'CON' 'TL'
 'CXE' 'BT' 'LINC' 'BICK' 'POWE' 'MOT' 'HYBR' 'MOPE' 'BOX' 'PILO' 'MD'
 'EBIK' 'RANG' 'TRTR' 'E' 'MACK' 'COM' 'ANE' 'OINI' 'MEY' 'DEL

In [131]:
print(df['Vehicle Make'].unique(), len(df['Vehicle Make'].unique()))
# too many vehicle makers
# choose relevant vehicle makers that reflects general population

['TOYOT' 'JEEP' 'HYUND' ... 'RUGB' 'COU' 'MAGIR'] 1567


In [132]:
print(df['Violation Precinct'].unique(), len(df['Violation Precinct'].unique()))
# prob nothing to be done here

[ 10   1   5   6  45 100 101 122 123 120  84  75  42  46  44  41  52  40
 113  76 112  72  69 103 110 105  62  14  25 108  63  67  60  23 109 106
  48  47  43  90   4   9  66  32  83   0 114  49  78  64  13 115  94  18
 111  88  73  30  68  61  24  34  50  99  70 107 121  17  28 119  19  20
  33  71  77 102  79   7  81  39  80 104  26  56 170  74  11  65  21 116
 117   8 168  38  29 200  37  22 136  85  35  51 144 272  53 163  36 805
 174  87 161 164 118   3  15  89 166 127 803 203  57  91 808   2  96  95
 160  93  16  97  54  31 171  82 933  58] 136


In [133]:
print(df['Violation Time'].unique(), len(df['Violation Time'].unique()))

# remove rows of string length violation time less than 5
df = df[df['Violation Time'].apply(lambda x: len(str(x)) == 5)]
print(df.shape)
# too many time, standardize time

# Parts of the Day

# Morning     5 am to 12 pm (noon)

# Afternoon     12 pm to 5 pm

# Evening     5 pm to 9 pm

# Night         9 pm to 4 am

# create a new column "Violation Day" iterate through each row and 
violation_time = df["Violation Time"]

violation_day = []
start_morning = 5
end_morning = 12

start_noon = 12
end_noon = 5 + 12

start_evening = 5 + 12
end_evening = 9 + 12


for i, time in enumerate(violation_time):
    hour = int(time[0:2])
    # edge case
    if(hour == 12 and time[4] == 'A'):
        hour = 0
    elif(hour != 12 and time[4] == 'P'):
        hour = hour + 12

    if hour >= start_morning and hour < end_morning:
        violation_day.append("Morning")
    elif hour >= start_noon and hour < end_noon:
        violation_day.append("Afternoon")
    elif hour >= start_evening and hour < end_evening:
        violation_day.append("Evening")
    else:
        violation_day.append("Night")
        
print("done")    
df["Violation Day"] = violation_day
#drop violation column, no longer need it
df = df.drop(columns="Violation Time")
print(df["Violation Day"].value_counts())

['1037A' '1045A' '1116A' ... '0012P' '01 6A' '6020P'] 1545
(5538714, 9)
done
Morning      2448237
Afternoon    1837223
Evening       645292
Night         607962
Name: Violation Day, dtype: int64


In [134]:
print(df['Violation County'].unique(), len(df['Violation County'].unique()))
# violation county has same county but different abbreivations

county_mapping = {'NY': 'NY','BX':'BX', 'Q':'QNS', 'R':'R', 'K':'K', 'MS': 'MS', 'BK': 'BK',
           'QN': 'QNS', 'MN': 'MN', 'Kings': 'K',
           'Qns': 'QNS', 'Bronx': 'BX', 'Rich': 'R', 'ST':'ST', 'QNS':'QNS', 'ABX': 'ABX'}

df['Violation County'] = df['Violation County'].map(county_mapping)
print(df['Violation County'].unique())
print(df['Violation County'].value_counts())
#drop 'MS' and 'ABX'
df = df[df['Violation County'] != 'MS']
df = df[df['Violation County'] != 'ABX']



['NY' 'BX' 'Q' 'R' 'K' 'MS' 'BK' 'QN' 'MN' 'Kings' 'Qns' 'Bronx' 'Rich'
 'ST' 'QNS' 'ABX'] 16
['NY' 'BX' 'QNS' 'R' 'K' 'MS' 'BK' 'MN' 'ST' 'ABX']
QNS    1509564
NY     1316070
K       782040
BX      754488
BK      744460
MN      227268
ST      150381
R        54441
MS           1
ABX          1
Name: Violation County, dtype: int64


In [135]:
print(df['Violation County'].value_counts())
print(df['Vehicle Color'].unique(), len(df['Vehicle Color'].unique()))
# on closer inspection
# color abbreivation is not standardized
# so we see same colors with different abbreivation
# transform this

QNS    1509564
NY     1316070
K       782040
BX      754488
BK      744460
MN      227268
ST      150381
R        54441
Name: Violation County, dtype: int64
['BLK' 'GRAY' 'BLUE' 'WHITE' 'RED' 'GY' 'WHT' 'BLACK' 'GL' 'SILVE' 'GREY'
 'BL' 'TAN' 'WH' 'BLU' 'BRO' 'GRY' 'ORANG' 'PURPL' 'SLV' 'SIL' 'WT' 'ORG'
 'SUILV' 'BEIGE' 'BRN' 'GREEN' 'BK' 'GR' 'GOLD' 'W' 'NAVY' 'SILV' 'R' 'G'
 'WHI' 'BUR' 'BROW' 'B' 'BROWN' 'SI' 'RD' 'BKL' 'MAROO' 'SL' 'YLLW' 'GRV'
 'GRN' 'GLD' 'SV' 'SLVR' 'BURG' 'SW' 'TN' 'WHTE' 'BN' 'GRAN' 'BR' 'WHIT'
 'PINK' 'GARY' 'GN' 'BRW' 'M' 'BEGE' 'Gray' 'YELLO' 'PURP' 'BLCK' 'WTE'
 'OR' 'T' 'BLA' 'SILVR' 'BW' 'BLAAC' 'BLACJ' 'MRN' 'PK' 'YEL' '.' 'SN'
 'YE' 'BKUE' 'RM' 'MARON' 'GKD' 'BG' 'LTGY' 'YELL' 'BLC' 'OLIV' 'WI'
 'BLAC' 'GRA' 'BCK' 'TEAL' 'BLAK' 'GLN' 'BWN' 'GRE' 'SIVR' 'BIGE' 'BURGU'
 'ON' 'GD' 'MN' 'PR' 'BRWN' 'PW' 'BURGA' 'BLE' 'NLK' 'SLIVE' 'SILVA' 'MR'
 'GREN' 'WHE' 'DK/GY' 'BLGY' 'LT/GR' 'DKRD' 'YW' 'DKGY' 'LTBK' 'LT/GY'
 'BL/BK' 'DK/BL' 'WHTN' 'DK/PR' 'DKGR' 'WHB

In [136]:
# transformation to be done
# select data that reflects general population
# this includes
# plate type
# violation type
# vehicle maker
# color -> choose color that we could easily translate
# 
print(df['Vehicle Color'].value_counts())

GY       1202022
WH       1096921
BK       1054257
BL        403809
WHITE     357639
          ...   
WHM            1
GLOR           1
GYJUU          1
+++++          1
BK0            1
Name: Vehicle Color, Length: 748, dtype: int64


In [137]:
# write a funciton to get the counts + quartile + iqr
# find lower threshold remove it
# apply function on each of the data
# analyze data
