In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import time
import tensorflow as tf

# Read our New York registrations Dataset

In [2]:
# Load the raw NYS electric-vehicle registration export and preview the first rows.
NYS_registrations_df = pd.read_csv(
    filepath_or_buffer='Resources/NYS Electric Vehicle Registrations.csv',
)
NYS_registrations_df.head(5)

Unnamed: 0,Record Type,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,Unladen Weight,Maximum Gross Weight,Passengers,Reg Valid Date,Reg Expiration Date,Color,Scofflaw Indicator,Suspension Indicator,Revocation Indicator
0,VEH,96722014661,SRF,BOVINA,NY,13740,DELAWARE,1972,SAAB,2DSD,ELECTRIC,2000.0,,,05/04/2021,06/26/2023,WH,N,N,N
1,VEH,9106843,SRF,GLENDALE,NY,11385,QUEENS,1974,EVA,4DSD,ELECTRIC,2180.0,,,09/21/2021,10/03/2023,BR,N,N,N
2,VEH,909SR2223A,SRF,STAFFORD,NY,14143,GENESEE,1979,CITIC,2DSD,ELECTRIC,850.0,,,05/12/2021,04/01/2023,GR,N,N,N
3,VEH,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,VANTA,PICK,ELECTRIC,1900.0,,,07/01/2014,,WH,N,N,N
4,VEH,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,CROWN,P/SH,ELECTRIC,3335.0,,,12/09/2021,12/31/2022,WH,N,N,N


In [3]:
# List the column labels of the loaded frame to decide which ones to keep.
NYS_registrations_df.columns

Index(['Record Type', 'VIN', 'Registration Class', 'City', 'State', 'Zip',
       'County', 'Model Year', 'Make', 'Body Type', 'Fuel Type',
       'Unladen Weight', 'Maximum Gross Weight', 'Passengers',
       'Reg Valid Date', 'Reg Expiration Date', 'Color', 'Scofflaw Indicator',
       'Suspension Indicator', 'Revocation Indicator'],
      dtype='object')

In [4]:
# Discard the fields that are not used in the rest of the notebook.
drop_cols = [
    'Record Type',
    'Unladen Weight',
    'Maximum Gross Weight',
    'Passengers',
    'Reg Expiration Date',
    'Color',
    'Scofflaw Indicator',
    'Suspension Indicator',
    'Revocation Indicator',
]
# Rebind to the reduced frame rather than mutating the loaded one in place.
NYS_registrations_df = NYS_registrations_df.drop(columns=drop_cols)
NYS_registrations_df.head()

Unnamed: 0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,Reg Valid Date
0,96722014661,SRF,BOVINA,NY,13740,DELAWARE,1972,SAAB,2DSD,ELECTRIC,05/04/2021
1,9106843,SRF,GLENDALE,NY,11385,QUEENS,1974,EVA,4DSD,ELECTRIC,09/21/2021
2,909SR2223A,SRF,STAFFORD,NY,14143,GENESEE,1979,CITIC,2DSD,ELECTRIC,05/12/2021
3,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,VANTA,PICK,ELECTRIC,07/01/2014
4,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,CROWN,P/SH,ELECTRIC,12/09/2021


In [5]:
# Registration count: number of non-null VIN entries in the frame
Registration = NYS_registrations_df['VIN'].count()
Registration

62063

In [6]:
# Parse the registration-date strings into datetime values.
# The file stores them month-first as MM/DD/YYYY (e.g. "09/21/2021"), so the
# format is stated explicitly instead of leaving pandas to infer it: this rules
# out any day/month ambiguity and fails loudly if a row deviates from the format.
NYS_registrations_df['Reg Valid Date'] = pd.to_datetime(
    NYS_registrations_df['Reg Valid Date'],
    format='%m/%d/%Y',
)
NYS_registrations_df.head()

Unnamed: 0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,Reg Valid Date
0,96722014661,SRF,BOVINA,NY,13740,DELAWARE,1972,SAAB,2DSD,ELECTRIC,2021-05-04
1,9106843,SRF,GLENDALE,NY,11385,QUEENS,1974,EVA,4DSD,ELECTRIC,2021-09-21
2,909SR2223A,SRF,STAFFORD,NY,14143,GENESEE,1979,CITIC,2DSD,ELECTRIC,2021-05-12
3,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,VANTA,PICK,ELECTRIC,2014-07-01
4,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,CROWN,P/SH,ELECTRIC,2021-12-09


In [7]:
# Derive a "MM-YYYY" label from the registration date (it is used when looking
# for duplicates), then make the registration date itself the row index.
NYS_registrations_df = (
    NYS_registrations_df
    .assign(**{'month/year': lambda frame: frame['Reg Valid Date'].dt.strftime("%m-%Y")})
    .set_index('Reg Valid Date')
)
NYS_registrations_df.head()

Unnamed: 0_level_0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,month/year
Reg Valid Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-05-04,96722014661,SRF,BOVINA,NY,13740,DELAWARE,1972,SAAB,2DSD,ELECTRIC,05-2021
2021-09-21,9106843,SRF,GLENDALE,NY,11385,QUEENS,1974,EVA,4DSD,ELECTRIC,09-2021
2021-05-12,909SR2223A,SRF,STAFFORD,NY,14143,GENESEE,1979,CITIC,2DSD,ELECTRIC,05-2021
2014-07-01,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,VANTA,PICK,ELECTRIC,07-2014
2021-12-09,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,CROWN,P/SH,ELECTRIC,12-2021


# Looking for duplicates

In [10]:
# Rows that repeat an earlier row across all columns of the data set
# (only column values are compared; the date index is not part of the check).
duplicates = NYS_registrations_df.loc[NYS_registrations_df.duplicated(keep='first')]
duplicates.count()

VIN                   0
Registration Class    0
City                  0
State                 0
Zip                   0
County                0
Model Year            0
Make                  0
Body Type             0
Fuel Type             0
month/year            0
dtype: int64

In [11]:
# Inspect rows whose City value already appeared in an earlier row
NYS_registrations_df.loc[NYS_registrations_df.duplicated(subset='City')]

Unnamed: 0_level_0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,month/year
Reg Valid Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-01-31,7SAYGDEF9NF365467,PAS,BROOKLYN,NY,11249,KINGS,2022,TESLA,SUBN,ELECTRIC,01-2022
2022-01-12,7SAYGDEF9NF337765,OMT,BROOKLYN,NY,11229,KINGS,2022,TESLA,SUBN,ELECTRIC,01-2022
2021-12-15,7SAYGDEF9NF328094,PAS,BRONX,NY,10465,BRONX,2022,TESLA,SUBN,ELECTRIC,12-2021
2021-12-18,7SAYGDEF9NF326491,PAS,NEW YORK,NY,10025,NEW YORK,2022,TESLA,SUBN,ELECTRIC,12-2021
2022-01-18,7SAYGDEF9NF325096,PAS,STATEN ISLAND,NY,10305,RICHMOND,2022,TESLA,SUBN,ELECTRIC,01-2022
...,...,...,...,...,...,...,...,...,...,...,...
2021-10-27,A976B08198T,SPC,BROOKLYN,NY,11232,KINGS,2019,YALE,SWT,ELECTRIC,10-2021
2021-10-27,A976B07325S,SPC,BROOKLYN,NY,11232,KINGS,2018,YALE,P/SH,ELECTRIC,10-2021
2021-10-27,A976B06198R,SPC,BROOKLYN,NY,11232,KINGS,2017,YALE,P/SH,ELECTRIC,10-2021
2014-07-18,A969N02801M,PSD,CORONA,NY,11368,QUEENS,2014,YALE,P/SH,ELECTRIC,07-2014


In [13]:
# Inspect every row that shares its (month/year, VIN, County) combination with
# at least one other row; keep=False flags all members of a group, not just repeats.
inspection = (
    NYS_registrations_df
    .loc[NYS_registrations_df.duplicated(subset=['month/year', 'VIN', 'County'], keep=False)]
    .sort_values(by='VIN')
)
inspection.count()

VIN                   0
Registration Class    0
City                  0
State                 0
Zip                   0
County                0
Model Year            0
Make                  0
Body Type             0
Fuel Type             0
month/year            0
dtype: int64

In [15]:
# Inspect every row that shares its (month/year, VIN) pair with another row,
# ordered by VIN so that members of the same group sit next to each other.
inspect_date_vin = (
    NYS_registrations_df
    .loc[NYS_registrations_df.duplicated(subset=['month/year', 'VIN'], keep=False)]
    .sort_values(by='VIN')
)
inspect_date_vin.count()

VIN                   0
Registration Class    0
City                  0
State                 0
Zip                   0
County                0
Model Year            0
Make                  0
Body Type             0
Fuel Type             0
month/year            0
dtype: int64

# Addressing potential Null Values

In [16]:
# Count missing values per column
NYS_registrations_df.isnull().sum()

VIN                   0
Registration Class    0
City                  0
State                 0
Zip                   0
County                0
Model Year            0
Make                  0
Body Type             0
Fuel Type             0
month/year            0
dtype: int64

In [17]:
# List the distinct raw Make codes to spot misspellings and aliases.
NYS_registrations_df['Make'].unique()

array(['SAAB', 'EVA', 'CITIC', 'VANTA', 'CROWN', 'DE/EL', 'MANIT',
       'HONDA', 'TESLA', 'AMPER', 'PROTE', 'VOLVO', 'CENNT', 'ARCIM',
       'RIVIA', 'TOYOT', 'WORKH', 'GENZE', 'LEXUS', 'ALTA', 'ZERO',
       'ZE/MO', 'AZURE', 'FORD', 'THINK', 'GREEN', 'SCOUT', 'NAVIS',
       'INTER', 'STAR', 'POLAR', 'GEM', 'GLOBA', 'LINDE', 'BRAMM',
       'AM/GE', 'KARMA', 'KARM', 'LUCID', 'CRUS', 'MOKE', 'CR/CA', 'TESL',
       'BMW', 'TO/OU', 'ACURA', 'CLUBC', 'CL/CA', 'CLUB', 'PATH', 'ZO/EL',
       'NEWFL', 'NE/FL', 'COLUM', 'TOMBE', 'SPARK', 'SMITH', 'UTILI',
       'NOVA', 'NOVB', 'NO/BU', 'IC', 'CR/CO', 'MITSU', 'LINCO', 'RAM',
       'ZENIT', 'ZENTI', 'FIAT', 'PIAGG', 'CHEVR', 'GMC', 'ZENN', 'CHRYS',
       'LION', 'GILLI', 'DETRO', 'VOLKS', 'ASVE', 'EA/BE', 'COM', 'COMUT',
       'CO/CA', 'KALMA', 'AMPHI', 'CRAYL', 'CUSHM', 'EZ/GO', 'OHAWA',
       'ORANG', 'ORAN', 'ZELEC', 'ZEV', 'ORION', 'OROIN', 'ORON', 'IRION',
       'OTION', 'NISSA', 'PRO', 'JOHND', 'JO/DE', 'LI/SE', 'STING',
    

In [18]:
# Number of distinct Make codes before any grouping (missing values would count as one)
NYS_registrations_df['Make'].nunique(dropna=False)

197

In [19]:
# Some Make codes are misspelled or are a different name for the same
# manufacturer, so each group of raw codes is collapsed onto one OEM label.
#
# The grouping is written as {OEM label: [raw codes]} and applied in a single
# pass. The result is assigned back to the column instead of calling
# ``df['Make'].replace(..., inplace=True)``: that form is chained assignment
# through an inplace method, which pandas warns about and which no longer
# updates the frame once Copy-on-Write is active.
#
# NOTE(review): the groupings are carried over unchanged from the earlier
# one-call-per-group version; they have not been validated against an
# authoritative list of manufacturers.
make_groups = {
    'TOYOTA': ['CROWN', 'TOYOT'],
    # 'IRION' used to be listed under NOVA BUS as well, but the FORD replacement
    # ran first and consumed it, so the second listing never had any effect.
    # It is kept under FORD only, which preserves the previous results.
    'FORD': ['ORION', 'OTION', 'OROIN', 'IRION', 'ORON', 'ENERG'],
    'NOVA BUS': ['NOVB', 'NOVA'],
    'NEW FLYER': ['NEWFL', 'NE/FL'],
    'DOOHAN': ['DOOH', 'DOOHA'],
    'JAGUAR': ['JAGUA', 'JAG'],
    'ORANGE EV': ['ORAN', 'ORANG'],
    'MILES EV': ['MILE', 'MILES'],
    'GENESIS': ['GENU', 'GENUI'],
    'KARMA': ['KARM', 'FISKE', 'KALMA'],
    'PIAGGIO': ['PIAGG', 'VESPA'],
    'ZERO EV': ['ZERO', 'ZE/MO'],
    'POLESTAR': ['POLRD', 'POLAR', 'POLES', 'POLE'],
    'HONDA': ['CITIC', 'CITC'],
    'VOLKSWAGEN': ['VOLKS', 'SCOUT'],
    'CHRYSLER': ['CHRIS', 'CHRYS', 'CRAYL'],
    'SUBARU': ['SUBAR'],
    'TESLA': ['TESL'],
    'FREIGHT LINER': ['FREIG'],
    'PORSCHE': ['PORSC'],
    'NIU': ['NIUU'],
    'LINCOLN': ['LINCO'],
    'BMW': ['VANTA'],
    'CAKE': ['CASE'],
    'NISSAN': ['NISSA'],
    'SUZUKI': ['SUZUK'],
    'RIVIAN': ['RIVIA'],
    'GREENPOWER MOTORS': ['GREEN'],
    'HYUNDAI': ['HYUND'],
    'CENNTRO': ['CENNT'],
    'PROTERRA': ['PROTE'],
    'INTERNATIONAL': ['INTER', 'STAR', 'OTHER'],
    'MITSUBISHI': ['MITSU', 'FUSO'],
    'ICBUS': ['IC', 'ICE', 'IC/PN'],
    'ZELECTRIC MOTORS': ['ZELEC'],
    'KAWASAKI': ['KAWAS'],
    'EVI': ['EVA'],
    'EVT': ['EVTA'],
    'CHEVROLET': ['CHEVR'],
}

# Invert the grouping into a flat {raw code: OEM label} lookup.
make_lookup = {
    raw_code: oem_label
    for oem_label, raw_codes in make_groups.items()
    for raw_code in raw_codes
}

# A raw code listed under two OEMs would be silently overwritten while building
# the lookup; fail loudly instead.
assert len(make_lookup) == sum(len(raw_codes) for raw_codes in make_groups.values()), \
    "a raw Make code is listed under more than one OEM"

# No OEM label is itself a raw code, so a single simultaneous replacement gives
# the same result as applying the groups one after another.
NYS_registrations_df['Make'] = NYS_registrations_df['Make'].replace(make_lookup)

In [20]:
# Re-list the distinct Make labels to check the effect of the grouping.
NYS_registrations_df['Make'].unique()

array(['SAAB', 'EVI', 'HONDA', 'BMW', 'TOYOTA', 'DE/EL', 'MANIT', 'TESLA',
       'AMPER', 'PROTERRA', 'VOLVO', 'CENNTRO', 'ARCIM', 'RIVIAN',
       'WORKH', 'GENZE', 'LEXUS', 'ALTA', 'ZERO EV', 'AZURE', 'FORD',
       'THINK', 'GREENPOWER MOTORS', 'VOLKSWAGEN', 'NAVIS',
       'INTERNATIONAL', 'POLESTAR', 'GEM', 'GLOBA', 'LINDE', 'BRAMM',
       'AM/GE', 'KARMA', 'LUCID', 'CRUS', 'MOKE', 'CR/CA', 'TO/OU',
       'ACURA', 'CLUBC', 'CL/CA', 'CLUB', 'PATH', 'ZO/EL', 'NEW FLYER',
       'COLUM', 'TOMBE', 'SPARK', 'SMITH', 'UTILI', 'NOVA BUS', 'NO/BU',
       'ICBUS', 'CR/CO', 'MITSUBISHI', 'LINCOLN', 'RAM', 'ZENIT', 'ZENTI',
       'FIAT', 'PIAGGIO', 'CHEVROLET', 'GMC', 'ZENN', 'CHRYSLER', 'LION',
       'GILLI', 'DETRO', 'ASVE', 'EA/BE', 'COM', 'COMUT', 'CO/CA',
       'AMPHI', 'CUSHM', 'EZ/GO', 'OHAWA', 'ORANGE EV',
       'ZELECTRIC MOTORS', 'ZEV', 'NISSAN', 'PRO', 'JOHND', 'JO/DE',
       'LI/SE', 'STING', 'HA/DA', 'LIVEW', 'MEVH', 'CADIL',
       'FREIGHT LINER', 'TRANT', 'EGO', 'ETU

In [21]:
# Distinct Make labels after grouping: a drop from 197 to 162
NYS_registrations_df['Make'].nunique(dropna=False)

162

In [22]:
# Display the frame (pandas truncates the rendering) with the normalised Make labels.
NYS_registrations_df

Unnamed: 0_level_0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,month/year
Reg Valid Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-05-04,96722014661,SRF,BOVINA,NY,13740,DELAWARE,1972,SAAB,2DSD,ELECTRIC,05-2021
2021-09-21,9106843,SRF,GLENDALE,NY,11385,QUEENS,1974,EVI,4DSD,ELECTRIC,09-2021
2021-05-12,909SR2223A,SRF,STAFFORD,NY,14143,GENESEE,1979,HONDA,2DSD,ELECTRIC,05-2021
2014-07-01,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,BMW,PICK,ELECTRIC,07-2014
2021-12-09,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,TOYOTA,P/SH,ELECTRIC,12-2021
...,...,...,...,...,...,...,...,...,...,...,...
2021-10-27,A976B08198T,SPC,BROOKLYN,NY,11232,KINGS,2019,YALE,SWT,ELECTRIC,10-2021
2021-10-27,A976B07325S,SPC,BROOKLYN,NY,11232,KINGS,2018,YALE,P/SH,ELECTRIC,10-2021
2021-10-27,A976B06198R,SPC,BROOKLYN,NY,11232,KINGS,2017,YALE,P/SH,ELECTRIC,10-2021
2014-07-18,A969N02801M,PSD,CORONA,NY,11368,QUEENS,2014,YALE,P/SH,ELECTRIC,07-2014


In [23]:
# Registrations per Make, most frequent first
NYS_registrations_df.Make.value_counts()

TESLA        37137
NIU           6918
HYUNDAI       3130
FORD          3082
CHEVROLET     3044
             ...  
CUSHM            1
AMPHI            1
CO/CA            1
COMUT            1
TCM              1
Name: Make, Length: 162, dtype: int64

In [24]:
# Inspect the rows whose Make is YALE to judge whether they are actually cars.
# NOTE(review): the registration class SPC was previously read as "self parking
# cars" (https://www.allacronyms.com/SPC/car), but the NYS DMV registration-class
# list appears to define SPC as "Special Purpose Commercial" - confirm against
# the DMV code list before relying on either reading.
NYS_registrations_df.loc[NYS_registrations_df['Make'].eq('YALE')]

Unnamed: 0_level_0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,month/year
Reg Valid Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-12-09,N336261,SPC,HAUPPAUGE,NY,11788,SUFFOLK,1978,YALE,SWT,ELECTRIC,12-2021
2021-11-21,N30346,SPC,NEW YORK,NY,10002,NEW YORK,1972,YALE,P/SH,ELECTRIC,11-2021
2021-12-14,G807N09683P,SPC,BROOKLYN,NY,11218,KINGS,2016,YALE,P/SH,ELECTRIC,12-2021
2021-11-24,G807N09057N,SPC,BROOKLYN,NY,11206,KINGS,2015,YALE,P/SH,ELECTRIC,11-2021
2021-10-29,F807N04372D,SPC,ASTORIA,NY,11105,QUEENS,2006,YALE,SWT,ELECTRIC,10-2021
2021-10-29,E216A01721E,SPC,ASTORIA,NY,11105,QUEENS,2007,YALE,SWT,ELECTRIC,10-2021
2021-10-29,E216A01715E,SPC,ASTORIA,NY,11105,QUEENS,2007,YALE,SWT,ELECTRIC,10-2021
2021-12-14,D875V06421P,SPC,BROOKLYN,NY,11218,KINGS,2016,YALE,P/SH,ELECTRIC,12-2021
2021-12-14,D875V06225P,SPC,BROOKLYN,NY,11218,KINGS,2016,YALE,P/SH,ELECTRIC,12-2021
2021-12-14,D875V05953P,SPC,BROOKLYN,NY,11218,KINGS,2016,YALE,P/SH,ELECTRIC,12-2021


In [25]:
# For this analysis, keep only vehicles whose Model Year is 2012 or later,
# which gives about a decade of data.
NYS_registrations_df = NYS_registrations_df.loc[NYS_registrations_df['Model Year'].ge(2012)]
NYS_registrations_df

Unnamed: 0_level_0,VIN,Registration Class,City,State,Zip,County,Model Year,Make,Body Type,Fuel Type,month/year
Reg Valid Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-07-01,9E2SPAXBC113501,PSD,JOHNSTOWN,NY,12095,FULTON,2012,BMW,PICK,ELECTRIC,07-2014
2021-12-09,9A204759,SPC,BROOKLYN,NY,11211,KINGS,2014,TOYOTA,P/SH,ELECTRIC,12-2021
2021-11-16,9A19709,SPC,FARMINGDALE,NY,11735,SUFFOLK,2013,TOYOTA,SWT,ELECTRIC,11-2021
2021-10-19,854087,SPC,DEPEW,NY,14043,ERIE,2016,MANIT,SWT,ELECTRIC,10-2021
2021-08-11,7.85E+14,LMB,STATEN ISLAND,NY,10302,RICHMOND,2019,HONDA,MOPD,ELECTRIC,08-2021
...,...,...,...,...,...,...,...,...,...,...,...
2022-01-24,A976B09580U,SPC,BROOKLYN,NY,11232,KINGS,2020,YALE,P/SH,ELECTRIC,01-2022
2021-10-27,A976B08198T,SPC,BROOKLYN,NY,11232,KINGS,2019,YALE,SWT,ELECTRIC,10-2021
2021-10-27,A976B07325S,SPC,BROOKLYN,NY,11232,KINGS,2018,YALE,P/SH,ELECTRIC,10-2021
2021-10-27,A976B06198R,SPC,BROOKLYN,NY,11232,KINGS,2017,YALE,P/SH,ELECTRIC,10-2021


In [32]:
# Number of distinct non-null values in each column
NYS_registrations_df.nunique(axis=0)

VIN                   59955
Registration Class       38
City                   1956
State                    38
Zip                    1643
County                   63
Model Year               11
Make                    125
Body Type                22
Fuel Type                 1
month/year               87
dtype: int64

In [26]:
# Registration count: number of non-null VIN entries in the current frame
Registration = NYS_registrations_df['VIN'].count()
Registration

59955

In [27]:
# Registrations per Make, most frequent first
Make_counts = NYS_registrations_df['Make'].value_counts()
Make_counts

TESLA        37113
NIU           6916
HYUNDAI       3130
CHEVROLET     3038
NISSAN        2232
             ...  
FLYWI            1
LINCOLN          1
MOFFE            1
CR/CO            1
DOOSA            1
Name: Make, Length: 125, dtype: int64

In [28]:
# Registrations per Model Year, most frequent first
Model_year_counts = NYS_registrations_df.loc[:, 'Model Year'].value_counts()
Model_year_counts

2021    23413
2020    13130
2019     5978
2018     5863
2022     5415
2017     2320
2016     1527
2015      928
2014      562
2013      551
2012      268
Name: Model Year, dtype: int64

In [29]:
# Registrations per Registration Class, most frequent first
Regist_class_counts = NYS_registrations_df.loc[:, 'Registration Class'].value_counts()
Regist_class_counts

PAS    45616
LMB     6915
SRF     2907
PSD     1125
OMT      759
MED      531
ORG      482
MOT      373
LUA      354
OMS      230
COM      144
RGL      144
SPO      104
STA       51
OMR       37
HAM       29
ITP       28
SPC       24
VAS       18
ATV       15
BOB       11
LMC       10
SRN       10
TRC        8
LMA        7
OML        5
ARG        4
OMV        2
ORM        2
SUP        2
JCA        1
SOS        1
STG        1
NYA        1
AGR        1
SCL        1
AYG        1
JSC        1
Name: Registration Class, dtype: int64

In [30]:
# Registrations per City, most frequent first
City_counts = NYS_registrations_df['City'].value_counts()
City_counts

BROOKLYN         10075
NEW YORK          3017
STATEN ISLAND     1270
ROCHESTER         1113
BRONX              830
                 ...  
STERLINGTON          1
MELLENVILLE          1
WYANTSKILL           1
SHERMAN              1
BROOKLNY             1
Name: City, Length: 1956, dtype: int64

In [31]:
# Registrations per County, most frequent first
County_counts = NYS_registrations_df['County'].value_counts()
County_counts

KINGS          10212
NASSAU          8531
SUFFOLK         7369
WESTCHESTER     7170
QUEENS          3883
               ...  
SENECA            31
ALLEGANY          23
WYOMING           19
LEWIS             10
HAMILTON           6
Name: County, Length: 63, dtype: int64

In [33]:
# Save the cleaned dataset; the index is written too (to_csv default), so the
# output file's first column holds the frame's index values.
NYS_registrations_df.to_csv(path_or_buf='Resources/NYS_registrations_clean.csv')