# Initialization

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import plotly.express as px
from IPython.display import display

## Load Data

In [2]:
# Load the vehicles dataset into a dataframe.
# Prefer the local copy; fall back to the hosted copy only when the local file
# is missing, so that any other failure (e.g. a malformed file) is not hidden.
try:
    vehicles_df = pd.read_csv('vehicles_us.csv')
except FileNotFoundError:
    vehicles_df = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/vehicles_us.csv')

# EDA - Exploratory Data Analysis

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
1. Review uploaded `vehicles_us.csv` file
2. Check for missing values, `Nulls`, `NaN` (*Not a Number*)

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- 51525 rows with 13 columns
- missing values, `Nulls`, & `NaNs` found in the following (5) attributes:
    1. `model_year` (3619)
    2. `cylinders` (5260)
    3. `odometer` (7892)
    4. `paint_color` (9267)
    5. `is_4wd` (25953)
- `is_4wd` column is boolean logic (1 = Y vs. 0 = N)
    - `NaN` found and may need to determine if these will be zero (0 = No)
- found several ford model names that can be consolidated
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
    - keep `f-###` model name format 
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- will need to parse out the manufacturer and vehicle model from `model` column
    - *see code shared by Jester* 
- `date_posted` column as `object` datatype
    - convert to `datetime` datatype
- `model_year` column as `float` datatype
    - convert to `datetime` datatype for ease of data visualization
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>



In [3]:
# EDA: preview a random selection of (60) rows
# `random_state` pins the selection so the same rows are shown on every run
display(vehicles_df.sample(60, random_state=42))
# alternative previews: vehicles_df.head(60) / vehicles_df.tail(60)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
28136,1000,2001.0,ford explorer,fair,6.0,gas,234000.0,automatic,truck,green,1.0,2019-04-10,63
44029,3995,2001.0,ford f-150,good,8.0,gas,207269.0,automatic,truck,black,,2019-01-19,116
26428,10900,2006.0,toyota 4runner,excellent,8.0,gas,170215.0,automatic,wagon,,1.0,2019-02-17,197
7413,4995,2009.0,ford focus,good,,gas,93415.0,automatic,sedan,grey,,2019-04-18,59
38035,14500,2017.0,chevrolet silverado,like new,6.0,gas,84000.0,automatic,truck,white,,2018-08-16,36
20466,6600,2012.0,nissan rogue,excellent,4.0,gas,143000.0,automatic,SUV,white,1.0,2018-05-13,3
29720,13995,2016.0,nissan altima,excellent,6.0,gas,5500.0,automatic,sedan,white,,2019-02-26,20
21,5250,2007.0,toyota rav4,good,6.0,gas,154000.0,automatic,SUV,,,2018-08-22,8
41669,5995,2006.0,ford f-150,good,6.0,gas,108000.0,automatic,pickup,white,,2018-06-29,40
50323,19995,2006.0,chevrolet silverado 2500hd,good,8.0,gas,,automatic,truck,grey,1.0,2018-12-08,58


In [4]:
# EDA: check for nulls/NaN/missing values

# column dtypes and non-null counts (printed by `.info()` itself)
vehicles_df.info()
print('\n\n')

# per-column tally of missing values, nulls, NaNs
missing_counts = vehicles_df.isna().sum()
print('Missing Value / Null / NaN Counts')
display(missing_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB



Missing Value / Null / NaN Counts


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [5]:
# EDA: view the unique values and their counts in the following columns

# (column name, keyword arguments passed to `.value_counts()`), in display order
value_count_specs = [
    ('model_year', {'sort': True}),
    ('model', {'sort': True}),
    ('condition', {'sort': True}),
    ('cylinders', {'sort': True}),
    ('fuel', {'sort': True}),
    ('transmission', {'sort': False}),
    ('type', {'sort': True}),
    ('paint_color', {}),
    ('is_4wd', {'sort': True}),
]

for column_name, count_kwargs in value_count_specs:
    display(vehicles_df[column_name].value_counts(**count_kwargs))
    print()

model_year
2013.0    3549
2012.0    3468
2014.0    3448
2011.0    3375
2015.0    3323
          ... 
1948.0       1
1961.0       1
1936.0       1
1949.0       1
1929.0       1
Name: count, Length: 68, dtype: int64




model
ford f-150                           2796
chevrolet silverado 1500             2171
ram 1500                             1750
chevrolet silverado                  1271
jeep wrangler                        1119
                                     ... 
ford f-250 super duty                 241
acura tl                              236
kia sorento                           236
nissan murano                         235
mercedes-benz benze sprinter 2500      41
Name: count, Length: 100, dtype: int64




condition
excellent    24773
good         20145
like new      4742
fair          1607
new            143
salvage        115
Name: count, dtype: int64




cylinders
8.0     15844
6.0     15700
4.0     13864
10.0      549
5.0       272
3.0        34
12.0        2
Name: count, dtype: int64




fuel
gas         47288
diesel       3714
hybrid        409
other         108
electric        6
Name: count, dtype: int64




transmission
automatic    46902
manual        2829
other         1794
Name: count, dtype: int64




type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64




paint_color
white     10029
black      7692
silver     6244
grey       5037
blue       4475
red        4421
green      1396
brown      1223
custom     1153
yellow      255
orange      231
purple      102
Name: count, dtype: int64




is_4wd
1.0    25572
Name: count, dtype: int64




In [6]:
# EDA: view the unique values and counts in the 'type' column
# Note: draft grouping of vehicle types, to be used with `.isin()` when reviewing NaN in 'is_4wd'
# no_4wd_nan = [sedan, coupe, wagon, mini-van, hatchback, van, convertible, other, bus]
# yes_4wd_nan = [SUV, truck, pickup, offroad]

type_counts = vehicles_df['type'].value_counts(sort=True)
display(type_counts)

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

### <span style="color: gold;">EDA: Missing Values (*Null / NaN*)</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- deep dive review of all columns with missing values
- determine how to either replace or fill in the missing values

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN

In [7]:
# EDA: review rows with missing values

# True for every row that has at least one NaN in any column
has_missing_value = vehicles_df.isna().any(axis=1)
display(vehicles_df[has_missing_value].head(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17
11,8990,2012.0,honda accord,excellent,4.0,gas,111142.0,automatic,sedan,grey,,2019-03-28,29
12,18990,2012.0,ram 1500,excellent,8.0,gas,140742.0,automatic,pickup,,1.0,2019-04-02,37


#### <span style="color: gold;">`'is_4wd'` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review & determine if the `NaN` in `is_4wd` column should be zero (`0` = not 4wd vehicle)



In [8]:
# `is_4wd`: collect the rows where the value is NaN
rows_with_nan = vehicles_df[vehicles_df['is_4wd'].isna()]

# preview (60) of those rows; `random_state` keeps the preview identical on every run
display(rows_with_nan.sample(60, random_state=42))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
39068,7950,2012.0,volkswagen jetta,excellent,4.0,diesel,62000.0,manual,sedan,silver,,2018-10-15,44
21093,8995,2014.0,toyota camry,excellent,4.0,gas,80000.0,automatic,sedan,black,,2019-01-12,2
32279,16990,2008.0,ford econoline,good,8.0,gas,104738.0,automatic,truck,white,,2018-10-31,29
12440,9999,2013.0,toyota tacoma,excellent,4.0,gas,158025.0,automatic,pickup,white,,2018-06-07,95
9742,12500,2018.0,nissan altima,like new,,gas,320000.0,automatic,sedan,silver,,2018-08-14,75
9416,12995,2016.0,kia soul,excellent,4.0,gas,28207.0,automatic,SUV,,,2018-06-11,129
24500,5500,2005.0,nissan maxima,excellent,6.0,gas,107190.0,automatic,sedan,silver,,2019-02-24,39
4974,3750,2004.0,ford taurus,excellent,6.0,gas,130000.0,automatic,sedan,silver,,2018-08-21,31
6969,7988,2009.0,chevrolet trailblazer,excellent,6.0,gas,72800.0,automatic,SUV,white,,2018-11-11,50
25541,5750,,toyota corolla,like new,4.0,gas,,automatic,sedan,silver,,2019-02-05,30


In [9]:
# EDA: view the 'is_4wd' values for the vehicle types listed below
# OBJECTIVE: decide whether NaN in 'is_4wd' can be replaced with zero (0 = Non-4wd vehicle)

# vehicle types assumed not to have 4wd capability; used with `.isin()` to select rows for review
no_4wd_nan = ['sedan', 'coupe', 'mini-van', 'hatchback', 'van', 'convertible', 'other', 'bus']

# new dataframe to review rows (all rows of those types, sorted by model name)
sorted_non_4wd_df = vehicles_df[vehicles_df['type'].isin(no_4wd_nan)][['model_year', 'model', 'type', 'is_4wd']].sort_values(by='model')

# review top, bottom, and randomly selected (60) rows to have confidence to replace NaN with zero (0 = not 4wd capable) in 'is_4wd' column
display(sorted_non_4wd_df.head(60))
display(sorted_non_4wd_df.tail(60))
# `random_state` keeps the random selection identical on every run
display(sorted_non_4wd_df.sample(60, random_state=42))

Unnamed: 0,model_year,model,type,is_4wd
33040,2003.0,acura tl,sedan,
38743,2013.0,acura tl,sedan,
41145,2013.0,acura tl,sedan,
38760,2008.0,acura tl,sedan,
23124,2003.0,acura tl,sedan,
11355,2007.0,acura tl,sedan,
23110,2005.0,acura tl,sedan,
32707,2012.0,acura tl,sedan,
23106,2007.0,acura tl,sedan,
11413,2011.0,acura tl,sedan,1.0


Unnamed: 0,model_year,model,type,is_4wd
46569,2013.0,volkswagen passat,sedan,
42078,2014.0,volkswagen passat,sedan,
35025,2015.0,volkswagen passat,sedan,
11261,2004.0,volkswagen passat,sedan,
43403,2007.0,volkswagen passat,sedan,
14511,2013.0,volkswagen passat,sedan,
40209,2013.0,volkswagen passat,sedan,
27283,2008.0,volkswagen passat,hatchback,
48302,2015.0,volkswagen passat,sedan,
20865,2016.0,volkswagen passat,sedan,


Unnamed: 0,model_year,model,type,is_4wd
28808,2012.0,ford fusion,sedan,
34418,2014.0,volkswagen jetta,sedan,
15621,2018.0,honda civic lx,sedan,
24319,2018.0,chevrolet camaro lt coupe 2d,coupe,
3353,2015.0,volkswagen jetta,sedan,
46396,2018.0,nissan sentra,sedan,
4844,2010.0,chevrolet malibu,sedan,
46702,2011.0,chrysler 300,sedan,
24162,2015.0,ford mustang gt coupe 2d,coupe,
12251,2014.0,volkswagen jetta,sedan,


In [10]:
# EDA: view the 'is_4wd' values for the vehicle types listed below
# OBJECTIVE: decide whether NaN in 'is_4wd' can be replaced with one (1 = 4wd vehicle)

# vehicle types that may have 4wd capability; used with `.isin()` to select rows for review
yes_4wd_nan = ['SUV', 'truck', 'pickup', 'wagon', 'offroad']

# new dataframe to review rows (all rows of those types, sorted by model name)
sorted_yes_4wd_df = vehicles_df[vehicles_df['type'].isin(yes_4wd_nan)][['model_year', 'model', 'type', 'is_4wd']].sort_values(by='model')

# review top, bottom, and randomly selected (60) rows to have confidence to replace NaN with one (1 = 4wd capable) in 'is_4wd' column
display(sorted_yes_4wd_df.head(60))
display(sorted_yes_4wd_df.tail(60))
# `random_state` keeps the random selection identical on every run
display(sorted_yes_4wd_df.sample(60, random_state=42))

Unnamed: 0,model_year,model,type,is_4wd
8133,2007.0,acura tl,SUV,
7029,2007.0,acura tl,SUV,
8675,2007.0,acura tl,SUV,
16496,2008.0,bmw x5,SUV,1.0
16449,2010.0,bmw x5,SUV,1.0
29694,2010.0,bmw x5,SUV,1.0
29704,2011.0,bmw x5,SUV,1.0
43159,2012.0,bmw x5,SUV,1.0
29745,2012.0,bmw x5,SUV,1.0
43196,2011.0,bmw x5,SUV,1.0


Unnamed: 0,model_year,model,type,is_4wd
27439,2007.0,toyota tundra,truck,1.0
16725,2006.0,toyota tundra,truck,
2110,2014.0,toyota tundra,truck,1.0
31366,2000.0,toyota tundra,truck,
16724,2003.0,toyota tundra,pickup,
20875,2007.0,toyota tundra,truck,
46479,2001.0,toyota tundra,truck,1.0
31407,,toyota tundra,pickup,1.0
20549,2005.0,toyota tundra,pickup,1.0
40796,2006.0,toyota tundra,pickup,1.0


Unnamed: 0,model_year,model,type,is_4wd
35991,2010.0,ram 1500,truck,1.0
4101,2004.0,dodge dakota,pickup,1.0
26595,2011.0,ford escape,SUV,1.0
32129,2017.0,ford expedition,SUV,1.0
48142,2005.0,chevrolet silverado,truck,1.0
46787,,ram 1500,pickup,
3722,2014.0,jeep cherokee,SUV,1.0
7254,2016.0,ford f250,truck,1.0
46821,2009.0,honda cr-v,SUV,1.0
898,2014.0,chevrolet silverado 1500,pickup,1.0


#### <span style="color: gold;">`model_year` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review & determine how to replace the `NaN` with a year in `model_year` column
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN
- <>

In [11]:
# EDA: view all the unique values and counts in the 'model_year' column

# two-column dataframe ('model_year', 'count') holding one row per unique model year
model_year_counts_df = (
    vehicles_df["model_year"]
    .value_counts(sort=False)
    .rename_axis('model_year')
    .reset_index(name='count')
)

# newest model year first (use ascending=True for oldest first)
model_year_df_sorted = model_year_counts_df.sort_values(by='model_year', ascending=False)

# first (60) rows followed by the last (10) rows of the sorted counts
display(model_year_df_sorted.head(60), model_year_df_sorted.tail(10))

Unnamed: 0,model_year,count
18,2019.0,380
8,2018.0,2193
3,2017.0,2419
20,2016.0,2954
5,2015.0,3323
4,2014.0,3448
1,2013.0,3549
6,2012.0,3468
0,2011.0,3375
10,2010.0,2691


Unnamed: 0,model_year,count
63,1961.0,1
61,1960.0,3
66,1958.0,2
54,1955.0,1
58,1954.0,1
65,1949.0,1
60,1948.0,1
64,1936.0,1
67,1929.0,1
62,1908.0,2


### <span style="color: gold;">EDA: Duplicate names in `model` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review and check if there are duplicate/similar model names in `model` column
    - if found then will need to consolidate by replacing/changing the model names
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- found several ford model names that can be consolidated
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
    - keep `f-###` model name format 
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN

In [12]:
# EDA: view all the unique values and counts in the 'model' column
# found several ford model names that can be consolidated i.e. 'ford f250' with 'ford f-250'

# two-column dataframe ('model', 'count') holding one row per unique vehicle model
model_counts_df = (
    vehicles_df['model']
    .value_counts()
    .rename_axis('model')
    .reset_index(name='count')
)

# sort `model` column in alphabetical order
model_counts_df_sorted = model_counts_df.sort_values(by='model')

# first (60) rows followed by the last (42) rows of the alphabetical listing
display(model_counts_df_sorted.head(60), model_counts_df_sorted.tail(42))

Unnamed: 0,model,count
96,acura tl,236
85,bmw x5,267
84,buick enclave,271
65,cadillac escalade,322
49,chevrolet camaro,414
69,chevrolet camaro lt coupe 2d,311
79,chevrolet colorado,286
31,chevrolet corvette,499
40,chevrolet cruze,457
25,chevrolet equinox,591


Unnamed: 0,model,count
86,honda civic lx,262
18,honda cr-v,685
39,honda odyssey,457
73,honda pilot,302
46,hyundai elantra,423
83,hyundai santa fe,273
35,hyundai sonata,477
78,jeep cherokee,293
15,jeep grand cherokee,806
87,jeep grand cherokee laredo,256


## <span style="color: teal;">Feature Engineering (FE)</span>
<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- combined the following ford model names in `model` column
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
- renamed the following ford model names in `model` column
    - `ford f350 super duty` (counts=541) to `ford f-350 super duty`
    - `ford f150 supercrew cab xlt` (counts=327) to `ford f-150 supercrew cab xlt`
    - `ford f350` (counts=250) to `ford f-350`
- created two new columns, `make` and `model_name`, derived from `model`
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span> 
- for now...okay to keep the `date_posted` column as `object` datatype
    - will determine and return if needed to change to `datetime` datatype
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>


### <span style="color: green;">Combine & Rename ford model names in `model` column</span>

In [13]:
# FE: consolidate the ford model-name variants in the 'model' column (keep the `f-###` format)

# existing spelling -> name it is folded into; whole values are matched, not substrings
ford_model_renames = {
    'ford f150': 'ford f-150',                                      # (counts=530) into `ford f-150` (counts=2796)
    'ford f250': 'ford f-250',                                      # (counts=339) into `ford f-250` (counts=422)
    'ford f-250 sd': 'ford f-250 super duty',                       # (counts=426) into `ford f-250 super duty` (counts=241)
    'ford f250 super duty': 'ford f-250 super duty',                # (counts=370) into `ford f-250 super duty` (counts=241)
    'ford f-350 sd': 'ford f-350 super duty',                       # (counts=295), combined with `ford f350 super duty` (counts=246)
    'ford f350 super duty': 'ford f-350 super duty',                # combined group (counts=541) renamed to the `f-###` format
    'ford f150 supercrew cab xlt': 'ford f-150 supercrew cab xlt',  # (counts=327)
    'ford f350': 'ford f-350',                                      # (counts=250)
}
vehicles_df['model'] = vehicles_df['model'].replace(ford_model_renames)


# rebuild the ('model', 'count') dataframe to review the changes above in the 'model' column
model_counts_df = (
    vehicles_df['model']
    .value_counts()
    .rename_axis('model')
    .reset_index(name='count')
)

# sort `model` column in alphabetical order
sorted_model_df = model_counts_df.sort_values(by='model')


# full (truncated) listing, then the first (60) and last (36) rows
display(sorted_model_df, sorted_model_df.head(60), sorted_model_df.tail(36))

Unnamed: 0,model,count
91,acura tl,236
82,bmw x5,267
81,buick enclave,271
63,cadillac escalade,322
49,chevrolet camaro,414
...,...,...
60,toyota sienna,329
13,toyota tacoma,827
25,toyota tundra,603
32,volkswagen jetta,519


Unnamed: 0,model,count
91,acura tl,236
82,bmw x5,267
81,buick enclave,271
63,cadillac escalade,322
49,chevrolet camaro,414
67,chevrolet camaro lt coupe 2d,311
76,chevrolet colorado,286
33,chevrolet corvette,499
42,chevrolet cruze,457
27,chevrolet equinox,591


Unnamed: 0,model,count
37,hyundai sonata,477
75,jeep cherokee,293
16,jeep grand cherokee,806
84,jeep grand cherokee laredo,256
54,jeep liberty,355
4,jeep wrangler,1119
44,jeep wrangler unlimited,452
92,kia sorento,236
56,kia soul,349
94,mercedes-benz benze sprinter 2500,41


### <span style="color: green;">Create Two New Columns: `make` and `model_name`</span>

In [14]:
# FE: create two new columns, 'make' and 'model_name', derived from 'model' column

# `.str.split(' ', n=1, expand=True)` splits each model string at the first space only:
#   - the part before the first space becomes 'make'
#   - everything after it (including any further spaces) becomes 'model_name'
#   - `expand=True` returns the two parts as separate dataframe columns
vehicles_df[['make', 'model_name']] = vehicles_df['model'].str.split(' ', n=1, expand=True)


# review & confirm two new columns created
# `.info()` prints its report itself and returns None, so it is called directly;
# wrapping it in `display()` would render an extra `None` below the report
vehicles_df.info()
display(vehicles_df.isna().sum())
display(vehicles_df)
display(vehicles_df.head(60))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
 14  model_name    51525 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 5.9+ MB


None

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
make                0
model_name          0
dtype: int64

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw,x5
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford,f-150
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai,sonata
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford,f-150
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37,nissan,maxima
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22,honda,civic
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32,hyundai,sonata
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71,toyota,corolla


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw,x5
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford,f-150
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai,sonata
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford,f-150
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler,200
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15,chrysler,300
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73,toyota,camry
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68,honda,pilot
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19,kia,sorento
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17,honda,pilot


In [15]:
# EDA: view the unique value names and counts in the two derived columns

for derived_column in ('make', 'model_name'):
    # sort=False leaves the counts unsorted by frequency
    display(vehicles_df[derived_column].value_counts(sort=False))
    print()
print()

make
bmw                267
ford             12672
hyundai           1173
chrysler           838
toyota            5445
honda             3485
kia                585
chevrolet        10611
ram               3316
gmc               2378
jeep              3281
nissan            3208
subaru            1272
dodge             1255
mercedes-benz       41
acura              236
cadillac           322
volkswagen         869
buick              271
Name: count, dtype: int64




model_name
x5              267
f-150          3326
sonata          477
200             243
300             316
               ... 
dakota          242
f-350           250
trailblazer     255
econoline       296
murano          235
Name: count, Length: 95, dtype: int64




In [16]:
# FE: Consolidating vehicle type value names in the 'type' column

# (not applied yet) option: combine 'truck' and 'pickup' value names into 'truck' in `type` column
#vehicles_df['type'] = vehicles_df['type'].replace('pickup', 'truck')
# (not applied yet) option: combine them into 'pickup' instead

# current counts per vehicle type, for reference while deciding
type_counts = vehicles_df['type'].value_counts(sort=True)
display(type_counts)

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

Note-2-Self

export large output greater than 120 onto `.csv` or `.txt` or `.xlsx` 

## <span style="color: teal;">Remove Missing Value, Null, and NaN (*Not a Number*)</span>

<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- Fill Missing Values
    - `is_4wd` column: replace `NaN` with zero (`0` = not 4wd capable)
    - `paint_color` column: replace `NaN` with `'white'`
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- for now...okay to keep the `date_posted` column as `object` datatype
    - will determine and return if needed to change to `datetime` datatype
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>


### <span style="color: green;">Fill Missing Value: `'is_4wd'` column</span>


In [17]:
# MISSING VALUES: 'is_4wd' only holds 1.0 or NaN, so NaN is replaced with zero (0 = not 4wd)
is_4wd_filled = vehicles_df['is_4wd'].fillna(value=0)
vehicles_df['is_4wd'] = is_4wd_filled

### <span style="color: green;">Fill Missing Value: `'paint_color'` column</span>


In [18]:
# EDA: using `.describe()` on `paint_color` column
# wrapped in `display()` because only the last expression of a cell is shown automatically
display(vehicles_df['paint_color'].describe())


# MISSING VALUES: replace NaN with 'white' (the most frequent color in the value counts) in 'paint_color' column
vehicles_df['paint_color'] = vehicles_df['paint_color'].fillna('white')

### <span style="color: green;">Filling Missing Value: `'model_year'` column</span>

In [19]:
# SANITY CHECK: per-column NaN counts before filling in missing values in 'model_year' column
missing_before_model_year = vehicles_df.isna().sum()
missing_before_model_year

price              0
model_year      3619
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [20]:
# MISSING VALUES: replace NaN in 'model_year' with the median model year of rows sharing the same 'model'
    # pattern: df['column_with_NaN'] = df['column_with_NaN'].fillna(df.groupby(['column-1', 'column-2'])['column_with_NaN'].transform('median'))
    # 'column-1', 'column-2' = grouping columns that must have some relevance to 'column_with_NaN'
        # REMINDER: each additional groupby column makes the groups smaller;
            # a group with no known value has no median, so its NaN stays unfilled
# NOTE(review): a median can be fractional (e.g. xxxx.5) - confirm whether imputed years should be rounded

# median model year of each row's 'model' group, aligned to the rows of vehicles_df
median_year_by_model = vehicles_df.groupby(by=['model'])['model_year'].transform('median')
vehicles_df['model_year'] = vehicles_df['model_year'].fillna(median_year_by_model)

In [21]:
# SANITY CHECK: per-column NaN counts after filling in missing values in 'model_year' column
missing_after_model_year = vehicles_df.isna().sum()
missing_after_model_year

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

### <span style="color: green;">Filling Missing Value: `'odometer'` column</span>


In [22]:
# EDA: summary statistics (count, mean, std, min, quartiles, max) of `odometer` column
odometer_summary = vehicles_df['odometer'].describe()
odometer_summary

count     43633.000000
mean     115553.461738
std       65094.611341
min           0.000000
25%       70000.000000
50%      113000.000000
75%      155000.000000
max      990000.000000
Name: odometer, dtype: float64

In [23]:
# SANITY CHECK: per-column NaN counts before filling in missing values in 'odometer' column
missing_before_odometer = vehicles_df.isna().sum()
missing_before_odometer

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [24]:
# MISSING VALUES: fill NaN in 'odometer' with the median odometer of the same ('model', 'model_year') group
#   pattern: df['col'] = df['col'].fillna(df.groupby(['key-1', 'key-2'])['col'].transform('median'))
#   - every groupby key should have some relevance to the column being filled
#   - REMINDER: each additional groupby key makes the groups smaller, so a group may end up
#     with no known value at all -- its median is NaN and those rows stay unfilled

# median odometer of each row's group, aligned to the original row index
odometer_group_median = vehicles_df.groupby(['model', 'model_year'])['odometer'].transform('median')

# only the NaN rows pick up the group median; known readings are kept as they are
vehicles_df['odometer'] = vehicles_df['odometer'].fillna(odometer_group_median)

In [25]:
# SANITY CHECK: per-column NaN counts once the 'odometer' fill has run
vehicles_df.isnull().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer          79
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [26]:
# REFERENCE CODE:  use `groupby()` and `transform()` to fill in missing values

#df['value'] = df['value'].fillna(df.groupby('name')['value'].transform('mean'))

### <span style="color: green;">Filling Missing Value: `'cylinders'` column</span>

In [27]:
# SANITY CHECK: per-column NaN counts ahead of the 'cylinders' fill
vehicles_df.isnull().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer          79
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [28]:
# MISSING VALUES: fill NaN in 'cylinders' with the most common cylinder count of the same ('model', 'model_year') group
#   pattern: df['col'] = df['col'].fillna(df.groupby(['key-1', 'key-2'])['col'].transform(<statistic>))
#   - every groupby key should have some relevance to the column being filled
#   - REMINDER: each additional groupby key makes the groups smaller, so a group may end up
#     with no known value at all -- those rows stay unfilled
#   - the mode is used rather than the median: 'cylinders' is a discrete count, and the median of an
#     even-sized group can fall between two real values (e.g. 6 and 8 -> 7), producing a cylinder
#     count that no listing in the group actually has
#   - ties go to the smallest value, because `Series.mode()` returns its results sorted
#   - a group with no known cylinder count yields NaN, so its rows are left as NaN

cylinders_group_mode = vehicles_df.groupby(['model', 'model_year'])['cylinders'].transform(
    lambda counts: counts.mode().iloc[0] if counts.notna().any() else np.nan
)

# only the NaN rows pick up the group mode; known cylinder counts are kept as they are
vehicles_df['cylinders'] = vehicles_df['cylinders'].fillna(cylinders_group_mode)

In [29]:
# SANITY CHECK: remaining NaN counts, then dtypes / non-null counts, once the 'cylinders' fill has run
nan_counts = vehicles_df.isna().sum()
display(nan_counts)
print()

vehicles_df.info()

price            0
model_year       0
model            0
condition        0
cylinders       23
fuel             0
odometer        79
transmission     0
type             0
paint_color      0
is_4wd           0
date_posted      0
days_listed      0
make             0
model_name       0
dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51502 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51446 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
 14  model_name    51525 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 5.9+ MB


### <span style="color: green;">Dropping Missing Values: `'odometer'` & `'cylinders'` columns</span>

<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- Drop the remaining missing values that could not be filled in or replaced
    - `'cylinders'` (counts=23)
    - `'odometer'` (counts=79)
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- <>


## <span style="color: red;">TEST CODES (DID NOT USE)</span>

In [30]:
# EDA: look at the BMW listings only

# rows whose make is 'bmw', restricted to the columns of interest
bmw_columns = ['model', 'model_year', 'cylinders', 'odometer', 'type', 'make', 'model_name']
bmw_df = vehicles_df.loc[vehicles_df['make'] == 'bmw', bmw_columns]

# order by model year, then model, then cylinders (ascending on every key)
sort_bmw_df = bmw_df.sort_values(by=['model_year', 'model', 'cylinders'])

# show the first 60 rows of the sorted frame
display(sort_bmw_df.head(60))

Unnamed: 0,model,model_year,cylinders,odometer,type,make,model_name
14731,bmw x5,2001.0,6.0,140000.0,SUV,bmw,x5
40844,bmw x5,2001.0,6.0,165273.0,SUV,bmw,x5
50828,bmw x5,2001.0,6.0,153000.0,hatchback,bmw,x5
456,bmw x5,2001.0,8.0,190108.0,SUV,bmw,x5
1425,bmw x5,2001.0,8.0,190108.0,SUV,bmw,x5
10433,bmw x5,2001.0,8.0,118000.0,SUV,bmw,x5
26000,bmw x5,2001.0,8.0,157146.0,SUV,bmw,x5
349,bmw x5,2002.0,6.0,153000.0,SUV,bmw,x5
14536,bmw x5,2002.0,6.0,163000.0,SUV,bmw,x5
23532,bmw x5,2002.0,6.0,163000.0,SUV,bmw,x5


In [31]:
# groupby: number of BMW listings per (model, model_year, cylinders) combination

# same grouping over the full dataset (kept for reference, not run):
#group_vehicles = vehicles_df.groupby(['model', 'model_year', 'cylinders'])['model_name'].count()

bmw_group_keys = ['model', 'model_year', 'cylinders']
group_bmw = bmw_df.groupby(bmw_group_keys)['model_name'].count()

display(group_bmw)


model   model_year  cylinders
bmw x5  2001.0      6.0           3
                    8.0           4
        2002.0      6.0           3
                    8.0           1
        2003.0      6.0           5
                    8.0           9
        2004.0      6.0          15
                    8.0           5
        2005.0      6.0           8
                    8.0           2
        2006.0      6.0           9
                    8.0           5
        2007.0      6.0          15
        2008.0      6.0          13
                    8.0           8
        2009.0      6.0          12
                    8.0           1
        2010.0      6.0          35
                    8.0          10
        2011.0      6.0          24
                    8.0           3
        2012.0      6.0          39
                    8.0           1
        2013.0      6.0          15
                    8.0           1
        2014.0      6.0           3
        2015.0      6.0          1

In [32]:
# turn the grouped counts into a flat dataframe:
# the group keys move out of the index into ordinary columns, next to the 'model_name' count

group_bmw_df = group_bmw.to_frame().reset_index()

display(group_bmw_df)

Unnamed: 0,model,model_year,cylinders,model_name
0,bmw x5,2001.0,6.0,3
1,bmw x5,2001.0,8.0,4
2,bmw x5,2002.0,6.0,3
3,bmw x5,2002.0,8.0,1
4,bmw x5,2003.0,6.0,5
5,bmw x5,2003.0,8.0,9
6,bmw x5,2004.0,6.0,15
7,bmw x5,2004.0,8.0,5
8,bmw x5,2005.0,6.0,8
9,bmw x5,2005.0,8.0,2


In [33]:
# TEST CODE:

# keep only the rows where 'cylinders' is still NaN
missing_cylinders_mask = vehicles_df['cylinders'].isna()
cylinders_df = vehicles_df[missing_cylinders_mask]

# order those rows by 'model' in descending order (use ascending=True for ascending order)
sort_cylinders_df = cylinders_df.sort_values(by='model', ascending=False)

# a random sample of 10 rows, then the first and last 10 rows of the sorted frame
display(
    cylinders_df.sample(10),
    sort_cylinders_df.head(10),
    sort_cylinders_df.tail(10),
)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
29822,1800,1989.0,toyota camry,excellent,,gas,196617.0,automatic,sedan,blue,0.0,2019-03-01,32,toyota,camry
39952,1900,1990.0,toyota camry,good,,gas,156481.0,automatic,sedan,white,0.0,2018-09-29,43,toyota,camry
47761,3800,1984.0,honda accord,new,,gas,121000.0,manual,sedan,blue,0.0,2019-01-05,162,honda,accord
1101,9200,1975.0,ford f-150,excellent,,gas,,automatic,truck,green,0.0,2018-08-28,40,ford,f-150
15623,5300,2005.0,dodge charger,excellent,,gas,125000.0,automatic,sedan,red,0.0,2018-10-30,46,dodge,charger
35818,1700,1986.0,nissan sentra,excellent,,gas,152000.0,manual,coupe,blue,0.0,2018-09-29,23,nissan,sentra
7113,84900,2019.0,ford f-350,new,,diesel,9000.0,automatic,pickup,white,1.0,2019-03-26,7,ford,f-350
33965,5800,1971.0,ford econoline,good,,gas,78000.0,automatic,van,blue,0.0,2019-04-01,40,ford,econoline
36582,44900,1949.0,chevrolet suburban,good,,gas,1800.0,automatic,wagon,orange,0.0,2018-08-19,10,chevrolet,suburban
3844,1900,1986.0,nissan sentra,excellent,,gas,152000.0,manual,coupe,blue,0.0,2018-10-29,63,nissan,sentra


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
12760,1050,1995.0,toyota corolla,fair,,gas,428000.0,manual,sedan,black,0.0,2018-06-13,12,toyota,corolla
39952,1900,1990.0,toyota camry,good,,gas,156481.0,automatic,sedan,white,0.0,2018-09-29,43,toyota,camry
29822,1800,1989.0,toyota camry,excellent,,gas,196617.0,automatic,sedan,blue,0.0,2019-03-01,32,toyota,camry
27075,3800,1988.0,ram 2500,good,,gas,64000.0,automatic,pickup,white,1.0,2018-08-26,34,ram,2500
3844,1900,1986.0,nissan sentra,excellent,,gas,152000.0,manual,coupe,blue,0.0,2018-10-29,63,nissan,sentra
35818,1700,1986.0,nissan sentra,excellent,,gas,152000.0,manual,coupe,blue,0.0,2018-09-29,23,nissan,sentra
36935,3995,2009.0,kia sorento,excellent,,gas,211698.0,automatic,SUV,grey,1.0,2018-10-08,56,kia,sorento
43595,5995,1988.0,jeep wrangler,good,,gas,100309.0,manual,SUV,white,1.0,2018-08-17,12,jeep,wrangler
7121,6900,2009.0,jeep cherokee,excellent,,gas,130023.0,automatic,SUV,black,1.0,2018-09-03,19,jeep,cherokee
47761,3800,1984.0,honda accord,new,,gas,121000.0,manual,sedan,blue,0.0,2019-01-05,162,honda,accord


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
14752,15000,1954.0,ford f-150,excellent,,gas,3565.0,manual,pickup,black,0.0,2019-02-16,13,ford,f-150
1101,9200,1975.0,ford f-150,excellent,,gas,,automatic,truck,green,0.0,2018-08-28,40,ford,f-150
33965,5800,1971.0,ford econoline,good,,gas,78000.0,automatic,van,blue,0.0,2019-04-01,40,ford,econoline
15623,5300,2005.0,dodge charger,excellent,,gas,125000.0,automatic,sedan,red,0.0,2018-10-30,46,dodge,charger
36582,44900,1949.0,chevrolet suburban,good,,gas,1800.0,automatic,wagon,orange,0.0,2018-08-19,10,chevrolet,suburban
6982,3900,1977.0,chevrolet suburban,fair,,gas,,automatic,SUV,custom,0.0,2019-02-02,71,chevrolet,suburban
3914,9750,2007.0,chevrolet silverado 1500 crew,excellent,,gas,133000.0,automatic,pickup,grey,0.0,2018-08-22,55,chevrolet,silverado 1500 crew
37000,6800,1980.0,chevrolet malibu,good,,gas,,automatic,coupe,brown,0.0,2019-02-26,33,chevrolet,malibu
28799,45900,1971.0,chevrolet camaro,like new,,gas,0.0,manual,coupe,orange,0.0,2019-01-10,75,chevrolet,camaro
11087,16000,1971.0,chevrolet camaro,excellent,,gas,0.0,manual,coupe,brown,0.0,2018-12-28,125,chevrolet,camaro


In [34]:
# SANITY CHECK: per-column NaN counts, then dtypes / non-null counts for the whole dataframe
nan_counts = vehicles_df.isna().sum()
display(nan_counts)
print()

vehicles_df.info()

price            0
model_year       0
model            0
condition        0
cylinders       23
fuel             0
odometer        79
transmission     0
type             0
paint_color      0
is_4wd           0
date_posted      0
days_listed      0
make             0
model_name       0
dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51502 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51446 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
 14  model_name    51525 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 5.9+ MB


In [35]:
# pull all rows containing the specific 'model_year' column

#specific_model = 

#filtered_models = vehicles_df.loc[vehicles_df['model']]


In [36]:
# Reference Code - double check the `model` column


# SANITY CHECK: spot check that a specific user ID's monthly call counts add up to its total 2018 call count


# Replace with the user_id you want to filter
#specific_user_id = 1498  

# pull all rows containing the specific user ID
#filtered_df_call_counts = df_user_monthly_call_count.loc[df_user_monthly_call_count['user_id'] == specific_user_id]

# sum the specific user ID's monthly call counts
#user_total_call_count = filtered_df_call_counts['total_calls'].sum()

# notice this equals the User ID's total call counts in 2018
#print(f"User ID: {specific_user_id} total call count in 2018 is {user_total_call_count}.")


# Display the filtered dataframe
#display(filtered_df_call_counts)

In [37]:
# CLEANING DATA - remove NaN values in `model_year` column




In [38]:
# Feature Engineering - parse manufacturer and model name in 'model' column
# code to reference to parse manufacturer and model
# use later.  Need to clean.
# remove null / Nan / missing values


#vehicles_df[['make', 'model']] = vehicles_df['model'].str.split(' ', n=1, expand=True)