# Initialization

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import plotly.express as px
from IPython.display import display

## Load Data

In [2]:
# load *.csv file into dataframes
try:
    vehicles_df = pd.read_csv('vehicles_us.csv')
except:
    vehicles_df = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/vehicles_us.csv')

# EDA - Exploratory Data Analysis

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
1. Review uploaded `vehicles_us.csv` file
2. Check for missing values, `Nulls`, `NaN` (*Not a Number*)

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- 51525 row with 13 columns
- missing values, `Nulls`, & `NaNs` found in the following (5) attributes:
    1. `model_year` (3619)
    2. `cylinders` (5260)
    3. `odometer` (7892)
    4. `paint_color` (9267)
    5. `is_4wd` (25953)
- `is_4wd` column is boolean logic (1 = Y vs. 0 = N)
    - `NaN` found and may need to determine if these will be zero (0 = No)
- found several ford model names that can be consolidated
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
    - keep `f-###` model name format 
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- will need to parse out the manufacturer and vehicle model from `model` column
    - *see code shared by Jester* 
- `date_posted` and `model_year` column as `object` datatype
    - convert to `datetime` datatype
- `model_year` column as `float` datatype
    - convert to `datetime` datatype for ease of data visualization
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>



In [3]:
# EDA: preview data
display(vehicles_df.sample(60))
#display(vehicles_df.head(60))
#display(vehicles_df.tail(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
28315,13490,2012.0,ford econoline,good,8.0,gas,80836.0,automatic,truck,white,,2019-02-18,21
34631,9500,,nissan frontier,good,4.0,gas,,automatic,truck,black,,2018-12-23,13
36914,5500,2002.0,ram 1500,good,8.0,gas,217321.0,automatic,pickup,,1.0,2018-08-01,39
29012,5900,2006.0,hyundai sonata,excellent,5.0,gas,29373.0,automatic,sedan,silver,,2018-05-22,26
12806,30990,2015.0,chevrolet silverado 3500hd,excellent,8.0,diesel,111686.0,automatic,other,white,1.0,2018-09-11,55
39096,11300,2015.0,nissan rogue,good,4.0,gas,77000.0,automatic,SUV,grey,,2019-04-01,42
44315,13995,2015.0,chevrolet equinox,good,4.0,gas,,automatic,SUV,silver,1.0,2018-11-18,23
31871,23090,2013.0,ram 2500,excellent,6.0,diesel,160432.0,automatic,truck,white,1.0,2019-04-13,15
20354,14990,2012.0,jeep wrangler,good,,gas,150356.0,automatic,SUV,silver,1.0,2019-02-21,8
13700,10995,,chevrolet cruze,excellent,4.0,gas,95000.0,automatic,sedan,green,,2018-12-03,29


In [4]:
# EDA: check for nulls/NaN/missing values

# view & identify dataframe attributes with null/missing values
vehicles_df.info()
print('\n\n')

# counts of missing values, nulls, NaNs
print('Missing Value / Null / NaN Counts')
display(vehicles_df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB



Missing Value / Null / NaN Counts


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [5]:
# EDA: view the unique value and counts in the following columns

display(vehicles_df['model_year'].value_counts(sort=True))
print()

display(vehicles_df['model'].value_counts(sort=True))
print()

display(vehicles_df['condition'].value_counts(sort=True))
print()

display(vehicles_df['cylinders'].value_counts(sort=True))
print()

display(vehicles_df['fuel'].value_counts(sort=True))
print()

display(vehicles_df['transmission'].value_counts(sort=False))
print()

display(vehicles_df['type'].value_counts(sort=True))
print()

display(vehicles_df['paint_color'].value_counts())
print()

display(vehicles_df['is_4wd'].value_counts(sort=True))
print()

model_year
2013.0    3549
2012.0    3468
2014.0    3448
2011.0    3375
2015.0    3323
          ... 
1948.0       1
1961.0       1
1936.0       1
1949.0       1
1929.0       1
Name: count, Length: 68, dtype: int64




model
ford f-150                           2796
chevrolet silverado 1500             2171
ram 1500                             1750
chevrolet silverado                  1271
jeep wrangler                        1119
                                     ... 
ford f-250 super duty                 241
acura tl                              236
kia sorento                           236
nissan murano                         235
mercedes-benz benze sprinter 2500      41
Name: count, Length: 100, dtype: int64




condition
excellent    24773
good         20145
like new      4742
fair          1607
new            143
salvage        115
Name: count, dtype: int64




cylinders
8.0     15844
6.0     15700
4.0     13864
10.0      549
5.0       272
3.0        34
12.0        2
Name: count, dtype: int64




fuel
gas         47288
diesel       3714
hybrid        409
other         108
electric        6
Name: count, dtype: int64




transmission
automatic    46902
manual        2829
other         1794
Name: count, dtype: int64




type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64




paint_color
white     10029
black      7692
silver     6244
grey       5037
blue       4475
red        4421
green      1396
brown      1223
custom     1153
yellow      255
orange      231
purple      102
Name: count, dtype: int64




is_4wd
1.0    25572
Name: count, dtype: int64




In [6]:
# EDA: view the unique value and counts in the 'type' column
# Note: will use `.isin()` method to filter 'type' with NaN for 
    # no_4wd_nan = [sedan, coupe, wagon, mini-van, hatchback, van, convertible, other, bus] 
    # yes_4wd_nan = [SUV, truck, pickup, offroad]

display(vehicles_df['type'].value_counts(sort=True))

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

### <span style="color: gold;">EDA: Missing Values (*Null / NaN*)</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- deep dive review of all columns with missing values
- determine how to either replace or fill in the missing values

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN

In [7]:
# EDA: review rows with missing values

display(vehicles_df[vehicles_df.isna().any(axis=1)].head(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17
11,8990,2012.0,honda accord,excellent,4.0,gas,111142.0,automatic,sedan,grey,,2019-03-28,29
12,18990,2012.0,ram 1500,excellent,8.0,gas,140742.0,automatic,pickup,,1.0,2019-04-02,37


#### <span style="color: gold;">`'is_4wd'` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review & determine if the `NaN` in `is_4wd` column should be zero (`0` = not 4wd vehicle)



In [8]:
# `is_4wd`: see rows with NaN
rows_with_nan = vehicles_df[vehicles_df['is_4wd'].isna()]

display(rows_with_nan.sample(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
17290,5490,2009.0,gmc sierra 1500,excellent,6.0,gas,,automatic,pickup,silver,,2018-06-08,55
8551,8950,,ford fusion,good,4.0,gas,90770.0,automatic,sedan,black,,2019-02-24,34
27326,16900,,ram 1500,like new,8.0,gas,30145.0,automatic,truck,white,,2019-03-06,113
17924,4500,2005.0,chevrolet silverado 1500,excellent,6.0,gas,168235.0,manual,pickup,red,,2019-03-01,6
27582,6900,2013.0,chevrolet cruze,good,4.0,gas,,automatic,coupe,grey,,2018-07-25,35
1049,4295,,ford focus,excellent,4.0,gas,97824.0,automatic,sedan,white,,2018-12-23,45
11500,7900,2012.0,honda civic,excellent,4.0,gas,,automatic,coupe,,,2018-12-24,123
19132,12500,2016.0,jeep cherokee,like new,4.0,gas,26000.0,automatic,SUV,green,,2018-08-18,24
13613,6300,2014.0,ford focus,new,4.0,gas,75000.0,automatic,sedan,black,,2018-06-21,17
29910,5995,2008.0,toyota camry,good,,gas,194000.0,automatic,sedan,red,,2019-02-04,32


In [9]:
# EDA: filter & view NaN in the 'type' column for specifically for the following vehicle types
# OBJECTIVE: replace NaN with zero (0 = Non-4wd vehicle)

# list of vehicle types that will not have 4wd capability and use with `.isin()` to view then later with either `.fillna()` or `.replace()`
no_4wd_nan = ['sedan', 'coupe', 'mini-van', 'hatchback', 'van', 'convertible', 'other', 'bus']

# new dataframe to review rows
sorted_non_4wd_df = vehicles_df[vehicles_df['type'].isin(no_4wd_nan)][['model_year', 'model', 'type', 'is_4wd']].sort_values(by='model')

# review top, bottom, and random selected (60) rows to have confidence to replace NaN with zero (0 = not 4wd capable) in 'type' column
display(sorted_non_4wd_df.head(60))
display(sorted_non_4wd_df.tail(60))
display(sorted_non_4wd_df.sample(60))

Unnamed: 0,model_year,model,type,is_4wd
33040,2003.0,acura tl,sedan,
38743,2013.0,acura tl,sedan,
41145,2013.0,acura tl,sedan,
38760,2008.0,acura tl,sedan,
23124,2003.0,acura tl,sedan,
11355,2007.0,acura tl,sedan,
23110,2005.0,acura tl,sedan,
32707,2012.0,acura tl,sedan,
23106,2007.0,acura tl,sedan,
11413,2011.0,acura tl,sedan,1.0


Unnamed: 0,model_year,model,type,is_4wd
46569,2013.0,volkswagen passat,sedan,
42078,2014.0,volkswagen passat,sedan,
35025,2015.0,volkswagen passat,sedan,
11261,2004.0,volkswagen passat,sedan,
43403,2007.0,volkswagen passat,sedan,
14511,2013.0,volkswagen passat,sedan,
40209,2013.0,volkswagen passat,sedan,
27283,2008.0,volkswagen passat,hatchback,
48302,2015.0,volkswagen passat,sedan,
20865,2016.0,volkswagen passat,sedan,


Unnamed: 0,model_year,model,type,is_4wd
43763,2013.0,hyundai sonata,sedan,
51020,2009.0,honda accord,sedan,
2009,2016.0,toyota prius,hatchback,
26383,2003.0,ford taurus,sedan,
6776,2013.0,dodge charger,sedan,
41533,2010.0,toyota prius,hatchback,
138,2013.0,nissan altima,sedan,
44387,2013.0,kia soul,sedan,
18347,2013.0,ford focus,hatchback,
45974,2009.0,toyota camry,sedan,


In [10]:
# EDA: filter & view NaN in the 'type' column for specifically for the following vehicle types
# OBJECTIVE: replace NaN with one (1 = 4wd vehicle)

yes_4wd_nan = ['SUV', 'truck', 'pickup', 'wagon', 'offroad']

#yes_4wd_nan = ['truck']

# new dataframe to review rows
sorted_yes_4wd_df = vehicles_df[vehicles_df['type'].isin(yes_4wd_nan)][['model_year', 'model', 'type', 'is_4wd']].sort_values(by='model')


# review top, bottom, and random selected (60) rows to have confidence to replace NaN with zero (0 = not 4wd capable) in 'type' column
display(sorted_yes_4wd_df.head(60))
display(sorted_yes_4wd_df.tail(60))
display(sorted_yes_4wd_df.sample(60))

Unnamed: 0,model_year,model,type,is_4wd
8133,2007.0,acura tl,SUV,
7029,2007.0,acura tl,SUV,
8675,2007.0,acura tl,SUV,
16496,2008.0,bmw x5,SUV,1.0
16449,2010.0,bmw x5,SUV,1.0
29694,2010.0,bmw x5,SUV,1.0
29704,2011.0,bmw x5,SUV,1.0
43159,2012.0,bmw x5,SUV,1.0
29745,2012.0,bmw x5,SUV,1.0
43196,2011.0,bmw x5,SUV,1.0


Unnamed: 0,model_year,model,type,is_4wd
27439,2007.0,toyota tundra,truck,1.0
16725,2006.0,toyota tundra,truck,
2110,2014.0,toyota tundra,truck,1.0
31366,2000.0,toyota tundra,truck,
16724,2003.0,toyota tundra,pickup,
20875,2007.0,toyota tundra,truck,
46479,2001.0,toyota tundra,truck,1.0
31407,,toyota tundra,pickup,1.0
20549,2005.0,toyota tundra,pickup,1.0
40796,2006.0,toyota tundra,pickup,1.0


Unnamed: 0,model_year,model,type,is_4wd
9748,2008.0,ford ranger,pickup,
29120,2018.0,chevrolet silverado 2500hd,truck,1.0
27506,2011.0,chevrolet traverse,wagon,1.0
7291,2014.0,jeep grand cherokee laredo,SUV,
45033,2017.0,ford f250,pickup,1.0
49487,,ford escape,SUV,
32711,2015.0,ford explorer,SUV,1.0
29517,2007.0,chevrolet silverado,truck,1.0
27376,2018.0,nissan rogue,SUV,
10465,2014.0,ford f-150,pickup,1.0


#### <span style="color: gold;">`'model_year'` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review & determine how replace the `NaN` with a year in `model_year` column
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN
- <>

In [11]:
# EDA: view all the unique value and counts in the 'model_year' column

# created a new dataframe to view the entire (68) unique values
model_year_counts_df = (vehicles_df["model_year"].value_counts(sort=False)).reset_index()

# create the columns to view  in the created dataframe to review the (68) unique years and counts
model_year_counts_df.columns = ['model_year', 'count']

# sort model years in ascending order (ascending=True) or descending order (ascending=False)
model_year_df_sorted = model_year_counts_df.sort_values(by='model_year', ascending=False)

# display all (68) unique model years and its counts in the `model_year` column
display(model_year_df_sorted.head(60))
display(model_year_df_sorted.tail(10))

Unnamed: 0,model_year,count
18,2019.0,380
8,2018.0,2193
3,2017.0,2419
20,2016.0,2954
5,2015.0,3323
4,2014.0,3448
1,2013.0,3549
6,2012.0,3468
0,2011.0,3375
10,2010.0,2691


Unnamed: 0,model_year,count
63,1961.0,1
61,1960.0,3
66,1958.0,2
54,1955.0,1
58,1954.0,1
65,1949.0,1
60,1948.0,1
64,1936.0,1
67,1929.0,1
62,1908.0,2


#### <span style="color: gold;">`'cylinders'` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review & determine how replace the `NaN` with a year in `cylinders` column
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN
- <>

In [12]:
# EDA: view rows with NaN in `cylinders` column

cylinders_df = vehicles_df[vehicles_df['cylinders'].isna()]


# sort rows in ascending order, or descending order (ascending=False), under `cylinders` column
sort_cylinders_df = cylinders_df.sort_values(by='model', ascending=False)


display(cylinders_df.sample(10))
display(sort_cylinders_df.head(10))
display(sort_cylinders_df.tail(10))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
3740,5900,,honda cr-v,good,,gas,177663.0,automatic,SUV,blue,1.0,2019-02-09,34
35622,10250,2015.0,honda civic,excellent,,gas,,automatic,sedan,grey,,2019-01-04,38
19402,14995,2009.0,jeep wrangler,good,,gas,77046.0,automatic,SUV,yellow,1.0,2018-06-30,96
29438,12495,2013.0,volkswagen jetta,excellent,,diesel,40593.0,automatic,sedan,silver,,2018-10-19,51
15012,4995,2009.0,hyundai sonata,good,,gas,115150.0,automatic,sedan,blue,,2019-04-09,52
4734,28977,2012.0,ram 2500,excellent,,gas,87742.0,automatic,pickup,,1.0,2019-04-10,71
8516,2100,1994.0,ford f-250,good,,gas,132600.0,automatic,pickup,blue,,2018-09-02,29
9056,10990,2015.0,toyota camry,good,,gas,121487.0,automatic,sedan,grey,,2018-12-31,7
16053,3996,2004.0,toyota camry,good,,gas,146000.0,automatic,sedan,green,,2018-09-04,16
17245,5395,,acura tl,excellent,,gas,133500.0,automatic,sedan,white,,2018-11-18,31


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
40036,9100,2013.0,volkswagen passat,excellent,,gas,73000.0,automatic,sedan,white,,2019-04-04,39
16229,12495,2013.0,volkswagen passat,excellent,,diesel,40437.0,automatic,sedan,silver,,2018-08-01,60
27322,6995,2013.0,volkswagen passat,excellent,,gas,103336.0,manual,sedan,black,,2018-08-04,21
48309,7950,2012.0,volkswagen passat,like new,,gas,108000.0,automatic,sedan,black,,2018-10-22,33
40361,6900,2013.0,volkswagen passat,good,,gas,,automatic,sedan,,,2018-09-26,36
15353,9995,2013.0,volkswagen passat,good,,diesel,69898.0,automatic,sedan,silver,,2018-09-15,20
39426,4777,2000.0,volkswagen passat,excellent,,gas,,manual,sedan,silver,,2018-07-07,1
13103,11495,2015.0,volkswagen passat,excellent,,gas,69000.0,automatic,sedan,grey,,2018-09-08,92
22154,9995,2014.0,volkswagen passat,good,,gas,83000.0,automatic,sedan,white,,2018-07-14,22
22129,12500,2013.0,volkswagen passat,excellent,,diesel,52573.0,automatic,sedan,,,2019-01-20,14


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
7422,8795,2008.0,acura tl,excellent,,gas,100496.0,automatic,sedan,,,2018-11-17,31
43814,3999,2008.0,acura tl,good,,gas,145000.0,automatic,sedan,red,,2018-08-28,8
15360,4295,2006.0,acura tl,good,,gas,,automatic,sedan,silver,,2018-08-03,75
9284,7500,2006.0,acura tl,like new,,gas,101000.0,automatic,sedan,silver,,2019-01-25,29
17375,4800,2005.0,acura tl,good,,gas,127705.0,automatic,sedan,custom,,2019-02-05,56
38760,6700,2008.0,acura tl,excellent,,gas,134460.0,automatic,sedan,white,,2019-01-20,18
20517,5490,2008.0,acura tl,good,,gas,165738.0,automatic,other,blue,,2018-11-24,12
23124,3299,2003.0,acura tl,excellent,,gas,116234.0,automatic,sedan,brown,,2019-03-13,15
20444,2000,2012.0,acura tl,excellent,,gas,100000.0,automatic,sedan,black,1.0,2018-09-19,4
38628,11999,2012.0,acura tl,excellent,,gas,85000.0,automatic,sedan,white,,2018-08-07,51


### <span style="color: gold;">EDA: Duplicate names in `'model'` column</span>

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
- review and check if there is duplicate/similar model names in `model` column
    - if found then will need to consolidate by replacing/changing the model names
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- found several ford model names that can be consolidated
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
    - keep `f-###` model name format 
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`
- `'odometer'` - take average to fill-in NaN

In [13]:
# EDA: view all the unique value and counts in the 'model' column
# found several ford model names that can be consolidated i.e. 'ford f250' with 'ford f-250'


# create another dataframe to review the (100) unique vehicle models and its counts
model_counts_df = (vehicles_df['model'].value_counts()).reset_index()


# create the columns to view  in the created dataframe to review the (100) unique vehicle models and counts
model_counts_df.columns = ['model', 'count']


# sort `model` column by alphabectical order 
model_counts_df_sorted = model_counts_df.sort_values(by='model')


display(model_counts_df_sorted.head(60))
display(model_counts_df_sorted.tail(42))

Unnamed: 0,model,count
96,acura tl,236
85,bmw x5,267
84,buick enclave,271
65,cadillac escalade,322
49,chevrolet camaro,414
69,chevrolet camaro lt coupe 2d,311
79,chevrolet colorado,286
31,chevrolet corvette,499
40,chevrolet cruze,457
25,chevrolet equinox,591


Unnamed: 0,model,count
86,honda civic lx,262
18,honda cr-v,685
39,honda odyssey,457
73,honda pilot,302
46,hyundai elantra,423
83,hyundai santa fe,273
35,hyundai sonata,477
78,jeep cherokee,293
15,jeep grand cherokee,806
87,jeep grand cherokee laredo,256


## <span style="color: teal;">Feature Engineering (FE)</span>
<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- combined the following ford model names in `model` column
    - `ford f150` (counts=530) with `ford f-150` (counts=2796) 
    - `ford f250` (counts=339) with `ford f-250` (counts=422)
    - `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
    - `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
    - `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
- renamed the following ford model names in `model` column
    - `ford f350 super duty` (counts=541) to `ford f-350 super duty`
    - `ford f150 supercrew cab xlt` (counts=327) to `ford f-150 supercrew cab xlt`
    - `ford f350` (counts=250) to `ford f-350`
- created two new columns, `make` and `model_name`, derived from `model`
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span> 
- for now...okay to keep the `date_posted` column as `object` datatype
    - will determine and return if needed to change to `datetime` datatype
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>


### <span style="color: green;">Combine & Rename ford model names in `'model'` column</span>

In [14]:
# FE: replace the following model names in the 'model' column

# `ford f150` (counts=530) with `ford f-150` (counts=2796)
vehicles_df['model'] = vehicles_df['model'].replace('ford f150', 'ford f-150')

# `ford f250` (counts=339) with `ford f-250` (counts=422)
vehicles_df['model'] = vehicles_df['model'].replace('ford f250', 'ford f-250')

# `ford f-250 sd` (counts=426) with `ford f-250 super duty` (counts=241)
vehicles_df['model'] = vehicles_df['model'].replace('ford f-250 sd', 'ford f-250 super duty')

# `ford f250 super duty` (counts=370) with `ford f-250 super duty` (counts=241)
vehicles_df['model'] = vehicles_df['model'].replace('ford f250 super duty', 'ford f-250 super duty')

# `ford f-350 sd` (counts=295) with `ford f350 super duty` (counts=246)
vehicles_df['model'] = vehicles_df['model'].replace('ford f-350 sd', 'ford f350 super duty')

# rename `ford f350 super duty` (counts=541) to `ford f-350 super duty`
vehicles_df['model'] = vehicles_df['model'].replace('ford f350 super duty', 'ford f-350 super duty')

# rename `ford f150 supercrew cab xlt` (counts=327) to `ford f-150 supercrew cab xlt`
vehicles_df['model'] = vehicles_df['model'].replace('ford f150 supercrew cab xlt', 'ford f-150 supercrew cab xlt')

# rename `ford f350` (counts=250) to `ford f-350`
vehicles_df['model'] = vehicles_df['model'].replace('ford f350', 'ford f-350')


# create another dataframe to review the changes above in the 'model' column
model_counts_df = vehicles_df['model'].value_counts().reset_index()


# create the columns to view  in the created dataframe to review the (100) unique vehicle models and counts
model_counts_df.columns = ['model', 'count']

# sort `model` column by alphabectical order 
sorted_model_df = model_counts_df.sort_values(by='model')


display(sorted_model_df)
display(sorted_model_df.head(60))
display(sorted_model_df.tail(36))

Unnamed: 0,model,count
91,acura tl,236
82,bmw x5,267
81,buick enclave,271
63,cadillac escalade,322
49,chevrolet camaro,414
...,...,...
60,toyota sienna,329
13,toyota tacoma,827
25,toyota tundra,603
32,volkswagen jetta,519


Unnamed: 0,model,count
91,acura tl,236
82,bmw x5,267
81,buick enclave,271
63,cadillac escalade,322
49,chevrolet camaro,414
67,chevrolet camaro lt coupe 2d,311
76,chevrolet colorado,286
33,chevrolet corvette,499
42,chevrolet cruze,457
27,chevrolet equinox,591


Unnamed: 0,model,count
37,hyundai sonata,477
75,jeep cherokee,293
16,jeep grand cherokee,806
84,jeep grand cherokee laredo,256
54,jeep liberty,355
4,jeep wrangler,1119
44,jeep wrangler unlimited,452
92,kia sorento,236
56,kia soul,349
94,mercedes-benz benze sprinter 2500,41


### <span style="color: green;">Create Two New Columns: `'make'` and `'model_name'`</span>


In [15]:
# FE: create two new columns, 'make' and 'model_name', derived from 'model' column

# `.str.split(' ')`: This splits each string in the model column by spaces. The default behavior is to split by whitespace.
# `n=1`: This limits the split to 1 occurrence. This means that the string will be split at the first space only. If there are more spaces in the string, they will be ignored for further splitting.
# `expand=True`: This returns a DataFrame where each split part becomes a separate column.
vehicles_df[['make', 'model_name']] = vehicles_df['model'].str.split(' ', n=1, expand=True)


# review & confirm two new columns created
display(vehicles_df.info())
display(vehicles_df.isna().sum())
display(vehicles_df)
display(vehicles_df.head(60))
#display(vehicles_df.tail(60))
#display(vehicles_df.sample(60))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
 14  model_name    51525 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 5.9+ MB


None

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
make                0
model_name          0
dtype: int64

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw,x5
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford,f-150
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai,sonata
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford,f-150
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37,nissan,maxima
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22,honda,civic
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32,hyundai,sonata
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71,toyota,corolla


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw,x5
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford,f-150
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai,sonata
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford,f-150
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler,200
5,14990,2014.0,chrysler 300,excellent,6.0,gas,57954.0,automatic,sedan,black,1.0,2018-06-20,15,chrysler,300
6,12990,2015.0,toyota camry,excellent,4.0,gas,79212.0,automatic,sedan,white,,2018-12-27,73,toyota,camry
7,15990,2013.0,honda pilot,excellent,6.0,gas,109473.0,automatic,SUV,black,1.0,2019-01-07,68,honda,pilot
8,11500,2012.0,kia sorento,excellent,4.0,gas,104174.0,automatic,SUV,,1.0,2018-07-16,19,kia,sorento
9,9200,2008.0,honda pilot,excellent,,gas,147191.0,automatic,SUV,blue,1.0,2019-02-15,17,honda,pilot


In [16]:
# EDA: view the unique value names and counts in the following columns

display(vehicles_df['make'].value_counts(sort=False))
print()

display(vehicles_df['model_name'].value_counts(sort=False))
print()

make
bmw                267
ford             12672
hyundai           1173
chrysler           838
toyota            5445
honda             3485
kia                585
chevrolet        10611
ram               3316
gmc               2378
jeep              3281
nissan            3208
subaru            1272
dodge             1255
mercedes-benz       41
acura              236
cadillac           322
volkswagen         869
buick              271
Name: count, dtype: int64




model_name
x5              267
f-150          3326
sonata          477
200             243
300             316
               ... 
dakota          242
f-350           250
trailblazer     255
econoline       296
murano          235
Name: count, Length: 95, dtype: int64




In [17]:
# FE: Consolidating vehicle type value names in the 'type' column

# Combine 'truck' and 'pickup' value names into 'truck' value name in `type` column
#vehicles_df['type'] = vehicles_df['type'].replace('pickup', 'truck')

display(vehicles_df['type'].value_counts(sort=True))

# Combine 'truck' and 'pickup' value names into 'pickup' value name in `type` column
#vehicles_df['type'] = vehicles_df['type'].replace()

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

Note-2-Self

export large output greater than 120 onto `.csv` or `.txt` or `.xlsx` 

## <span style="color: teal;">Remove Missing Value, Null, and NaN (*Not a Number*)</span>

<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- Fill Missing Values
    - `is_4wd` column: replace `NaN` with zero (`0` = not 4wd capable)
    - `paint_color` column: replace `NaN` with `'white'`
- Fill Missing Values using `.groupby().transform()`
    - `'model_year'`
    - `'odometer'`
    - `'cylinders'`
- Drop the remaining Missing Values unable to either fill in or replace
    - `'cylinders'` (counts=23)
    - `'odometer'` (counts=79)
- created new dataframe called `clean_VEH_df`
- datatype conversion for the following
    - `'model_year'` (float -> int)
    - `'date_posted'` (object -> datetime)
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- use new dataframe called `clean_VEH_df` for data visualization
- <>


### <span style="color: green;">Fill Missing Value: `'is_4wd'` column</span>


In [18]:
# MISSING VALUES: replace NaN with zero (0 = not 4wd) in 'is_4wd' column 
vehicles_df['is_4wd'] = vehicles_df['is_4wd'].fillna(0)

### <span style="color: green;">Fill Missing Value: `'paint_color'` column</span>


In [19]:
# EDA: using `.describe()` on `paint_color` column
vehicles_df['paint_color'].describe()


# MISSING VALUES: replace NaN with zero (0 = not 4wd) in 'is_4wd' column 
vehicles_df['paint_color'] = vehicles_df['paint_color'].fillna('white')

### <span style="color: green;">Fill Missing Value: `'model_year'` column</span>

In [20]:
# SANITY CHECK: before filling in missing values in 'model_year' column
vehicles_df.isna().sum()

price              0
model_year      3619
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [21]:
# MISSING VALUES: replace NaN with 'median' model year calculation in 'model_year' column
    # use df['column_with_NaN'] = df.fillna(df.groupby(['column-1', 'column-2'])['column_with_NaN'].transform('median'))
    # 'column-1' = this 1st column name must have some relavance to 'column_with_NaN'
    # 'column-2' = this 2nd column name must have some relavance to 'column_with_NaN'
        # REMINDER:  additional column names to groupby reduces the groupings size when calculating either the 'median' or 'mean'
            # Thus the chance the reductions of NaN may not be filled in with `.transform('median')`

vehicles_df['model_year'] = vehicles_df['model_year'].fillna(vehicles_df.groupby(['model'])['model_year'].transform('median'))

In [22]:
# SANITY CHECK: after filling in missing values in 'model_year' column
vehicles_df.isna().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

### <span style="color: green;">Fill Missing Value: `'odometer'` column</span>


In [23]:
# EDA: using `.describe()` on `odometer` column
vehicles_df['odometer'].describe()

count     43633.000000
mean     115553.461738
std       65094.611341
min           0.000000
25%       70000.000000
50%      113000.000000
75%      155000.000000
max      990000.000000
Name: odometer, dtype: float64

In [24]:
# SANITY CHECK: before filling in missing values in 'odometer' column
vehicles_df.isna().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer        7892
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [25]:
# MISSING VALUES: replace NaN with 'mean' odometer calculation in 'odometer' column
    # use df['column_with_NaN'] = df.fillna(df.groupby(['column-1', 'column-2'])['column_with_NaN'].transform('median'))
    # 'column-1' = this 1st column name must have some relavance to 'column_with_NaN'
    # 'column-2' = this 2nd column name must have some relavance to 'column_with_NaN'
        # REMINDER:  additional column names to groupby reduces the groupings size when calculating either the 'median' or 'mean'
            # Thus the chance the reductions of NaN may not be filled in with `.transform('median')`

vehicles_df['odometer'] = vehicles_df['odometer'].fillna(vehicles_df.groupby(['model', 'model_year'])['odometer'].transform('mean'))

In [26]:
# SANITY CHECK: after filling in missing values in 'odometer' column
vehicles_df.isna().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer          79
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [27]:
# REFERENCE CODE:  use `groupby()` and `transform()` to fill in missing values

#df['value'] = df['value'].fillna(df.groupby('name')['value'].transform('mean'))

### <span style="color: green;">Fill Missing Value: `'cylinders'` column</span>

In [28]:
# SANITY CHECK: before filling in missing values in 'cylinders' column
vehicles_df.isna().sum()

price              0
model_year         0
model              0
condition          0
cylinders       5260
fuel               0
odometer          79
transmission       0
type               0
paint_color        0
is_4wd             0
date_posted        0
days_listed        0
make               0
model_name         0
dtype: int64

In [29]:
# MISSING VALUES: replace NaN with 'median' cylinder calculation in 'cylinders' column
    # use df['column_with_NaN'] = df.fillna(df.groupby(['column-1', 'column-2'])['column_with_NaN'].transform('median'))
    # 'column-1' = this 1st column name must have some relavance to 'column_with_NaN'
    # 'column-2' = this 2nd column name must have some relavance to 'column_with_NaN'
        # REMINDER:  additional column names to groupby reduces the groupings size when calculating either the 'median' or 'mean'
            # Thus the chance the reductions of NaN may not be filled in with `.transform('median')`

vehicles_df['cylinders'] = vehicles_df['cylinders'].fillna(vehicles_df.groupby(['model', 'model_year'])['cylinders'].transform('median'))

In [30]:
# SANITY CHECK: after filling in missing values in 'cylinders' column
display(vehicles_df.isna().sum())
print()

vehicles_df.info()

price            0
model_year       0
model            0
condition        0
cylinders       23
fuel             0
odometer        79
transmission     0
type             0
paint_color      0
is_4wd           0
date_posted      0
days_listed      0
make             0
model_name       0
dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    51525 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     51502 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      51446 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   51525 non-null  object 
 10  is_4wd        51525 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  make          51525 non-null  object 
 14  model_name    51525 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 5.9+ MB


In [31]:
# EDA: review rows with missing values

display(vehicles_df[vehicles_df.isna().any(axis=1)])
#display(vehicles_df[vehicles_df.isna().any(axis=1)].head(60))
#display(vehicles_df[vehicles_df.isna().any(axis=1)].tail(32))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
24,3950,2009.0,chrysler 200,excellent,4.0,gas,,automatic,sedan,red,0.0,2018-06-11,40,chrysler,200
42,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2019-01-15,16,mercedes-benz,benze sprinter 2500
1101,9200,1975.0,ford f-150,excellent,,gas,,automatic,truck,green,0.0,2018-08-28,40,ford,f-150
1642,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2018-12-04,36,mercedes-benz,benze sprinter 2500
2232,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2018-08-23,70,mercedes-benz,benze sprinter 2500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48012,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2018-10-14,19,mercedes-benz,benze sprinter 2500
48483,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2018-08-19,27,mercedes-benz,benze sprinter 2500
50062,1695,1991.0,jeep cherokee,fair,6.0,gas,,automatic,SUV,white,1.0,2018-05-25,16,jeep,cherokee
50547,34900,2013.0,mercedes-benz benze sprinter 2500,excellent,6.0,diesel,,automatic,van,black,0.0,2018-12-28,49,mercedes-benz,benze sprinter 2500


### <span style="color: green;">Drop Missing Value: `'odometer'` & `'cylinders'` column</span>

<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- Drop rows with remaining Missing Values unable to either fill in or replace
    - `'cylinders'` (counts=23)
    - `'odometer'` (counts=79)
- created new dataframe called `clean_VEH_df`
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- use `clean_VEH_df` moving forward
- convert the following columns to datetime datatype
    - `'model_year'` = float64
    - `'date_posted'` = object
- <>


In [32]:
# MISSING VALUES: drop rows with missing values and create new vehicles.df called clean_VEH_df
clean_VEH_df = vehicles_df.dropna(subset=['cylinders', 'odometer'])


# SANITY CHECK: verify all missing values were filled/replaced/removed
clean_VEH_df.info()
clean_VEH_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 51426 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51426 non-null  int64  
 1   model_year    51426 non-null  float64
 2   model         51426 non-null  object 
 3   condition     51426 non-null  object 
 4   cylinders     51426 non-null  float64
 5   fuel          51426 non-null  object 
 6   odometer      51426 non-null  float64
 7   transmission  51426 non-null  object 
 8   type          51426 non-null  object 
 9   paint_color   51426 non-null  object 
 10  is_4wd        51426 non-null  float64
 11  date_posted   51426 non-null  object 
 12  days_listed   51426 non-null  int64  
 13  make          51426 non-null  object 
 14  model_name    51426 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 6.3+ MB


price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
make            0
model_name      0
dtype: int64

### <span style="color: green;">Convert `'model_year'` and `'date_posted'` DataType</span>

In [41]:
# FE: 

# 'model_year' column convert datatype from float64 to int32 to get 4-digit year
clean_VEH_df['model_year'] = clean_VEH_df['model_year'].astype(int)

# 'date_posted' column convert value datatype from object to datetime 
clean_VEH_df['date_posted'] = pd.to_datetime(clean_VEH_df['date_posted'], format='%Y-%m-%d')


clean_VEH_df.info()
print('/n/n')
display(clean_VEH_df.isna().sum())
print()
display(clean_VEH_df)

<class 'pandas.core.frame.DataFrame'>
Index: 51426 entries, 0 to 51524
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         51426 non-null  int64         
 1   model_year    51426 non-null  int32         
 2   model         51426 non-null  object        
 3   condition     51426 non-null  object        
 4   cylinders     51426 non-null  float64       
 5   fuel          51426 non-null  object        
 6   odometer      51426 non-null  float64       
 7   transmission  51426 non-null  object        
 8   type          51426 non-null  object        
 9   paint_color   51426 non-null  object        
 10  is_4wd        51426 non-null  float64       
 11  date_posted   51426 non-null  datetime64[ns]
 12  days_listed   51426 non-null  int64         
 13  make          51426 non-null  object        
 14  model_name    51426 non-null  object        
dtypes: datetime64[ns](1), float64(3), int32(1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_VEH_df['model_year'] = clean_VEH_df['model_year'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_VEH_df['date_posted'] = pd.to_datetime(clean_VEH_df['date_posted'], format='%Y-%m-%d')


price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
make            0
model_name      0
dtype: int64




Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,make,model_name
0,9400,2011,bmw x5,good,6.0,gas,145000.000000,automatic,SUV,white,1.0,2018-06-23,19,bmw,x5
1,25500,2011,ford f-150,good,6.0,gas,88705.000000,automatic,pickup,white,1.0,2018-10-19,50,ford,f-150
2,5500,2013,hyundai sonata,like new,4.0,gas,110000.000000,automatic,sedan,red,0.0,2019-02-07,79,hyundai,sonata
3,1500,2003,ford f-150,fair,8.0,gas,169240.688312,automatic,pickup,white,0.0,2019-03-22,9,ford,f-150
4,14900,2017,chrysler 200,excellent,4.0,gas,80903.000000,automatic,sedan,black,0.0,2019-04-02,28,chrysler,200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013,nissan maxima,like new,6.0,gas,88136.000000,automatic,sedan,black,0.0,2018-10-03,37,nissan,maxima
51521,2700,2002,honda civic,salvage,4.0,gas,181500.000000,automatic,sedan,white,0.0,2018-11-14,22,honda,civic
51522,3950,2009,hyundai sonata,excellent,4.0,gas,128000.000000,automatic,sedan,blue,0.0,2018-11-15,32,hyundai,sonata
51523,7455,2013,toyota corolla,good,4.0,gas,139573.000000,automatic,sedan,black,0.0,2018-07-02,71,toyota,corolla


In [34]:
# ALTERNATIVE OPTION: 'model_year' column converts datatype from float64 to datetime
# Note: default datetime format is YYYY-MM-DD (i.e. 2024-01-01) 

#clean_VEH_df['model_year'] = pd.to_datetime(clean_VEH_df['model_year'], format='%Y')


# creates a new column titled year from model_year column to display 4-digit year (dtype=int64)
#clean_VEH_df['year'] = clean_VEH_df['model_year'].dt.year


#clean_VEH_df.info()
#print()
#display(clean_VEH_df)

## <span style="color: red;">TEST CODES (DID NOT USE)</span>

In [35]:
# EDA: 

# view rows pertaining to bmw 
#bmw_df = vehicles_df.query("make == 'bmw'")[['model', 'model_year', 'cylinders', 'odometer', 'type', 'make', 'model_name']]

# sort dataframe by model year then by model then by cylinder in ascending order
#sort_bmw_df = bmw_df.sort_values(by=['model_year', 'model', 'cylinders'], ascending=[True, True, True])

#display(bmw_df.head(60))
#display(sort_bmw_df.head(60))

In [36]:
# groupby

#group_vehicles = vehicles_df.groupby(['model', 'model_year', 'cylinders'])['model_name'].count()

#group_bmw = bmw_df.groupby(['model', 'model_year', 'cylinders'])['model_name'].count()

#display(group_bmw)


In [37]:
# convert groupby() into dataframe

#group_bmw_df = group_bmw.reset_index()

#display(group_bmw_df)

In [38]:
# pull all rows containing the specific 'model_year' column

#specific_model = 

#filtered_models = vehicles_df.loc[vehicles_df['model']]


In [39]:
# Reference Code - double check the `model` column


# SANTIY CHECK: Spot Check for specific user ID's total monthly call counts = total 2018 call count


# Replace with the user_id you want to filter
#specific_user_id = 1498  

# pull all rows containing the specific user ID
#filtered_df_call_counts = df_user_monthly_call_count.loc[df_user_monthly_call_count['user_id'] == specific_user_id]

# sums the sepcific user ID's monthly call counts
#user_total_call_count = filtered_df_call_counts['total_calls'].sum()

# notice this equals the User ID's total call counts in 2018
#print(f"User ID: {specific_user_id} total call count in 2018 is {user_total_call_count}.")


# Display the filtered dataframe
#display(filtered_df_call_counts)