# Initialization

In [123]:
# Loading libraries

import pandas as pd
import numpy as np
import plotly.express as px
from IPython.display import display


## Load Data

In [124]:
# load *.csv file into dataframes

try:
    vehicles_df = pd.read_csv('vehicles_us.csv')
except:
    vehicles_df = pd.read_csv('https://practicum-content.s3.us-west-1.amazonaws.com/datasets/vehicles_us.csv')


# EDA - Exploratory Data Analysis

<span style="color: blue;"><u>**OBJECTIVE**</u></span>
1. Review uploaded `vehicles_us.csv` file
2. Check for missing values/Nulls, NaN (*Not a Number, etc.*)

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- 51525 row with 13 columns
- missing values, nulls, & NaNs found in the following (5) attributes:
    1. `model_year` (3619)
    2. `cylinders` (5260)
    3. `odometer` (7892)
    4. `paint_color` (9267)
    5. `is_4wd` (25953)
- `is_4wd` column is boolean logic (1 = Y vs. 0 = N)
    - NaN found and may need to determine if these will be zero (0 = No)
- found several ford model names that can be consolidated
    - `ford f-150` (2796) with `ford f150` (530)
    - `ford f-250` (422) with `ford f250`	(339)
    - keep `f-###` model name format 
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- will need to parse out the manufacturer and vehicle model from `model` column
    - *see code shared by Jester* 
- `date_posted` and `model_year` column as `object` datatype
    - convert to `datetime` datatype
- `model_year` column as `float` datatype
    - convert to `datetime` datatype for ease of data visualization
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>



In [146]:
# EDA - displays rows with missing values

display(vehicles_df[vehicles_df.isna().any(axis=1)].head(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,3000,2010.0,nissan versa,like new,4.0,gas,200412.0,automatic,hatchback,silver,,2018-09-23,33
77,4200,2012.0,ford focus,like new,4.0,gas,144273.0,automatic,sedan,silver,,2018-06-11,19
80,26900,2014.0,chevrolet silverado 1500 crew,good,8.0,gas,68607.0,automatic,pickup,,1.0,2018-09-08,10
81,5900,2000.0,ford f250 super duty,good,8.0,diesel,202752.0,automatic,pickup,red,,2018-07-08,32


<span style="color: red;"><u>**Note-2-Self**</u></span>


Read/Review the use  `fillna()`  `mode` parameter for fill-in `'model_year'`

`'odometer'` - take average to fill-in NaN

`'is_4wd'` fill-in with zero = No




In [125]:
# EDA - preview data

display(vehicles_df.sample(50))


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
6824,12900,2010.0,ford f-150,good,,gas,107636.0,automatic,pickup,black,1.0,2018-10-27,39
40383,27999,2014.0,gmc sierra 1500,like new,8.0,gas,52000.0,automatic,truck,white,1.0,2018-11-09,13
42431,5500,2001.0,toyota 4runner,good,6.0,gas,208913.0,automatic,SUV,,1.0,2018-07-17,20
49107,2100,2006.0,nissan altima,good,4.0,gas,167000.0,automatic,sedan,grey,,2018-10-15,54
6819,2500,1999.0,jeep cherokee,good,6.0,gas,253108.0,automatic,SUV,white,1.0,2019-01-29,45
33770,17500,2016.0,ford f-150,like new,,gas,41500.0,automatic,pickup,black,1.0,2018-05-19,7
12389,8995,2011.0,nissan rogue,excellent,4.0,gas,74517.0,automatic,SUV,white,1.0,2019-02-19,29
23095,8600,2015.0,kia soul,excellent,4.0,gas,54510.0,automatic,SUV,white,,2019-03-03,35
26297,17995,,ram 1500,excellent,,gas,126787.0,automatic,pickup,white,1.0,2019-04-18,58
47017,12998,2010.0,ford f-150,good,8.0,gas,126729.0,automatic,truck,silver,1.0,2019-03-30,102


In [141]:
ford_models = vehicles_df[vehicles_df['model'].isna()]

display(ford_models.head(60))

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed


In [140]:
# EDA - see row with NaN

rows_with_nan = vehicles_df[vehicles_df['model_year'].isna()]

display(rows_with_nan.head(60))


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
20,6990,,chevrolet tahoe,excellent,8.0,gas,147485.0,automatic,SUV,silver,1.0,2018-08-05,28
65,12800,,ford f-150,excellent,6.0,gas,108500.0,automatic,pickup,white,,2018-09-23,15
69,7800,,ford f-150,like new,8.0,gas,97510.0,automatic,truck,white,1.0,2019-02-20,39
72,3650,,subaru impreza,excellent,,gas,74000.0,automatic,sedan,blue,1.0,2018-08-07,60
84,4995,,hyundai elantra,like new,4.0,gas,151223.0,automatic,sedan,custom,,2018-09-15,1
102,10800,,chevrolet colorado,excellent,5.0,gas,114922.0,automatic,truck,red,1.0,2018-06-23,35
114,23700,,nissan frontier crew cab sv,good,6.0,gas,12901.0,other,pickup,black,1.0,2018-09-21,8
116,25300,,chevrolet camaro lt coupe 2d,good,6.0,gas,3568.0,other,coupe,,,2018-06-16,34
144,8995,,chevrolet silverado 1500,excellent,8.0,gas,119726.0,automatic,truck,grey,1.0,2019-03-18,27


In [126]:
# EDA - check for nulls/NaN/missing values

# view & identify dataframe attributes with null/missing values
vehicles_df.info()
print('\n\n')

# counts of missing values, nulls, NaNs
print('Missing Value / Null / NaN Counts')
display(vehicles_df.isna().sum())
 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB



Missing Value / Null / NaN Counts


price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64

In [127]:
# EDA - view the unique value and counts in the following columns

display(vehicles_df['model'].value_counts(sort=True))
print()

display(vehicles_df['condition'].value_counts(sort=True))
print()

display(vehicles_df['transmission'].value_counts(sort=False))
print()

display(vehicles_df['type'].value_counts(sort=True))
print()

display(vehicles_df['paint_color'].value_counts())
print()

display(vehicles_df['is_4wd'].value_counts(sort=True))
print()


model
ford f-150                           2796
chevrolet silverado 1500             2171
ram 1500                             1750
chevrolet silverado                  1271
jeep wrangler                        1119
                                     ... 
ford f-250 super duty                 241
acura tl                              236
kia sorento                           236
nissan murano                         235
mercedes-benz benze sprinter 2500      41
Name: count, Length: 100, dtype: int64




condition
excellent    24773
good         20145
like new      4742
fair          1607
new            143
salvage        115
Name: count, dtype: int64




transmission
automatic    46902
manual        2829
other         1794
Name: count, dtype: int64




type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64




paint_color
white     10029
black      7692
silver     6244
grey       5037
blue       4475
red        4421
green      1396
brown      1223
custom     1153
yellow      255
orange      231
purple      102
Name: count, dtype: int64




is_4wd
1.0    25572
Name: count, dtype: int64




In [128]:
# EDA - view the unique value and counts in the 'type' column

display(vehicles_df['type'].value_counts(sort=True))

type
SUV            12405
truck          12353
sedan          12154
pickup          6988
coupe           2303
wagon           1541
mini-van        1161
hatchback       1047
van              633
convertible      446
other            256
offroad          214
bus               24
Name: count, dtype: int64

In [149]:
# EDA - view all the unique value and counts in the 'model_year' column
# converted into a dataframe in order to view the entire (68) unique values

model_year_counts_df = (vehicles_df["model_year"].value_counts(sort=False)).reset_index()

model_year_counts_df.columns = ['model_year', 'count']

model_year_df_sorted = model_year_df.sort_values(by='model_year')

display(model_year_counts_df.head(60))


NameError: name 'model_year_df' is not defined

In [148]:
# EDA - view all the unique value and counts in the 'model' column
# converted into a dataframe in order to view the entire (100) unique values
# found several ford model names that can be consolidated i.e. 'ford f250' with 'ford f-250'

model_counts_df = (vehicles_df['model'].value_counts()).reset_index()

#vehicles_df['model'].value_counts(sort=True)

model_counts_df.columns = ['model', 'count']

# sort `model` column by alphabectical order
model_counts_df_sorted = model_counts_df.sort_values(by='model')


bdisplay(model_counts_df_sorted.tail(60))


Unnamed: 0,model,count
54,ford f250 super duty,370
90,ford f350,250
91,ford f350 super duty,246
16,ford focus,754
52,ford focus se,376
27,ford fusion,544
70,ford fusion se,309
19,ford mustang,681
74,ford mustang gt coupe 2d,301
45,ford ranger,423




export large output greater than 120 onto `.csv` or `.txt` or `.xlsx` 

In [150]:
# TEST CODE - to view all rows output that is less than 120 rows

display(model_counts_df_sorted.head(60))


display(model_counts_df_sorted.tail(60))

Unnamed: 0,model,count
96,acura tl,236
85,bmw x5,267
84,buick enclave,271
65,cadillac escalade,322
49,chevrolet camaro,414
69,chevrolet camaro lt coupe 2d,311
79,chevrolet colorado,286
31,chevrolet corvette,499
40,chevrolet cruze,457
25,chevrolet equinox,591


Unnamed: 0,model,count
54,ford f250 super duty,370
90,ford f350,250
91,ford f350 super duty,246
16,ford focus,754
52,ford focus se,376
27,ford fusion,544
70,ford fusion se,309
19,ford mustang,681
74,ford mustang gt coupe 2d,301
45,ford ranger,423


## Remove Missing Value, Null, and NaN (*Not a Number*)

<span style="color: green;"><u>**Steps / Action Taken**</u></span>
- <>

<br>

<span style="color: darkorange;"><u>**Findings / Observations**</u></span>
- <>

<br>

<span style="color: red;"><u>**Note-2-Self**</u></span>
- will need to parse out the manufacturer and vehicle model from `model` column
    - *see code shared by Jester* 
- for now...okay to keep the `date_posted` column as `object` datatype
    - will determine and return if needed to change to `datetime` datatype
- may want to clean the `model_year` column to be 4-digit year (YYYY)
- <>


In [131]:
# pull all rows containing the specific 'model_year' column

#specific_model = 

filtered_models = vehicles_df.loc[vehicles_df['model']]


KeyError: "None of [Index(['bmw x5', 'ford f-150', 'hyundai sonata', 'ford f-150', 'chrysler 200',\n       'chrysler 300', 'toyota camry', 'honda pilot', 'kia sorento',\n       'honda pilot',\n       ...\n       'chevrolet silverado 2500hd', 'jeep grand cherokee', 'dodge charger',\n       'ford taurus', 'honda accord', 'nissan maxima', 'honda civic',\n       'hyundai sonata', 'toyota corolla', 'nissan altima'],\n      dtype='object', length=51525)] are in the [index]"

In [None]:
# Reference Code - double check the `model` column


# SANTIY CHECK: Spot Check for specific user ID's total monthly call counts = total 2018 call count


# Replace with the user_id you want to filter
#specific_user_id = 1498  

# pull all rows containing the specific user ID
#filtered_df_call_counts = df_user_monthly_call_count.loc[df_user_monthly_call_count['user_id'] == specific_user_id]

# sums the sepcific user ID's monthly call counts
#user_total_call_count = filtered_df_call_counts['total_calls'].sum()

# notice this equals the User ID's total call counts in 2018
#print(f"User ID: {specific_user_id} total call count in 2018 is {user_total_call_count}.")


# Display the filtered dataframe
#display(filtered_df_call_counts)

In [None]:
# CLEANING DATA - remove NaN values in `model_year` column




In [None]:
# Feature Engineering - parse manufacturer and model name in 'model' column
# code to reference to parse manufacturer and model
# use later.  Need to clean.
# remove null / Nan / missing values


#vehicles_df[['make', 'model']] = vehicles_df['model'].str.split(' ', n=1, expand=True)