In [1]:
# importing pandas
import pandas as pd

# importing numpy
import numpy as np

In [2]:
# reading the file and storing it to df
df = pd.read_csv('/Users/user/Downloads/vehicles_us.csv')

In [3]:
# obtaining general information about the data in df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [4]:
# the list of column names in the df table
print(df.columns)

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')


In [5]:
# calculating missing values
print(df.isna().sum())

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64


In [6]:
# Assuming 'df' is your DataFrame
columns_to_replace = ['condition','paint_color', 'cylinders']

# Replace missing values with 'unknown' using transform()
df[columns_to_replace] = df[columns_to_replace].transform(lambda x: x.fillna('unknown'))

# Display the updated DataFrame
print(df)

       price  model_year           model  condition cylinders fuel  odometer  \
0       9400      2011.0          bmw x5       good       6.0  gas  145000.0   
1      25500         NaN      ford f-150       good       6.0  gas   88705.0   
2       5500      2013.0  hyundai sonata   like new       4.0  gas  110000.0   
3       1500      2003.0      ford f-150       fair       8.0  gas       NaN   
4      14900      2017.0    chrysler 200  excellent       4.0  gas   80903.0   
...      ...         ...             ...        ...       ...  ...       ...   
51520   9249      2013.0   nissan maxima   like new       6.0  gas   88136.0   
51521   2700      2002.0     honda civic    salvage       4.0  gas  181500.0   
51522   3950      2009.0  hyundai sonata  excellent       4.0  gas  128000.0   
51523   7455      2013.0  toyota corolla       good       4.0  gas  139573.0   
51524   6300      2014.0   nissan altima       good       4.0  gas       NaN   

      transmission    type paint_color 

In [7]:
# Replace 'unknown' values with NaN
df['model_year'] = pd.to_numeric(df['model_year'], errors='coerce')
df['odometer'] = pd.to_numeric(df['odometer'], errors='coerce')

In [8]:
# Convert 'model_year' and 'odometer' columns to integers, treating NaN values as 0
df['model_year'] = df['model_year'].fillna(0).astype(int)
df['odometer'] = df['odometer'].fillna(0).astype(int)

# Display the updated DataFrame
print(df.head())

   price  model_year           model  condition cylinders fuel  odometer  \
0   9400        2011          bmw x5       good       6.0  gas    145000   
1  25500           0      ford f-150       good       6.0  gas     88705   
2   5500        2013  hyundai sonata   like new       4.0  gas    110000   
3   1500        2003      ford f-150       fair       8.0  gas         0   
4  14900        2017    chrysler 200  excellent       4.0  gas     80903   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV     unknown     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup     unknown     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  


In [9]:
# Convert 'is_4wd' to string type (if not already)
df['is_4wd'] = df['is_4wd'].astype(str)

# Replace '1.0' with 'yes' and 'nan' with 'no'
df['is_4wd'] = np.where(df['is_4wd'] == '1.0', 'yes', np.where(df['is_4wd'] == 'nan', 'no', df['is_4wd']))

# Display the updated DataFrame
print(df.head())

   price  model_year           model  condition cylinders fuel  odometer  \
0   9400        2011          bmw x5       good       6.0  gas    145000   
1  25500           0      ford f-150       good       6.0  gas     88705   
2   5500        2013  hyundai sonata   like new       4.0  gas    110000   
3   1500        2003      ford f-150       fair       8.0  gas         0   
4  14900        2017    chrysler 200  excellent       4.0  gas     80903   

  transmission    type paint_color is_4wd date_posted  days_listed  
0    automatic     SUV     unknown    yes  2018-06-23           19  
1    automatic  pickup       white    yes  2018-10-19           50  
2    automatic   sedan         red     no  2019-02-07           79  
3    automatic  pickup     unknown     no  2019-03-22            9  
4    automatic   sedan       black     no  2019-04-02           28  


In [10]:
# calculating missing values
print(df.isna().sum())

price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
dtype: int64


In [11]:
# counting clear duplicates
print(df.duplicated().sum()) 

0


In [12]:
# viewing unique model names
print(df['model'].unique())

['bmw x5' 'ford f-150' 'hyundai sonata' 'chrysler 200' 'chrysler 300'
 'toyota camry' 'honda pilot' 'kia sorento' 'chevrolet silverado 1500'
 'honda accord' 'ram 1500' 'gmc yukon' 'jeep cherokee'
 'chevrolet traverse' 'hyundai elantra' 'chevrolet tahoe' 'toyota rav4'
 'chevrolet silverado' 'jeep wrangler' 'chevrolet malibu' 'ford fusion se'
 'chevrolet impala' 'chevrolet corvette' 'jeep liberty' 'toyota camry le'
 'nissan altima' 'subaru outback' 'toyota highlander' 'dodge charger'
 'toyota tacoma' 'chevrolet equinox' 'nissan rogue'
 'mercedes-benz benze sprinter 2500' 'honda cr-v' 'jeep grand cherokee'
 'toyota 4runner' 'ford focus' 'honda civic' 'kia soul'
 'chevrolet colorado' 'ford f150 supercrew cab xlt'
 'chevrolet camaro lt coupe 2d' 'chevrolet cruze' 'ford mustang'
 'chevrolet silverado 3500hd' 'nissan frontier crew cab sv'
 'subaru impreza' 'jeep grand cherokee laredo' 'nissan versa'
 'ford f-250 sd' 'chevrolet silverado 1500 crew' 'ford f250 super duty'
 'chevrolet camaro' 'f

In [14]:
# function for replacing implicit duplicates
def replace_wrong_models(wrong_models, correct_model):
    for wrong_model in wrong_models:
        df['model'] = df['model'].str.strip().str.lower().replace(wrong_model.lower(), correct_model.lower())

In [15]:
# removing implicit duplicates
replace_wrong_models(['ford f-250'], 'ford f250') 

replace_wrong_models(['ford f-250 super duty'], 'ford f250 super duty') 

In [16]:
# viewing updated unique model names
print(df['model'].unique())

['bmw x5' 'ford f-150' 'hyundai sonata' 'chrysler 200' 'chrysler 300'
 'toyota camry' 'honda pilot' 'kia sorento' 'chevrolet silverado 1500'
 'honda accord' 'ram 1500' 'gmc yukon' 'jeep cherokee'
 'chevrolet traverse' 'hyundai elantra' 'chevrolet tahoe' 'toyota rav4'
 'chevrolet silverado' 'jeep wrangler' 'chevrolet malibu' 'ford fusion se'
 'chevrolet impala' 'chevrolet corvette' 'jeep liberty' 'toyota camry le'
 'nissan altima' 'subaru outback' 'toyota highlander' 'dodge charger'
 'toyota tacoma' 'chevrolet equinox' 'nissan rogue'
 'mercedes-benz benze sprinter 2500' 'honda cr-v' 'jeep grand cherokee'
 'toyota 4runner' 'ford focus' 'honda civic' 'kia soul'
 'chevrolet colorado' 'ford f150 supercrew cab xlt'
 'chevrolet camaro lt coupe 2d' 'chevrolet cruze' 'ford mustang'
 'chevrolet silverado 3500hd' 'nissan frontier crew cab sv'
 'subaru impreza' 'jeep grand cherokee laredo' 'nissan versa'
 'ford f-250 sd' 'chevrolet silverado 1500 crew' 'ford f250 super duty'
 'chevrolet camaro' 'f

In [17]:
# Save DataFrame to a CSV file
df.to_csv('/Users/user/Downloads/vehicles_upd.csv', index=False)  # Set index=False to avoid writing row indices