In [1]:
# importing pandas
import pandas as pd

import numpy as np

In [2]:
# reading the file and storing it to df
df = pd.read_csv('/Users/user/Downloads/vehicles_us.csv')

In [3]:
# obtaining general information about the data in df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [4]:
# the list of column names in the df table
print(df.columns)

Index(['price', 'model_year', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'transmission', 'type', 'paint_color', 'is_4wd',
       'date_posted', 'days_listed'],
      dtype='object')


In [5]:
# calculating missing values
print(df.isna().sum())

price               0
model_year       3619
model               0
condition           0
cylinders        5260
fuel                0
odometer         7892
transmission        0
type                0
paint_color      9267
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64


In [6]:
# Assuming 'df' is your DataFrame
columns_to_replace = ['condition','paint_color', 'cylinders']

# Replace missing values with 'unknown' using transform()
df[columns_to_replace] = df[columns_to_replace].transform(lambda x: x.fillna('unknown'))

# Display the updated DataFrame
print(df)

       price  model_year           model  condition cylinders fuel  odometer  \
0       9400      2011.0          bmw x5       good       6.0  gas  145000.0   
1      25500         NaN      ford f-150       good       6.0  gas   88705.0   
2       5500      2013.0  hyundai sonata   like new       4.0  gas  110000.0   
3       1500      2003.0      ford f-150       fair       8.0  gas       NaN   
4      14900      2017.0    chrysler 200  excellent       4.0  gas   80903.0   
...      ...         ...             ...        ...       ...  ...       ...   
51520   9249      2013.0   nissan maxima   like new       6.0  gas   88136.0   
51521   2700      2002.0     honda civic    salvage       4.0  gas  181500.0   
51522   3950      2009.0  hyundai sonata  excellent       4.0  gas  128000.0   
51523   7455      2013.0  toyota corolla       good       4.0  gas  139573.0   
51524   6300      2014.0   nissan altima       good       4.0  gas       NaN   

      transmission    type paint_color 

In [7]:
# counting missing values
print(df.isna().sum())

price               0
model_year       3619
model               0
condition           0
cylinders           0
fuel                0
odometer         7892
transmission        0
type                0
paint_color         0
is_4wd          25953
date_posted         0
days_listed         0
dtype: int64


In [8]:
# Replace 'unknown' values with NaN
df['model_year'] = pd.to_numeric(df['model_year'], errors='coerce')
df['odometer'] = pd.to_numeric(df['odometer'], errors='coerce')

In [9]:
# Convert 'model_year' and 'odometer' columns to integers, treating NaN values as 0
df['model_year'] = df['model_year'].fillna(0).astype(int)
df['odometer'] = df['odometer'].fillna(0).astype(int)

# Display the updated DataFrame
print(df.head())

   price  model_year           model  condition cylinders fuel  odometer  \
0   9400        2011          bmw x5       good       6.0  gas    145000   
1  25500           0      ford f-150       good       6.0  gas     88705   
2   5500        2013  hyundai sonata   like new       4.0  gas    110000   
3   1500        2003      ford f-150       fair       8.0  gas         0   
4  14900        2017    chrysler 200  excellent       4.0  gas     80903   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV     unknown     1.0  2018-06-23           19  
1    automatic  pickup       white     1.0  2018-10-19           50  
2    automatic   sedan         red     NaN  2019-02-07           79  
3    automatic  pickup     unknown     NaN  2019-03-22            9  
4    automatic   sedan       black     NaN  2019-04-02           28  


In [None]:
# Convert 'is_4wd' to string type (if not already)
df['is_4wd'] = df['is_4wd'].astype(str)

# Define a function to conditionally replace missing values based on the condition
def fill_missing_values(x):
    if x == 'nan':
        return 'yes' if df['is_4wd'].str.contains('1.0').any() else 'no'
    else:
        return x

# Apply the function to the 'is_4wd' column using .apply()
df['is_4wd'] = df['is_4wd'].apply(lambda x: fill_missing_values(x))

# Display the updated DataFrame
print(df.head())

In [None]:
# counting clear duplicates
print(df.duplicated().sum()) 

In [None]:
# viewing unique model names
df = df.drop_duplicates().reset_index(drop=True)
df.sort_values(by = ['model'],inplace = True)
print(df['model'].unique())

In [None]:
# Calculating number of models released per year
grouped = df.groupby('model_year')['model'].count()

# Sort the grouped data in descending order by count
reversed_grouped = grouped.sort_values(ascending=False)

# Print the reversed grouped data
print(reversed_grouped)

In [None]:
# Save DataFrame to a CSV file
df.to_csv('/Users/user/Downloads/vehicles_upd.csv', index=False)  # Set index=False to avoid writing row indices