# Used Cars Dataset Analysis

### The aim of this project is to clean the data and analyze the included used car listings

In [77]:
import pandas as pd 
import numpy as np

autos = pd.read_csv('autos.csv', encoding = 'Latin-1')

In [58]:
#autos.info()
#autos.head(3)

#### Change column names from CamelCase to Snake_case and shorten longer column names if possible

In [78]:
corrections = {
'dateCrawled': 'date_crawled',
'name' : 'name',
'seller' : 'seller', 
'offerType' : 'offer_type',
'price' : 'price',
'abtest' : 'abtest',
'vehicleType' : 'vehicle_type',
'yearOfRegistration' : 'registration_year',
'gearbox' : 'gearbox',
'powerPS' : 'power_ps', 
'model' : 'model',
'odometer' : 'odometer',
'monthOfRegistration' : 'registration_month',
'fuelType' : 'fuel_type',
'brand' : 'brand',
'notRepairedDamage' : 'unrepaired_damage',
'dateCreated' : 'ad_created',
'nrOfPictures' : 'nr_of_pictures',
'postalCode' : 'postal_code',
'lastSeen' : 'last_seen'
}

autos.columns = autos.columns.map(corrections)


In [83]:
autos.describe()

Unnamed: 0,price,registration_year,power_ps,odometer_km,registration_month,nr_of_pictures,postal_code,year
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,9840.044,2005.07328,116.35592,125732.7,5.72336,0.0,50813.6273,2005.07328
std,481104.4,105.712813,209.216627,40042.211706,3.711984,0.0,25779.747957,105.712813
min,0.0,1000.0,0.0,5000.0,0.0,0.0,1067.0,1000.0
25%,1100.0,1999.0,70.0,125000.0,3.0,0.0,30451.0,1999.0
50%,2950.0,2003.0,105.0,150000.0,6.0,0.0,49577.0,2003.0
75%,7200.0,2008.0,150.0,150000.0,9.0,0.0,71540.0,2008.0
max,100000000.0,9999.0,17700.0,150000.0,12.0,0.0,99998.0,9999.0


#### Can see that no cars have pictures so we can remove this column. Max power_ps needs to be looked at ( suspiciously high) along with max registration year. Price and Odometer columns also need to be cleaned.

In [79]:
autos['price'] = autos['price'].str.replace('$','').str.replace(',','').astype(int)
autos['odometer'] = autos['odometer'].str.replace('km','').str.replace(',','').astype(int)
autos.rename({'odometer':'odometer_km'}, axis = 1, inplace = True)
autos['year'] = autos['registration_year'].astype(int)

#### The price and odometer columns have been cleaned, now to remove outliers.


In [92]:
odometer_km_value = autos['odometer_km'].unique()

# odometer values look reasonable, will not remove any

price_values = autos['price'].value_counts().sort_index(ascending=True).head(10)
print(price_values)

# looking at price data, the prices vary greatly, I will remove any car 
# less than $100 or more than $150,000.

autos = autos[autos["price"].between(100,150000)]



0     1421
1      156
2        3
3        1
5        2
8        1
9        1
10       7
11       2
12       3
Name: price, dtype: int64


In [95]:
registration_years = autos['registration_year'].unique()

# some impossible years found, set limits from 
# earliest faesible year to present year.

autos = autos[autos["registration_year"].between(1900,2020)]
# irregular years successfully removed

# some strange horsepower values have been found in power_ps. 
# while a car may be a display car and hence have 0 power_ps, 
# it is unlikely to have 2 BHP. I will remove some irregularities.

autos = autos[autos["power_ps"].between(100,200000)]


In [105]:
autos['date_crawled'] = autos['date_crawled'].str[:10]
date_crawled = autos['date_crawled'].value_counts(normalize = True, dropna = False).sort_index()
autos['ad_created'] = autos['ad_created'].str[:10]
ad_created = autos['ad_created'].value_counts(normalize = True, dropna = False).sort_index()
autos['last_seen'] = autos['last_seen'].str[:10]
last_seen = autos['last_seen'].value_counts(normalize = True, dropna = False).sort_index()

# remove times from the timepoint columns

### Mean Price per Brand

In [191]:
brands = autos['brand']
brands_distinct = brands.value_counts().index[:20]

# Chosen top 20 most popular brands to look at

agg_price_data = {}
for brand in brands_distinct:
    sum_price = sum(autos[autos['brand'] == brand]['price'])
    brand_count = brands.value_counts()[:20][brand]
    mean_price = sum_price/ brand_count
    agg_price_data[brand] = int(mean_price)

# most expensive car to buy is a porsch, cheapest is a fiat

### Mean Mileage per Brand

In [192]:
brands = autos['brand']
brands_distinct = brands.value_counts().index[:20]

# Chosen top 20 most popular brands to look at

agg_mileage_data = {}
for brand in brands_distinct:
    sum_mileage = sum(autos[autos['brand'] == brand]['odometer_km'])
    brand_count = brands.value_counts()[:20][brand]
    mean_mileage = sum_mileage/ brand_count
    agg_mileage_data[brand] = int(mean_mileage)
    



### New Dataframe

In [195]:
mileage_data = pd.Series(agg_mileage_data)
new_df = pd.DataFrame(mileage_data, columns = ['avg_mileage'])
new_df['avg_price'] = pd.Series(agg_price_data)
print(new_df)

# Created new df from calculated values

               avg_mileage  avg_price
volkswagen          126048       8061
bmw                 132740       8511
mercedes_benz       130064       9517
audi                129114       9874
opel                128927       4226
ford                127922       5280
renault             128470       4016
peugeot             124557       4433
mazda               124102       5275
skoda               113665       8454
seat                120012       6763
volvo               139164       5121
nissan              114159       7812
mini                 85910      11883
alfa_romeo          132052       4112
toyota              118722       6816
fiat                123768       3787
citroen             123458       5288
porsche             101035      40325
hyundai             110041       6983
