## **Load Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
warnings.filterwarnings('ignore')

# **Feature Engineering**

In [3]:
# NOTE(review): absolute, machine-specific path; the notebook only runs
# unchanged on a machine that has this exact directory layout.
df = pd.read_csv('C:/Users/PC/Desktop/Car-Price-Proyect/data/cars.csv')

In [4]:
class Intervals():
    """Print candidate outlier cut-offs of the form ``mean +/- k * std``.

    Parameters
    ----------
    feature : array-like
        Numeric values; handed to ``np.mean`` and ``np.std``.
    interval_range : iterable of numbers, optional
        Standard-deviation multipliers ``k`` to report. When omitted, the
        original fixed list ``[1.5, 2, 2.5, 3.0, 3.5, 4]`` is used.

    Attributes
    ----------
    mean, sd : the mean and standard deviation of ``feature``.
    interval_range : list of the multipliers that will be reported.
    """

    # Kept exactly as originally written (2 and 4 as ints, 3.0 as a float)
    # so the labels in the printed lines do not change.
    DEFAULT_INTERVAL_RANGE = (1.5, 2, 2.5, 3.0, 3.5, 4)

    def __init__(self, feature, interval_range=None):
        self.mean = np.mean(feature)
        # np.std is called with its default ddof=0 (population std).
        self.sd = np.std(feature)
        if interval_range is None:
            interval_range = self.DEFAULT_INTERVAL_RANGE
        self.interval_range = list(interval_range)

    def _print_bounds(self, sign):
        """Print one rounded bound per multiplier; ``sign`` is +1 or -1."""
        for interval in self.interval_range:
            bound = np.round(self.mean + sign * interval * self.sd)
            print(f"Interval range STD {interval}: {bound}")

    def Upper_Interval(self):
        """Print ``mean + k * std`` (rounded) for every multiplier ``k``."""
        self._print_bounds(1)

    def Lower_Interval(self):
        """Print ``mean - k * std`` (rounded) for every multiplier ``k``."""
        self._print_bounds(-1)

In [5]:
class IntervalFit(Intervals):
    """Thin subclass of ``Intervals``; it adds no behavior of its own."""

    def __init__(self, feature):
        # Hand the feature straight to the parent initializer.
        super().__init__(feature)

Using object-oriented programming, we create a class that prints candidate lower and upper limits (mean ± k standard deviations) for a continuous numeric variable.

### *Price*

In [6]:
df.price.describe()

count     64132.000000
mean      18404.653543
std       10855.717134
min         495.000000
25%       10930.000000
50%       15999.000000
75%       22980.000000
max      159999.000000
Name: price, dtype: float64

In [7]:
IntervalFit(df.price).Lower_Interval()

Interval range STD 1.5: 2121.0
Interval range STD 2: -3307.0
Interval range STD 2.5: -8734.0
Interval range STD 3.0: -14162.0
Interval range STD 3.5: -19590.0
Interval range STD 4: -25018.0


The 1.5-standard-deviation lower bound is about £2,121, so we will keep only vehicles priced above £2,122.

In [14]:
IntervalFit(df.price).Upper_Interval()

Interval range STD 1.5: 34688.0
Interval range STD 2: 40116.0
Interval range STD 2.5: 45544.0
Interval range STD 3.0: 50972.0
Interval range STD 3.5: 56399.0
Interval range STD 4: 61827.0


I am not convinced by the upper bound computed on the whole dataset. Instead, I will apply an upper price limit separately for each vehicle make.

In [15]:
df = df.query("price > 2122")

### *Mileage*

In [16]:
IntervalFit(df.mileage).Upper_Interval()

Interval range STD 1.5: 55113.0
Interval range STD 2: 65704.0
Interval range STD 2.5: 76295.0
Interval range STD 3.0: 86886.0
Interval range STD 3.5: 97478.0
Interval range STD 4: 108069.0


I will use an upper limit of 86,886 miles (the mean plus 3 standard deviations), which seems a reasonable amount to me.

In [24]:
# The upper bound 86886 is the "STD 3.0" value printed by Upper_Interval() above.
# NOTE(review): the lower bound 3106 is not derived in any visible cell - confirm its origin.
df = df.query("mileage > 3106 and mileage < 86886")

### *Year*

In [25]:
df.year.describe()

count    55851.000000
mean      2016.875544
std          1.942900
min       1970.000000
25%       2016.000000
50%       2017.000000
75%       2018.000000
max       2060.000000
Name: year, dtype: float64

A manufacturing year of 2060 is impossible; it is clearly a data-collection error.

In [43]:
df = df.query('year > 2000 and year <= 2020')

### *MPG*

In [13]:
df.mpg.describe()

count    55104.000000
mean        57.139837
std         17.932148
min          1.100000
25%         47.900000
50%         56.500000
75%         65.700000
max        470.800000
Name: mpg, dtype: float64

The minimum MPG of 1.1 is a clear outlier, since such a value is illogical. High MPG values, on the other hand, can be justified by the vehicle's engine type.

In [46]:
IntervalFit(df.mpg).Lower_Interval()

Interval range STD 1.5: 30.0
Interval range STD 2: 21.0
Interval range STD 2.5: 12.0
Interval range STD 3.0: 3.0
Interval range STD 3.5: -6.0
Interval range STD 4: -15.0


In [80]:
df = df.query("mpg > 21.0")

In [62]:
df.fuelType.value_counts()

Petrol      27002
Diesel      26061
Hybrid       2628
Other         138
Electric        5
Name: fuelType, dtype: int64

Before applying this interval, we first remove the "Other" category, since it does not add value. We also remove vehicles with the "Electric" fuel type, since there are very few of them in the dataset and they could skew the results.

In [63]:
df = df.query("fuelType != 'Other' and fuelType != 'Electric'")

In [81]:
df.fuelType.value_counts()

Petrol    26980
Diesel    26055
Hybrid     2615
Name: fuelType, dtype: int64

In [114]:
def UppperLimitMPG(x, n_std=2.5):
    """Return the upper cut-off ``mean + n_std * std`` of a set of values.

    Parameters
    ----------
    x : array-like
        Numeric values, e.g. the per-group ``mpg`` Series that
        ``groupby(...).apply`` passes in.
    n_std : float, optional
        Number of standard deviations added to the mean. Defaults to 2.5,
        the value that was previously hard-coded.

    Returns
    -------
    float
        ``np.mean(x) + n_std * np.std(x)``.
    """
    center = np.mean(x)
    spread = np.std(x)  # numpy default ddof=0

    return center + n_std * spread

In [115]:
df.groupby("fuelType")["mpg"].apply(UppperLimitMPG)

fuelType
Diesel     86.103206
Hybrid    215.534904
Petrol     77.932620
Name: mpg, dtype: float64

We use the largest of the per-fuel-type upper limits, 215.53 (Hybrid), as the cut-off for the whole dataset.

In [116]:
df = df.query("mpg < 215.53")

## *Price*


### *Ford*

In [125]:
ford = df.query("manufacturer == 'Ford'")

In [126]:
ford.price.describe()

count    16546.000000
mean     12035.749365
std       4327.783541
min       2250.000000
25%       8998.000000
50%      11000.000000
75%      14899.000000
max      54995.000000
Name: price, dtype: float64

In [127]:
ford.query('price >  40000')

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
302,Mustang,2020,42489,Automatic,3500,Petrol,145,22.1,5.0,Ford
7164,Mustang,2020,40495,Semi-Auto,3200,Petrol,145,24.8,5.0,Ford
11910,Mustang,2017,48999,Automatic,7546,Petrol,145,23.5,5.0,Ford
11912,Focus,2018,54995,Manual,11000,Petrol,145,36.7,2.3,Ford


* I cannot find a logical explanation for the high price of the *Ford Focus*, since it is a mid-range vehicle.



* The model-year 2017 *Ford Mustang* is priced higher than the more recent 2020 models, despite having higher mileage.

In [128]:
ford = ford.query('price < 43000')

Keep only the vehicles priced below £43,000.

In [129]:
ford.query('price >  40000')

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
302,Mustang,2020,42489,Automatic,3500,Petrol,145,22.1,5.0,Ford
7164,Mustang,2020,40495,Semi-Auto,3200,Petrol,145,24.8,5.0,Ford


## *Toyota*

In [130]:
toyota = df.query("manufacturer == 'Toyota'")

In [131]:
toyota.price.describe()

count     6029.000000
mean     12215.275668
std       5788.586299
min       2295.000000
25%       7999.000000
50%      10645.000000
75%      14695.000000
max      54550.000000
Name: price, dtype: float64

In [132]:
toyota.query('price > 42000')

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24623,Land Cruiser,2019,42444,Semi-Auto,10083,Diesel,145,30.1,2.8,Toyota
24625,Land Cruiser,2019,42995,Semi-Auto,16634,Diesel,145,30.1,2.8,Toyota
24627,Land Cruiser,2020,50995,Semi-Auto,3390,Diesel,145,30.1,2.8,Toyota
24629,Land Cruiser,2019,45995,Semi-Auto,8967,Diesel,150,30.1,2.8,Toyota
24633,Land Cruiser,2019,50995,Semi-Auto,6254,Diesel,145,30.1,2.8,Toyota
...,...,...,...,...,...,...,...,...,...,...
24655,Land Cruiser,2019,44995,Automatic,4512,Diesel,145,30.1,2.8,Toyota
24661,Land Cruiser,2020,54550,Automatic,4000,Diesel,150,30.1,2.8,Toyota
24664,Land Cruiser,2014,44990,Automatic,60000,Diesel,540,29.7,4.5,Toyota
24665,Land Cruiser,2019,49995,Automatic,6836,Diesel,145,30.1,2.8,Toyota


Vehicles over £42k are heavily dominated by the Land Cruiser model.

In [133]:
toyota.query("model == ' Supra'")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24673,Supra,2019,51995,Semi-Auto,4153,Petrol,145,34.5,3.0,Toyota


There is only a single Supra in the data, so it is better to remove it, as it could skew the model.

In [134]:
toyota = toyota.query("model != ' Supra'")

In [135]:
land_cruiser = toyota.query("model == ' Land Cruiser'")

In [136]:
other_vehicles_toyota = toyota.query("model != ' Land Cruiser'")

In [137]:
land_cruiser_2019 = land_cruiser.query("year == 2019")
land_cruiser_2020 = land_cruiser.query("year == 2020")

In [138]:
land_cruiser = land_cruiser.query("year != 2019 and year != 2020")

In [139]:
land_cruiser_2020

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24627,Land Cruiser,2020,50995,Semi-Auto,3390,Diesel,145,30.1,2.8,Toyota
24648,Land Cruiser,2020,50995,Automatic,3500,Diesel,145,30.1,2.8,Toyota
24661,Land Cruiser,2020,54550,Automatic,4000,Diesel,150,30.1,2.8,Toyota


One of the 2020 Toyota Land Cruisers has a higher price than the others even though it also has slightly more mileage, so we will remove it.

In [140]:
land_cruiser_2020 = land_cruiser_2020.query("price < 51000")

In [141]:
land_cruiser_2020

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24627,Land Cruiser,2020,50995,Semi-Auto,3390,Diesel,145,30.1,2.8,Toyota
24648,Land Cruiser,2020,50995,Automatic,3500,Diesel,145,30.1,2.8,Toyota


In [142]:
land_cruiser_2019

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24615,Land Cruiser,2019,39498,Semi-Auto,11712,Diesel,145,30.1,2.8,Toyota
24619,Land Cruiser,2019,39498,Semi-Auto,12543,Diesel,145,30.1,2.8,Toyota
24623,Land Cruiser,2019,42444,Semi-Auto,10083,Diesel,145,30.1,2.8,Toyota
24624,Land Cruiser,2019,40999,Semi-Auto,11619,Diesel,145,30.1,2.8,Toyota
24625,Land Cruiser,2019,42995,Semi-Auto,16634,Diesel,145,30.1,2.8,Toyota
...,...,...,...,...,...,...,...,...,...,...
24637,Land Cruiser,2019,47795,Semi-Auto,8813,Diesel,145,30.1,2.8,Toyota
24641,Land Cruiser,2019,44935,Semi-Auto,15200,Diesel,145,30.1,2.8,Toyota
24643,Land Cruiser,2019,42990,Semi-Auto,22845,Diesel,150,30.1,2.8,Toyota
24655,Land Cruiser,2019,44995,Automatic,4512,Diesel,145,30.1,2.8,Toyota


Some of these 2019 Land Cruisers are priced unusually high for a slightly older model year. Therefore, we will remove those priced at £48,000 or more.

In [143]:
land_cruiser_2019 = land_cruiser_2019.query("price < 48000")

In [144]:
land_cruiser_2019

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24615,Land Cruiser,2019,39498,Semi-Auto,11712,Diesel,145,30.1,2.8,Toyota
24619,Land Cruiser,2019,39498,Semi-Auto,12543,Diesel,145,30.1,2.8,Toyota
24623,Land Cruiser,2019,42444,Semi-Auto,10083,Diesel,145,30.1,2.8,Toyota
24624,Land Cruiser,2019,40999,Semi-Auto,11619,Diesel,145,30.1,2.8,Toyota
24625,Land Cruiser,2019,42995,Semi-Auto,16634,Diesel,145,30.1,2.8,Toyota
...,...,...,...,...,...,...,...,...,...,...
24632,Land Cruiser,2019,40995,Semi-Auto,11404,Diesel,145,30.1,2.8,Toyota
24637,Land Cruiser,2019,47795,Semi-Auto,8813,Diesel,145,30.1,2.8,Toyota
24641,Land Cruiser,2019,44935,Semi-Auto,15200,Diesel,145,30.1,2.8,Toyota
24643,Land Cruiser,2019,42990,Semi-Auto,22845,Diesel,150,30.1,2.8,Toyota


In [145]:
land_cruiser = pd.concat([land_cruiser,land_cruiser_2019,land_cruiser_2020])

In [146]:
toyota = pd.concat([other_vehicles_toyota,land_cruiser])

In [147]:
toyota.query("price > 42000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24664,Land Cruiser,2014,44990,Automatic,60000,Diesel,540,29.7,4.5,Toyota
24623,Land Cruiser,2019,42444,Semi-Auto,10083,Diesel,145,30.1,2.8,Toyota
24625,Land Cruiser,2019,42995,Semi-Auto,16634,Diesel,145,30.1,2.8,Toyota
24629,Land Cruiser,2019,45995,Semi-Auto,8967,Diesel,150,30.1,2.8,Toyota
24637,Land Cruiser,2019,47795,Semi-Auto,8813,Diesel,145,30.1,2.8,Toyota
24641,Land Cruiser,2019,44935,Semi-Auto,15200,Diesel,145,30.1,2.8,Toyota
24643,Land Cruiser,2019,42990,Semi-Auto,22845,Diesel,150,30.1,2.8,Toyota
24655,Land Cruiser,2019,44995,Automatic,4512,Diesel,145,30.1,2.8,Toyota
24627,Land Cruiser,2020,50995,Semi-Auto,3390,Diesel,145,30.1,2.8,Toyota
24648,Land Cruiser,2020,50995,Automatic,3500,Diesel,145,30.1,2.8,Toyota


A 2014 model is priced similarly to the 2019 models, even though it has considerably higher mileage. We will therefore remove it.

In [148]:
# Drop only the 2014 Land Cruiser listed at 44990 that the table above shows.
# Filtering on price alone would also remove any other Toyota that happens to
# cost exactly 44990. Model labels in this dataset carry a leading space.
toyota = toyota.query("not (model == ' Land Cruiser' and year == 2014 and price == 44990)")

In [149]:
toyota.query("price > 42000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24623,Land Cruiser,2019,42444,Semi-Auto,10083,Diesel,145,30.1,2.8,Toyota
24625,Land Cruiser,2019,42995,Semi-Auto,16634,Diesel,145,30.1,2.8,Toyota
24629,Land Cruiser,2019,45995,Semi-Auto,8967,Diesel,150,30.1,2.8,Toyota
24637,Land Cruiser,2019,47795,Semi-Auto,8813,Diesel,145,30.1,2.8,Toyota
24641,Land Cruiser,2019,44935,Semi-Auto,15200,Diesel,145,30.1,2.8,Toyota
24643,Land Cruiser,2019,42990,Semi-Auto,22845,Diesel,150,30.1,2.8,Toyota
24655,Land Cruiser,2019,44995,Automatic,4512,Diesel,145,30.1,2.8,Toyota
24627,Land Cruiser,2020,50995,Semi-Auto,3390,Diesel,145,30.1,2.8,Toyota
24648,Land Cruiser,2020,50995,Automatic,3500,Diesel,145,30.1,2.8,Toyota


## *Hyundi*

In [150]:
hyundi = df.query("manufacturer == 'Hyundi'")

In [151]:
hyundi.price.describe()

count     4257.000000
mean     12090.109467
std       5379.541576
min       2195.000000
25%       7995.000000
50%      11298.000000
75%      15127.000000
max      92000.000000
Name: price, dtype: float64

In [152]:
hyundi.query("price > 40000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
28951,I10,2017,92000,Automatic,35460,Petrol,150,47.9,1.2,Hyundi


It is a clear outlier, since the I10 is a relatively economical vehicle.

In [153]:
IntervalFit(hyundi.price).Upper_Interval()

Interval range STD 1.5: 20158.0
Interval range STD 2: 22848.0
Interval range STD 2.5: 25537.0
Interval range STD 3.0: 28227.0
Interval range STD 3.5: 30916.0
Interval range STD 4: 33606.0


In [154]:
hyundi.query("price > 33000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
24874,Santa Fe,2019,34998,Semi-Auto,4133,Diesel,145,39.2,2.2,Hyundi
24875,Santa Fe,2019,34998,Semi-Auto,4956,Diesel,145,39.2,2.2,Hyundi
25198,Santa Fe,2019,33998,Semi-Auto,3122,Diesel,145,39.8,2.2,Hyundi
25700,Santa Fe,2019,34000,Semi-Auto,5931,Diesel,145,39.2,2.2,Hyundi
25790,Santa Fe,2019,33995,Automatic,6363,Diesel,145,42.8,2.2,Hyundi
...,...,...,...,...,...,...,...,...,...,...
28819,Santa Fe,2020,36999,Semi-Auto,6000,Diesel,150,39.2,2.2,Hyundi
28843,Santa Fe,2019,33949,Semi-Auto,6982,Diesel,145,39.2,2.2,Hyundi
28873,Santa Fe,2019,33999,Semi-Auto,6697,Diesel,150,39.2,2.2,Hyundi
28951,I10,2017,92000,Automatic,35460,Petrol,150,47.9,1.2,Hyundi


The Hyundai Santa Fe strongly dominates this price range.

In [155]:
hyundi = hyundi.query("price < 37000")

### *Audi*

In [156]:
audi = df.query("manufacturer == 'Audi'")

In [157]:
audi.price.describe()

count      9104.000000
mean      21912.766037
std       10699.966856
min        3490.000000
25%       14990.000000
50%       19495.000000
75%       25999.000000
max      129000.000000
Name: price, dtype: float64

In [158]:
IntervalFit(audi.price).Lower_Interval()

Interval range STD 1.5: 5864.0
Interval range STD 2: 514.0
Interval range STD 2.5: -4836.0
Interval range STD 3.0: -10185.0
Interval range STD 3.5: -15535.0
Interval range STD 4: -20885.0


In [159]:
audi  = audi.query("price > 6000")

In [160]:
audi.query("price > 100000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
55110,R8,2019,129000,Semi-Auto,4000,Petrol,145,21.4,5.2,Audi
57403,R8,2019,112990,Automatic,8175,Petrol,145,21.6,5.2,Audi
58206,R8,2019,117990,Automatic,11936,Petrol,145,21.4,5.2,Audi
58418,R8,2019,109990,Automatic,6954,Petrol,145,21.6,5.2,Audi
60909,R8,2019,119995,Semi-Auto,7800,Petrol,145,21.4,5.2,Audi
63932,R8,2019,125000,Automatic,13663,Petrol,150,21.4,5.2,Audi


Some 2019 R8s are priced higher than comparable cars despite having more mileage, so I will remove them.

In [161]:
audi_R8 = audi.query("model == ' R8'")
audi_R8_2019 = audi_R8.query("year == 2019")
audi_R8 = audi_R8.query("year != 2019")

In [162]:
other_audi_cars = audi.query("model != ' R8'")

In [163]:
audi_R8_2019

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
55110,R8,2019,129000,Semi-Auto,4000,Petrol,145,21.4,5.2,Audi
57403,R8,2019,112990,Automatic,8175,Petrol,145,21.6,5.2,Audi
58206,R8,2019,117990,Automatic,11936,Petrol,145,21.4,5.2,Audi
58418,R8,2019,109990,Automatic,6954,Petrol,145,21.6,5.2,Audi
60909,R8,2019,119995,Semi-Auto,7800,Petrol,145,21.4,5.2,Audi
63932,R8,2019,125000,Automatic,13663,Petrol,150,21.4,5.2,Audi


In [164]:
audi_R8_2019 = audi_R8_2019.query("price != 125000 and price != 117990")

In [165]:
audi_R8_2019

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
55110,R8,2019,129000,Semi-Auto,4000,Petrol,145,21.4,5.2,Audi
57403,R8,2019,112990,Automatic,8175,Petrol,145,21.6,5.2,Audi
58418,R8,2019,109990,Automatic,6954,Petrol,145,21.6,5.2,Audi
60909,R8,2019,119995,Semi-Auto,7800,Petrol,145,21.4,5.2,Audi


In [166]:
R8 = pd.concat([audi_R8,audi_R8_2019])

In [167]:
audi = pd.concat([other_audi_cars,R8])

## *BMW*

In [169]:
bmw = df.query("manufacturer == 'BMW'")

In [170]:
bmw.price.describe()

count      8689.000000
mean      21292.924272
std       10384.602002
min        2795.000000
25%       14412.000000
50%       18995.000000
75%       25450.000000
max      123456.000000
Name: price, dtype: float64

In [171]:
IntervalFit(bmw.price).Lower_Interval()

Interval range STD 1.5: 5717.0
Interval range STD 2: 525.0
Interval range STD 2.5: -4667.0
Interval range STD 3.0: -9859.0
Interval range STD 3.5: -15051.0
Interval range STD 4: -20243.0


In [172]:
bmw.query("price < 3000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
39583,3 Series,2006,2795,Manual,76000,Diesel,160,50.4,2.0,BMW


In [173]:
bmw.query("price > 75000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
29805,X7,2020,78000,Semi-Auto,5000,Diesel,150,31.4,3.0,BMW
30285,8 Series,2019,84898,Semi-Auto,3185,Petrol,145,24.4,4.4,BMW
32106,X7,2020,77990,Semi-Auto,5656,Diesel,150,31.4,3.0,BMW
32108,X7,2019,77880,Semi-Auto,6506,Diesel,150,31.4,3.0,BMW
32166,X7,2020,78490,Semi-Auto,4919,Diesel,145,31.4,3.0,BMW
32502,X7,2019,77880,Semi-Auto,6506,Diesel,145,31.4,3.0,BMW
33202,2 Series,2015,123456,Semi-Auto,33419,Diesel,20,68.9,2.0,BMW
36676,X7,2020,77995,Semi-Auto,7500,Diesel,150,31.4,3.0,BMW
36958,M4,2016,76990,Automatic,4550,Petrol,300,33.2,3.0,BMW


I cannot find an explanation for the high price of the 2 Series model: it exceeds the price of the brand's most expensive vehicles, even though it is an older car with more mileage.

In [174]:
bmw = bmw.query("price > 6000 and price < 85000")

In [175]:
bmw.query("price > 75000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
29805,X7,2020,78000,Semi-Auto,5000,Diesel,150,31.4,3.0,BMW
30285,8 Series,2019,84898,Semi-Auto,3185,Petrol,145,24.4,4.4,BMW
32106,X7,2020,77990,Semi-Auto,5656,Diesel,150,31.4,3.0,BMW
32108,X7,2019,77880,Semi-Auto,6506,Diesel,150,31.4,3.0,BMW
32166,X7,2020,78490,Semi-Auto,4919,Diesel,145,31.4,3.0,BMW
32502,X7,2019,77880,Semi-Auto,6506,Diesel,145,31.4,3.0,BMW
36676,X7,2020,77995,Semi-Auto,7500,Diesel,150,31.4,3.0,BMW
36958,M4,2016,76990,Automatic,4550,Petrol,300,33.2,3.0,BMW


## *Mercedes*

In [178]:
mercedes = df.query("manufacturer== 'Mercedes-Benz'")

In [179]:
mercedes.price.describe()

count     10974.000000
mean      23049.840259
std        9737.201283
min        2140.000000
25%       16899.000000
50%       20899.000000
75%       26980.000000
max      139995.000000
Name: price, dtype: float64

In [180]:
mercedes = mercedes.query("price > 6000")

In [181]:
mercedes.price.describe()

count     10918.000000
mean      23144.747481
std        9671.031970
min        6495.000000
25%       16980.500000
50%       20980.000000
75%       26980.000000
max      139995.000000
Name: price, dtype: float64

In [182]:
mercedes.query("price > 125000")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
40353,G Class,2019,139948,Automatic,12000,Petrol,145,21.4,4.0,Mercedes-Benz
46247,G Class,2018,135771,Semi-Auto,19000,Petrol,145,21.4,4.0,Mercedes-Benz
46731,G Class,2018,139995,Semi-Auto,13046,Petrol,145,21.4,4.0,Mercedes-Benz
48209,G Class,2018,135124,Semi-Auto,18234,Petrol,150,21.4,4.0,Mercedes-Benz


In [183]:
df2 = pd.concat([ford,toyota,hyundi,mercedes,bmw,audi])

We create a new dataset that combines the cleaned data of all manufacturers.

In [184]:
models = df2.model.value_counts()

In [185]:
models[models < 10]

 Puma               9
 IQ                 8
 SQ7                8
 M6                 7
 PROACE VERSO       6
 S4                 6
 S8                 4
 Urban Cruiser      4
 Veloster           3
 Verso-S            3
 S5                 2
 R Class            2
 GLB Class          2
 Getz               2
 Transit Tourneo    1
180                 1
 Terracan           1
 Z3                 1
220                 1
Focus               1
 RS7                1
200                 1
Name: model, dtype: int64

In [186]:
models = models[models < 10]

In [187]:
# `x in models` tests membership against the index of the value_counts Series
# (the model labels), not its counts, so every rare model is relabelled "Other".
df2['model'] = df2['model'].apply(lambda x: "Other" if x in models else x)

In [188]:
df2.query("model == 'Other'").head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer
122,Other,2020,21995,Manual,4111,Petrol,145,50.4,1.0,Ford
1141,Other,2020,20991,Manual,5750,Petrol,145,49.6,1.0,Ford
3318,Other,2020,20491,Manual,4998,Petrol,145,50.4,1.0,Ford
5880,Other,2020,21500,Manual,3198,Petrol,145,50.4,1.0,Ford
7117,Other,2020,20590,Manual,3214,Petrol,145,50.4,1.0,Ford


In [189]:
engineSize = df2.engineSize.value_counts()

In [190]:
engineSize[engineSize < 10]

6.2    9
3.2    8
6.6    2
4.1    2
5.4    1
3.7    1
6.0    1
2.7    1
6.3    1
Name: engineSize, dtype: int64

In [191]:
engineSize = engineSize[engineSize < 10]

In [192]:
# `x in engineSize` checks the index of the value_counts Series (the engine-size values).
# Writing the string "Other" into this numeric column turns it into dtype=object
# until it is cast back with astype(float) further down.
df2['engineSize'] = df2['engineSize'].apply(lambda x: "Other" if x in engineSize else x)

In [193]:
df2.engineSize.unique()

array([1.0, 1.5, 1.6, 1.2, 2.0, 1.1, 2.3, 1.4, 5.0, 2.2, 2.5, 1.8, 1.3,
       0.0, 2.4, 3.0, 2.8, 1.7, 2.9, 2.1, 5.5, 4.0, 3.5, 4.7, 'Other',
       4.4, 4.2, 5.2], dtype=object)

In [194]:
df2.transmission.unique()

array(['Automatic', 'Manual', 'Semi-Auto', 'Other'], dtype=object)

Categories that occur fewer than 10 times are relabelled as "Other" so that they can be deleted later.

### *Delete Other Values*

We delete them because they do not add significant value.

In [195]:
df2 = df2.query("model != 'Other'")
df2 = df2.query("engineSize	!= 'Other'")
df2 = df2.query("transmission != 'Other'")

In [196]:
df2.fuelType.unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

In [197]:
df2.engineSize.unique()

array([1.0, 1.5, 1.6, 1.2, 2.0, 1.1, 2.3, 1.4, 5.0, 2.2, 2.5, 1.8, 1.3,
       0.0, 2.4, 3.0, 2.8, 1.7, 2.1, 5.5, 4.0, 2.9, 3.5, 4.7, 4.4, 4.2,
       5.2], dtype=object)

In [198]:
df2.engineSize = df2.engineSize.astype(float)

In [199]:
df2.engineSize.unique()

array([1. , 1.5, 1.6, 1.2, 2. , 1.1, 2.3, 1.4, 5. , 2.2, 2.5, 1.8, 1.3,
       0. , 2.4, 3. , 2.8, 1.7, 2.1, 5.5, 4. , 2.9, 3.5, 4.7, 4.4, 4.2,
       5.2])

In [200]:
df2.query("model == 'Other'")

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,manufacturer


## **Save Clean Data**

In [202]:
# NOTE(review): absolute, machine-specific output path.
# index=False keeps the DataFrame index out of the written file.
df2.to_csv('C:/Users/PC/Desktop/Car-Price-Proyect/data/car_clear.csv',index = False)