In [41]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

pd.options.display.max_rows
pd.set_option('display.max_rows', None)


In [2]:
year = []
model = []
mileage = []
location = []
price = []
price_type = []
color_scheme = []
hist_cond = []

for i in range (1,31):
    
        # Website variable
        website = 'https://www.truecar.com/used-cars-for-sale/listings/bmw/3-series/year-2006-2011/location-effort-pa/?page=' + str(i) + '&searchRadius=5000&trimSlug[]=335i&trimSlug[]=335is&trimSlug[]=335xi'

        # Response to website
        response = requests.get(website)
        
        # Soup object
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Results
        results = soup.find_all('div', {'class':'linkable card card-shadow vehicle-card _1qd1muk'})
        
        for result in results:
            
            # year 
            try:
                year.append(result.find('span', {'class': 'vehicle-card-year font-size-1'}).get_text())
            except:
                year.append('n/a')
                
            # model
            try:
                model.append(result.find('div', {'class': 'font-size-1 text-truncate'}).get_text())
            except:
                model.append('n/a')
            
            # mileage
            try:
                mileage.append(result.find('div', {'data-test':'vehicleMileage'}).get_text())
            except:
                mileage.append('n/a')
            
            # location
            try:
                location.append(result.find('div', {'data-test':'vehicleCardLocation'}).get_text())
            except:
                location.append('n/a')
            
            # price
            try:
                price.append(result.find('div', {'data-test':'vehicleCardPricingBlockPrice'}).get_text())
            except:
                price.append('n/a')
            
            # price_type
            try:
                price_type.append(result.find('span', {'data-test':'graphIconLabel'}).get_text())
            except:
                price_type.append('n/a')
            
            # color_scheme
            try:
                color_scheme.append(result.find('div', {'data-test': 'vehicleCardColors'}).get_text())
            except:
                color_scheme.append('n/a')
            
            # hist_cond
            try:
                hist_cond.append(result.find('div', {'data-test':'vehicleCardCondition'}).get_text())
            except:
                hist_cond.append('n/a')

In [3]:
truecar_df = pd.DataFrame ({'Year':year, 'Model':model, 'Mileage':mileage,
                           'Location':location, 'Price':price, 'Site Price Type':price_type,
                           'Color Scheme':color_scheme, 'History':hist_cond})

In [4]:
truecar_df

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History
0,2011,335is Convertible,"62,515 miles","58 mi - Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use"
1,2010,335d Sedan,"48,400 miles","8.7 mi - Stroudsburg, PA","$16,990",Excellent Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use"
2,2011,328i xDrive Sedan AWD SULEV,"80,434 miles","74 mi - South Hackensack, NJ","$9,795",Excellent Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use"
3,2011,328i Coupe,"78,420 miles","21 mi - Easton, PA","$15,998",Excellent Price,"Black exterior, Black interior","No accidents, 4 Owners, Personal use"
4,2011,328i xDrive Sedan AWD,"79,491 miles","59 mi - Fort Washington, PA","$12,981",Excellent Price,"Black exterior, Brown interior","1 accident, 1 Owner, Personal use"
...,...,...,...,...,...,...,...,...
925,2009,335i Convertible,"60,798 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use"
926,2011,335i Convertible,"89,479 miles","67 mi - Elizabeth, NJ","$17,495",High Price,"Blue exterior, Beige interior","1 accident, 2 Owners, Personal use"
927,2011,328i xDrive Sedan AWD,"129,862 miles","66 mi - Rahway, NJ","$9,500",High Price,"Unknown exterior, Unknown interior","1 accident, 4 Owners, Fleet use"
928,2011,328i xDrive Sedan AWD,"70,009 miles","52 mi - Morristown, NJ","$13,995",High Price,"Unknown exterior, Black interior","2 accidents, 1 Owner, Personal use"


# Data Cleaning

In [5]:
truecar_df.columns

Index(['Year', 'Model', 'Mileage', 'Location', 'Price', 'Site Price Type',
       'Color Scheme', 'History'],
      dtype='object')

In [6]:
truecar_df.Model.value_counts()

328i xDrive Sedan AWD SULEV    180
328i xDrive Sedan AWD          120
335i Convertible               120
335i xDrive Coupe AWD           90
335is Convertible               60
335d Sedan                      60
328i Sedan                      60
335i xDrive Sedan AWD           60
328i Coupe                      30
325Ci Convertible               30
328xi Sedan AWD SULEV           30
328i xDrive Coupe AWD           30
328xi Sedan AWD                 30
325i Sedan                      30
Name: Model, dtype: int64

### N54 Car Filter

In [7]:
truecar_df['335i_yn'] = truecar_df['Model'].apply(lambda x: 'yes' if '335i' in x.lower() else 'no')
truecar_df['335i_yn'].value_counts()

no     600
yes    330
Name: 335i_yn, dtype: int64

In [8]:
truecar_df.drop(truecar_df[truecar_df['335i_yn'] == 'no'].index, inplace = True)

In [9]:
truecar_df['Model'].value_counts()

335i Convertible         120
335i xDrive Coupe AWD     90
335is Convertible         60
335i xDrive Sedan AWD     60
Name: Model, dtype: int64

### xDrive (All wheel drive)

In [10]:
truecar_df['xdrive_yn'] = truecar_df['Model'].apply(lambda x: 'yes' if 'awd' in x.lower() else 'no')

In [11]:
truecar_df['xdrive_yn'].value_counts()

no     180
yes    150
Name: xdrive_yn, dtype: int64

### 'is' model

In [12]:
truecar_df['is_model_yn'] = truecar_df['Model'].apply(lambda x: 'yes' if 'is' in x.lower() else 'no')

In [13]:
truecar_df['is_model_yn'].value_counts()

no     270
yes     60
Name: is_model_yn, dtype: int64

### E90 or E92

In [14]:
truecar_df['body_style'] = truecar_df['Model'].apply(lambda x: 'E90' if 'sedan' in x.lower() else 'E92')

In [15]:
truecar_df['body_style'].value_counts()

E92    270
E90     60
Name: body_style, dtype: int64

### Vehicle Color

In [16]:
truecar_df['car_color'] = truecar_df['Color Scheme'].apply(lambda x: x.split(' ')[0])

In [17]:
truecar_df['car_color'].value_counts()

White     90
Black     90
Blue      90
Silver    30
Gray      30
Name: car_color, dtype: int64

In [18]:
truecar_df.head(25)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color
0,2011,335is Convertible,"62,515 miles","58 mi - Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver
8,2011,335i xDrive Coupe AWD,"37,739 miles","21 mi - Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White
9,2010,335i xDrive Sedan AWD,"63,935 miles","8.7 mi - Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black
13,2011,335i Convertible,"76,693 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray
14,2011,335i xDrive Sedan AWD,"55,126 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White
15,2008,335i Convertible,"137,855 miles","62 mi - Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black
17,2011,335is Convertible,"40,262 miles","8.7 mi - Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue
18,2011,335i xDrive Coupe AWD,"86,645 miles","67 mi - Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White
25,2011,335i xDrive Coupe AWD,"34,908 miles","74 mi - Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue
26,2009,335i Convertible,"60,798 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black


### Interior Color

In [19]:
truecar_df['interior_color'] = truecar_df['Color Scheme'].apply(lambda x: x.split(',')[1])

In [20]:
truecar_df['interior_color'].value_counts()

 Brown interior      120
 Black interior       90
 Beige interior       60
 Red interior         30
 Unknown interior     30
Name: interior_color, dtype: int64

In [21]:
truecar_df['interior_color'] = truecar_df['interior_color'].apply(lambda x: x.replace('interior', ''))

In [22]:
truecar_df['interior_color'].value_counts()

 Brown       120
 Black        90
 Beige        60
 Red          30
 Unknown      30
Name: interior_color, dtype: int64

In [23]:
truecar_df.head(10)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color
0,2011,335is Convertible,"62,515 miles","58 mi - Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black
8,2011,335i xDrive Coupe AWD,"37,739 miles","21 mi - Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown
9,2010,335i xDrive Sedan AWD,"63,935 miles","8.7 mi - Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black
13,2011,335i Convertible,"76,693 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown
14,2011,335i xDrive Sedan AWD,"55,126 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown
15,2008,335i Convertible,"137,855 miles","62 mi - Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige
17,2011,335is Convertible,"40,262 miles","8.7 mi - Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black
18,2011,335i xDrive Coupe AWD,"86,645 miles","67 mi - Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red
25,2011,335i xDrive Coupe AWD,"34,908 miles","74 mi - Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown
26,2009,335i Convertible,"60,798 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown


### Accident Check

In [24]:
truecar_df['accident'] = truecar_df['History'].apply(lambda x: 'No' if 'no accidents' in x.lower() else 'Yes')

In [25]:
truecar_df.head(50)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident
0,2011,335is Convertible,"62,515 miles","58 mi - Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No
8,2011,335i xDrive Coupe AWD,"37,739 miles","21 mi - Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No
9,2010,335i xDrive Sedan AWD,"63,935 miles","8.7 mi - Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No
13,2011,335i Convertible,"76,693 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No
14,2011,335i xDrive Sedan AWD,"55,126 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No
15,2008,335i Convertible,"137,855 miles","62 mi - Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No
17,2011,335is Convertible,"40,262 miles","8.7 mi - Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No
18,2011,335i xDrive Coupe AWD,"86,645 miles","67 mi - Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red,No
25,2011,335i xDrive Coupe AWD,"34,908 miles","74 mi - Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No
26,2009,335i Convertible,"60,798 miles","8.7 mi - Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown,No


### Remove "miles" in Mileage


In [26]:
truecar_df['Mileage'] = truecar_df['Mileage'].apply(lambda x: x.split('m')[0])

In [27]:
truecar_df.head(5)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident
0,2011,335is Convertible,62515,"58 mi - Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No
8,2011,335i xDrive Coupe AWD,37739,"21 mi - Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No
9,2010,335i xDrive Sedan AWD,63935,"8.7 mi - Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No
13,2011,335i Convertible,76693,"8.7 mi - Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No
14,2011,335i xDrive Sedan AWD,55126,"8.7 mi - Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No


###  Removing miles in Location

In [28]:
truecar_df['Location'] = truecar_df['Location'].apply(lambda x: x.split('-')[1])

In [29]:
truecar_df.head(40)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No
8,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No
9,2010,335i xDrive Sedan AWD,63935,"Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No
13,2011,335i Convertible,76693,"Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No
14,2011,335i xDrive Sedan AWD,55126,"Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No
15,2008,335i Convertible,137855,"Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No
17,2011,335is Convertible,40262,"Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No
18,2011,335i xDrive Coupe AWD,86645,"Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red,No
25,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No
26,2009,335i Convertible,60798,"Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown,No


### Number of accidents (Not working, will revisit)

In [30]:
# truecar_df['num_accidents'] = truecar_df['History'].apply(lambda x: x[0] if truecar_df['accident'] == 'Yes' else 'No')

### Number Of Owners

In [31]:
truecar_df['num_owners'] = truecar_df['History'].apply(lambda x: x.split(',')[1])

In [32]:
truecar_df.head(55)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident,num_owners
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No,3 Owners
8,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No,2 Owners
9,2010,335i xDrive Sedan AWD,63935,"Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No,2 Owners
13,2011,335i Convertible,76693,"Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No,3 Owners
14,2011,335i xDrive Sedan AWD,55126,"Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No,2 Owners
15,2008,335i Convertible,137855,"Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No,5 Owners
17,2011,335is Convertible,40262,"Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No,3 Owners
18,2011,335i xDrive Coupe AWD,86645,"Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red,No,1 Owner
25,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No,2 Owners
26,2009,335i Convertible,60798,"Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown,No,3 Owners


#### Further cleaning

In [33]:
truecar_df['num_owners'] = truecar_df['num_owners'].apply(lambda x: x.replace('Owners', ''))

In [34]:
truecar_df.head(25)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident,num_owners
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No,3
8,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No,2
9,2010,335i xDrive Sedan AWD,63935,"Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No,2
13,2011,335i Convertible,76693,"Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No,3
14,2011,335i xDrive Sedan AWD,55126,"Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No,2
15,2008,335i Convertible,137855,"Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No,5
17,2011,335is Convertible,40262,"Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No,3
18,2011,335i xDrive Coupe AWD,86645,"Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red,No,1 Owner
25,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No,2
26,2009,335i Convertible,60798,"Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown,No,3


### State

In [35]:
truecar_df['State'] = truecar_df['Location'].apply(lambda x: x.split(',')[1])

In [36]:
truecar_df.head(50)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident,num_owners,State
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No,3,NJ
8,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No,2,PA
9,2010,335i xDrive Sedan AWD,63935,"Stroudsburg, PA","$17,990",High Price,"Black exterior, Black interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,Black,Black,No,2,PA
13,2011,335i Convertible,76693,"Stroudsburg, PA","$20,990",High Price,"Gray exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Gray,Brown,No,3,PA
14,2011,335i xDrive Sedan AWD,55126,"Stroudsburg, PA","$20,990",High Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E90,White,Brown,No,2,PA
15,2008,335i Convertible,137855,"Huntingdon Valley, PA","$11,295",Fair Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No,5,PA
17,2011,335is Convertible,40262,"Stroudsburg, PA","$28,590",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No,3,PA
18,2011,335i xDrive Coupe AWD,86645,"Elizabeth, NJ","$19,895",Fair Price,"White exterior, Red interior","No accidents, 1 Owner, Personal use",yes,yes,no,E92,White,Red,No,1 Owner,NJ
25,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No,2,NJ
26,2009,335i Convertible,60798,"Stroudsburg, PA","$20,990",High Price,"Black exterior, Brown interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Brown,No,3,PA


### Droping uncessary columns

In [37]:
truecar_df.drop(['Color Scheme', 'History','335i_yn'], axis = 1)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident,num_owners,State
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,no,yes,E92,Silver,Black,No,3,NJ
8,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",High Price,yes,no,E92,White,Brown,No,2,PA
9,2010,335i xDrive Sedan AWD,63935,"Stroudsburg, PA","$17,990",High Price,yes,no,E90,Black,Black,No,2,PA
13,2011,335i Convertible,76693,"Stroudsburg, PA","$20,990",High Price,no,no,E92,Gray,Brown,No,3,PA
14,2011,335i xDrive Sedan AWD,55126,"Stroudsburg, PA","$20,990",High Price,yes,no,E90,White,Brown,No,2,PA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
916,2011,335is Convertible,40262,"Stroudsburg, PA","$28,590",High Price,no,yes,E92,Blue,Black,No,3,PA
917,2011,335i xDrive Coupe AWD,86645,"Elizabeth, NJ","$19,895",Fair Price,yes,no,E92,White,Red,No,1 Owner,NJ
924,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,yes,no,E92,Blue,Unknown,No,2,NJ
925,2009,335i Convertible,60798,"Stroudsburg, PA","$20,990",High Price,no,no,E92,Black,Brown,No,3,PA


### Output to CSV (Commented out so file isn't generated everytime script runs)

In [39]:
#truecar_df.to_csv('second_car_data_cleaned.csv', index = False)

## Extra

In [3]:
df_first = pd.read_csv('car_data_cleaned.csv')

In [32]:
df_first.describe()

Unnamed: 0,Year,num_owners
count,390.0,390.0
mean,2010.461538,3.384615
std,0.930659,1.334058
min,2008.0,2.0
25%,2010.0,2.0
50%,2011.0,3.0
75%,2011.0,5.0
max,2011.0,6.0


In [31]:
df_second = pd.read_csv('second_car_data_cleaned.csv')

In [46]:
df_second.describe()

Unnamed: 0,Year
count,330.0
mean,2010.454545
std,0.989025
min,2008.0
25%,2010.0
50%,2011.0
75%,2011.0
max,2011.0


In [47]:
df_first.columns

Index(['Year', 'Model', 'Mileage', 'Location', 'Price', 'Site Price Type',
       'Color Scheme', 'History', '335i_yn', 'xdrive_yn', 'is_model_yn',
       'body_style', 'car_color', 'interior_color', 'accident', 'num_owners',
       'State'],
      dtype='object')

In [48]:
df_second.columns

Index(['Year', 'Model', 'Mileage', 'Location', 'Price', 'Site Price Type',
       'Color Scheme', 'History', '335i_yn', 'xdrive_yn', 'is_model_yn',
       'body_style', 'car_color', 'interior_color', 'accident', 'num_owners',
       'State'],
      dtype='object')

In [23]:
big_df = pd.concat([df_first, df_second], ignore_index = True)

In [45]:
big_df.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
30      True
31      True
32      True
33      True
34      True
35      True
36      True
37      True
38      True
39      True
40      True
41      True
42      True
43      True
44      True
45      True
46      True
47      True
48      True
49      True
50      True
51      True
52      True
53      True
54      True
55      True
56      True
57      True
58      True
59      True
60      True
61      True
62      True
63      True
64      True
65      True
66      True
67      True
68      True
69      True
70      True
71      True
72      True
73      True
74      True
75      True
76      True

In [43]:
big_df.describe()

Unnamed: 0,Year
count,720.0
mean,2010.458333
std,0.957185
min,2008.0
25%,2010.0
50%,2011.0
75%,2011.0
max,2011.0


In [25]:
clean_df = big_df.drop_duplicates()

In [44]:
clean_df.describe()

Unnamed: 0,Year
count,24.0
mean,2010.458333
std,0.977093
min,2008.0
25%,2010.0
50%,2011.0
75%,2011.0
max,2011.0


In [33]:
df3 = df_first.append(df_second)

In [34]:
df3.describe()

Unnamed: 0,Year
count,720.0
mean,2010.458333
std,0.957185
min,2008.0
25%,2010.0
50%,2011.0
75%,2011.0
max,2011.0


In [42]:
df3.duplicated()

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13      True
14      True
15      True
16      True
17      True
18      True
19      True
20      True
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29      True
30      True
31      True
32      True
33      True
34      True
35      True
36      True
37      True
38      True
39      True
40      True
41      True
42      True
43      True
44      True
45      True
46      True
47      True
48      True
49      True
50      True
51      True
52      True
53      True
54      True
55      True
56      True
57      True
58      True
59      True
60      True
61      True
62      True
63      True
64      True
65      True
66      True
67      True
68      True
69      True
70      True
71      True
72      True
73      True
74      True
75      True
76      True

In [36]:
df4.describe()

Unnamed: 0,Year
count,24.0
mean,2010.458333
std,0.977093
min,2008.0
25%,2010.0
50%,2011.0
75%,2011.0
max,2011.0


In [46]:
df4.head(23)

Unnamed: 0,Year,Model,Mileage,Location,Price,Site Price Type,Color Scheme,History,335i_yn,xdrive_yn,is_model_yn,body_style,car_color,interior_color,accident,num_owners,State
0,2011,335is Convertible,62515,"Somerset, NJ","$25,999",High Price,"Silver exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Silver,Black,No,3,NJ
1,2011,335i xDrive Coupe AWD,37739,"Easton, PA","$25,998",Excellent Price,"White exterior, Brown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,White,Brown,No,2,PA
2,2011,335i Sedan,67840,"Stroudsburg, PA","$18,990",Great Price,"Blue exterior, Brown interior","No accidents, 5 Owners, Fleet use",yes,no,no,E90,Blue,Brown,No,5,PA
3,2010,335i xDrive Sedan AWD,128507,"Hasbrouck Heights, NJ","$9,995",Excellent Price,"Black exterior, Beige interior","No accidents, 3 Owners, Personal use",yes,yes,no,E90,Black,Beige,No,3,NJ
4,2011,335i Convertible,55578,"Easton, PA","$23,998",High Price,"Black exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,no,no,E92,Black,Unknown,No,2,PA
5,2011,335i Convertible,71447,"Stroudsburg, PA","$21,990",High Price,"Black exterior, Brown interior","No accidents, 6 Owners, Personal use",yes,no,no,E92,Black,Brown,No,6,PA
6,2010,335i Convertible,74799,"Stroudsburg, PA","$18,990",High Price,"Black exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,no,E92,Black,Black,No,3,PA
7,2011,335is Convertible,40262,"Stroudsburg, PA","$28,990",High Price,"Blue exterior, Black interior","No accidents, 3 Owners, Personal use",yes,no,yes,E92,Blue,Black,No,3,PA
8,2011,335i xDrive Coupe AWD,34908,"Jersey City, NJ","$19,998",Excellent Price,"Blue exterior, Unknown interior","No accidents, 2 Owners, Personal use",yes,yes,no,E92,Blue,Unknown,No,2,NJ
9,2008,335i Convertible,137855,"Huntingdon Valley, PA","$11,295",High Price,"Black exterior, Beige interior","No accidents, 5 Owners, Personal use",yes,no,no,E92,Black,Beige,No,5,PA
