In [5]:
#import requests to send HTTP GET requests to the site
import requests

#import to manipulate arrays with numpy
import numpy as np

#import to create, clean, and parse data frames with pandas
import pandas as pd

#import to enable datascraping
from bs4 import BeautifulSoup

#import to set up 'sleep' to wait between page loads
import time

import timeit

# import Mongo so our webscraper dumps its scraped data without losing it
from pymongo import MongoClient
import pymongo

#import to make that html readable
import pprint

#import regular expressions operations
import re

#import to generate random numbers
import random


from dateutil.parser import parse

#import so we can do some heavy stats work
import scipy as sp
from scipy.stats import binom
import scipy.stats as stats

#import to access certain plotting features
import seaborn as sns

#import for basic math functions
import math

from sklearn import neighbors, datasets, tree, svm, preprocessing, utils
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, classification_report, confusion_matrix, mean_squared_error, accuracy_score

#import because we need to plot and make it pretty
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
%matplotlib inline
plt.style.use('ggplot')

In [1]:
def motorcycle_post_parser(total_listings):
    """Parse stored Craigslist motorcycle postings into a pandas DataFrame.

    Reads up to ``total_listings`` documents from the ``motorcycle_posts``
    collection of the ``craigslist_motorcycles`` MongoDB database on
    ``localhost:27017``. Each document is read through its ``'html'`` and
    ``'_id'`` keys; the HTML is parsed with BeautifulSoup and one row is
    produced per document.

    Progress is printed every 1000 parsed documents.

    Parameters
    ----------
    total_listings : int
        Maximum number of documents to read (used as the cursor slice stop
        and as the denominator of the printed progress percentage).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: title, year, price, neighborhood, description,
        url, model, vin, condition, cryptocurrency, delivery, engine_size,
        fuel_type, mileage, color, street_legal, title_status, transmission,
        bike_type. Fields that cannot be extracted fall back to ``np.nan``,
        ``''``, ``0`` or ``'gas'`` depending on the column.
    """
    column_order = ['title', 'year', 'price', 'neighborhood', 'description', 'url',
                    'model', 'vin', 'condition', 'cryptocurrency', 'delivery',
                    'engine_size', 'fuel_type', 'mileage', 'color', 'street_legal',
                    'title_status', 'transmission', 'bike_type']

    records = []
    client = MongoClient('localhost', 27017)
    try:
        db = client['craigslist_motorcycles']
        post_html = db['motorcycle_posts'].find()[0:total_listings]

        print('Starting Parsing')
        print(' ')
        for iterations, post in enumerate(post_html, start=1):
            records.append(_parse_motorcycle_post(post))

            if iterations % 1000 == 0:
                print(str(round(100*round(iterations/total_listings, 2),2)) + '%' + ' parsed succesfully')
    finally:
        # Release the connection even if a document fails to parse.
        client.close()

    print(' ')
    print('Parsing Complete, Preparing Dataframe')
    # Build the frame once from the collected records; `columns` pins the order.
    df = pd.DataFrame(records, columns=column_order)

    #   Cleaning DataFrame
    # Assign the result back instead of using inplace=True on a column
    # selection, which does not modify `df` under pandas copy-on-write.
    df['neighborhood'] = df['neighborhood'].replace('\n       google map\n        ', '')

    print(' ')
    print('Parsing Complete! Enjoy Mammal!')
    return df


# Ordered (old, new) substitutions that flatten the string form of the
# `<p class="attrgroup">` elements into a comma-separated attribute list.
# Order matters: later patterns rely on earlier ones having been applied.
_ATTRGROUP_REPLACEMENTS = (
    ('[<p class="attrgroup">\n<span><b>', ''),
    ('</b></span>\n<br/>\n</p>, ', ''),
    ('<p class="attrgroup">\n<span>', ','),
    (' <b>', ''),
    ('</b></span>\n<br/>\n<span>', ','),
    ('</span>\n<br/>\n<span>', ','),
    ('</b></span>\n<br/>\n</p>]', ''),
    ('<p class="attrgroup">\n<span class="otherpostings">\n<a href="//auburn.craigslist.org/search/sss?userid=193613506">\nmore ads  by this user        </a>\n</span>\n</p>]', ''),
)


def _first_attribute(attributes, keyword):
    """Return the first attribute string containing ``keyword``, or None."""
    return next((item for item in attributes if keyword in item), None)


def _attribute_value(attributes, keyword, prefix, default):
    """Return the first attribute containing ``keyword`` with ``prefix`` removed, else ``default``."""
    found = _first_attribute(attributes, keyword)
    if found is None:
        return default
    return str(found).replace(prefix, '')


def _attribute_flag(attributes, keyword):
    """Return '1' if any attribute contains ``keyword``, else 0 (types kept as in the other flag columns)."""
    return '1' if _first_attribute(attributes, keyword) is not None else 0


def _to_float_or_nan(text):
    """Convert ``text`` to float, returning np.nan when it is missing or not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return np.nan


def _element_text(element):
    """Return ``element.text``, or None when the element was not found."""
    return element.text if element is not None else None


def _parse_motorcycle_post(post):
    """Extract one output row (dict keyed by column name) from a stored post document."""
    soup = BeautifulSoup(post['html'], 'html.parser')

    title_text = _element_text(soup.find('span', id='titletextonly'))
    title = title_text if title_text is not None else np.nan

    price_text = _element_text(soup.find('span', class_='price'))
    if price_text is None:
        price = np.nan
    else:
        # Drop the currency sign and thousands separators so '$1,500' parses.
        price = _to_float_or_nan(price_text.strip().lstrip('$').replace(',', ''))

    neighborhood_text = _element_text(soup.find('small'))
    if neighborhood_text is None:
        neighborhood = np.nan
    else:
        neighborhood = neighborhood_text.replace(' (', '').replace(')', '')

    description_text = _element_text(soup.find('section', id='postingbody'))
    if description_text is None:
        description = np.nan
    else:
        description = description_text.replace('\n\nQR Code Link to This Post\n\n\n', '')

    # NOTE(review): a fuzzy dateutil parse fills fields missing from the title
    # from its default date, so a title without a year may still yield one —
    # confirm whether that is acceptable for this column.
    try:
        parsed_year = parse(title_text, fuzzy=True).year
        year = parsed_year if 1920 < parsed_year < 2022 else np.nan
    except Exception:
        # Best effort: any title that cannot be parsed has no year.
        year = np.nan

    # Flatten the attribute groups into 'key:value' strings split on commas.
    post_attributes = str(soup.find_all('p', attrs={'class': 'attrgroup'}))
    for old, new in _ATTRGROUP_REPLACEMENTS:
        post_attributes = post_attributes.replace(old, new)
    bike_attributes = post_attributes.split(',')

    # The slices below keep only the first five characters to cut off trailing html.
    transmission = _attribute_value(bike_attributes, 'transmission:', 'transmission:', None)
    transmission = transmission[0:5] if transmission is not None else np.nan

    bike_type = _attribute_value(bike_attributes, 'type:', 'type:', None)
    bike_type = bike_type[0:5] if bike_type is not None else ''

    return {
        'title': title,
        'year': year,
        'price': price,
        'neighborhood': neighborhood,
        'description': description,
        'url': post['_id'],
        # The first comma-separated chunk is used as the model.
        'model': bike_attributes[0],
        # Previously a tuple was passed to str.replace here, which always
        # raised and left this flag at 0.
        'vin': _attribute_flag(bike_attributes, 'VIN:'),
        'condition': _attribute_value(bike_attributes, 'condition:', 'condition:', np.nan),
        'cryptocurrency': _attribute_flag(bike_attributes, 'cryptocurrency'),
        'delivery': _attribute_flag(bike_attributes, 'delivery'),
        'engine_size': _to_float_or_nan(
            _attribute_value(bike_attributes, 'engine displacement', 'engine displacement (CC):', None)),
        'fuel_type': _attribute_value(bike_attributes, 'fuel:', 'fuel:', 'gas'),  # gas is the most common
        'mileage': _to_float_or_nan(
            _attribute_value(bike_attributes, 'odometer:', 'odometer:', None)),
        'color': _attribute_value(bike_attributes, 'paint color:', 'paint color:', np.nan),
        'street_legal': _attribute_flag(bike_attributes, 'street legal'),
        'title_status': _attribute_value(bike_attributes, 'title status:', 'title status:', np.nan),
        'transmission': transmission,
        'bike_type': bike_type,
    }

In [398]:
# Parse up to 58,600 stored posts from MongoDB into a DataFrame
# (the parser prints a progress line every 1,000 posts).
moto_df = motorcycle_post_parser(58600)

Starting Parsing
 
2.0% parsed succesfully
3.0% parsed succesfully
5.0% parsed succesfully
7.0% parsed succesfully
9.0% parsed succesfully
10.0% parsed succesfully
12.0% parsed succesfully
14.0% parsed succesfully
15.0% parsed succesfully




17.0% parsed succesfully
19.0% parsed succesfully
20.0% parsed succesfully
22.0% parsed succesfully
24.0% parsed succesfully
26.0% parsed succesfully
27.0% parsed succesfully




29.0% parsed succesfully




31.0% parsed succesfully
32.0% parsed succesfully
34.0% parsed succesfully
36.0% parsed succesfully
38.0% parsed succesfully
39.0% parsed succesfully




41.0% parsed succesfully




43.0% parsed succesfully
44.0% parsed succesfully




46.0% parsed succesfully




48.0% parsed succesfully
49.0% parsed succesfully
51.0% parsed succesfully
53.0% parsed succesfully
55.0% parsed succesfully
56.0% parsed succesfully
58.0% parsed succesfully
60.0% parsed succesfully
61.0% parsed succesfully
63.0% parsed succesfully
65.0% parsed succesfully
67.0% parsed succesfully
68.0% parsed succesfully
70.0% parsed succesfully
72.0% parsed succesfully
73.0% parsed succesfully
75.0% parsed succesfully
77.0% parsed succesfully
78.0% parsed succesfully
80.0% parsed succesfully
82.0% parsed succesfully




84.0% parsed succesfully
85.0% parsed succesfully
87.0% parsed succesfully
89.0% parsed succesfully
90.0% parsed succesfully
92.0% parsed succesfully




94.0% parsed succesfully
96.0% parsed succesfully
97.0% parsed succesfully
99.0% parsed succesfully
 
Parsing Complete, Preparing Dataframe
 
Parsing Complete! Enjoy Mammal!


In [468]:
# Work on a copy so that moto_df keeps the parser output unchanged.
df = moto_df.copy()

In [400]:
# Write the parsed DataFrame as CSV to a relative path
# (note: the file name carries no .csv extension).
moto_df.to_csv('craigslist_motorcycle_data_full')