In [57]:
import requests
import re
from bs4 import BeautifulSoup as BS
import pandas as pd
import datetime
import csv

In [58]:
evo = "https://www.evo.com/shop/snowboard/snowboards/rpp_400"
headers = {'User-agent': 'Mozilla/5.0'}
response = requests.get(evo, headers = headers)

In [59]:
response

<Response [200]>

In [60]:
soup = BS(response.content,'html.parser')

In [61]:
items = soup.find_all('div',{'class':'product-thumb-details'})

In [62]:
#The company_cleaner function extract the first word of the product-thumb-title class from a list of items. 
#The board_cleaner function extracts the entire product-thumb-title text from a list of items.

def company_cleaner (item):
  names = []
  for snowboard in items:
    names.append((snowboard.find('span',{'class':'product-thumb-title'}).text).split(' ')[0])

  return names

def board_cleaner(item):
  names = []
  for snowboard in items:
    names.append((snowboard.find('span',{'class':'product-thumb-title'}).text))
  return names


In [63]:
brands = company_cleaner(items)
boards = board_cleaner(items)

In [64]:
# This function appears to be using the regular expression library re to extract prices from a list of HTML documents (represented as BeautifulSoup objects). 
# The regular expression pattern used appears to match decimal numbers with at least one digit before the decimal point and one or two digits after it, with commas allowed as thousands separators.

def price_finder(soups):
  price_pattern = r'([\d+\,]+\d+\.\d{1,2})'
  price_list = []
  for slop in soups:
    price_list.append((re.findall(price_pattern, str(slop))))
  return price_list

In [65]:
stringprices = price_finder(items)

In [66]:
#This function takes a list of lists of prices (as strings), and converts them to a list of lists of prices (as floats), removing commas along the way.

def price_cleaner(prices):
  finalprice=[]
  for pricelist in prices:
    rinsedprice =[]
    finalprice.append(rinsedprice)
    for price in pricelist:
      rinsedprice.append(float(price.replace(',','')))
  return finalprice  

In [67]:
prices = price_cleaner(stringprices)
prices[0]

[499.0, 499.0, 349.3]

In [68]:
#Collection date stamp and apply to every row
collected = [str(datetime.date.today())]*len(boards)

In [69]:
# Bring together lists created by above functions
complete = list(zip(collected, boards, brands))
complete[0]

('2023-04-01', 'Season Kin Snowboard 2023', 'Season')

In [70]:
priceframe = pd.DataFrame(prices, columns = 'main dropprice sale'.split())
priceframe.head()

Unnamed: 0,main,dropprice,sale
0,499.0,499.0,349.3
1,329.95,329.95,263.96
2,899.95,899.95,764.96
3,599.95,599.95,479.96
4,399.95,399.95,319.96


In [71]:
del priceframe['dropprice']

In [72]:
boardframe = pd.DataFrame(complete, columns = 'Collected Deck Brand'.split())

In [73]:
df = pd.concat([boardframe,priceframe], axis = 1, sort = 'False')

  df = pd.concat([boardframe,priceframe], axis = 1, sort = 'False')


In [74]:
df['Collected'] = pd.to_datetime(df.Collected)

In [75]:
df.head()

Unnamed: 0,Collected,Deck,Brand,main,sale
0,2023-04-01,Season Kin Snowboard 2023,Season,499.0,349.3
1,2023-04-01,Nitro Prime Raw Snowboard 2023,Nitro,329.95,263.96
2,2023-04-01,Jones Hovercraft Splitboard 2023,Jones,899.95,764.96
3,2023-04-01,Arbor Coda Rocker Snowboard 2023,Arbor,599.95,479.96
4,2023-04-01,Nidecker Venus Snowboard - Women's 2023,Nidecker,399.95,319.96


In [76]:
df.dtypes

Collected    datetime64[ns]
Deck                 object
Brand                object
main                float64
sale                float64
dtype: object

In [77]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Collected  400 non-null    datetime64[ns]
 1   Deck       400 non-null    object        
 2   Brand      400 non-null    object        
 3   main       400 non-null    float64       
 4   sale       335 non-null    float64       
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 15.8+ KB


In [82]:
from google.colab import drive
drive.mount('/drive')


Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [85]:
'''Ran one time to start the csv,
#df.to_csv('/drive/My Drive/Colab Notebooks/Evodata/evodata.csv')'''
#df.to_csv('/drive/My Drive/Colab Notebooks/Evodata/evodata.csv', mode='a', header=False)

In [86]:
df2 = pd.read_csv('/drive/My Drive/Colab Notebooks/Evodata/evodata.csv')

In [81]:
df2.tail()

Unnamed: 0.1,Unnamed: 0,Collected,Deck,Brand,main,sale
1195,395,2023-04-01,WNDR Alpine BelleTour Splitboard 2023,WNDR,899.0,809.1
1196,396,2023-04-01,Rossignol XV Sushi Splitboard 2024,Rossignol,679.95,
1197,397,2023-04-01,Rossignol After Hours Splitboard - Women's 2024,Rossignol,679.95,
1198,398,2023-04-01,Moss Snowstick Fluffy 54 Snowboard 2023,Moss,990.0,
1199,399,2023-04-01,Arbor Swoon Rocker Splitboard - Women's 2023,Arbor,749.95,
