## <font color="lightgreen">Scraping the winetourism website</font>

In [41]:
#Import libraries that we will need
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

In [52]:
# Set the URL of the first page that we are going to scrape.
# To collect other data, only the URL and the CSS selectors below need to
# change; the rest of the workflow stays the same.
url = "https://www.winetourism.com/wineries-in-north-macedonia/"
root_url = "https://www.winetourism.com/"

# Fetch the page. requests applies no timeout by default, so set one explicitly
# to keep the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Actually verify the answer: raise on a 4xx/5xx status instead of silently
# parsing an error page.
response.raise_for_status()

# Parse the HTML so the needed elements can be selected with Beautiful Soup.
raw_html = response.text
beautified_html = bs(raw_html, 'html.parser')
wine_card = beautified_html.select(".blog-item")

# Extract the data using CSS selectors: the <img> inside each card carries the
# winery name (alt attribute) and the picture location (src attribute).
winary_card = beautified_html.select(".blog-item .image-wrapper img")
winary_names = [item['alt'] for item in winary_card]

# urljoin resolves site-relative paths against the root URL and leaves
# already-absolute links untouched, unlike plain string concatenation.
winary_images_links = [urljoin(root_url, str(item['src'])) for item in winary_card]

winary_description = beautified_html.select(".blog-item .blog-content .blog-excerpt")
winary_description = [item.text.replace("\n", "") for item in winary_description]

# Extracted info: winery names, winery descriptions, winery image links.
# The other data in the CSV file (coordinates and rating) was written manually
# for this particular dataset.

In [53]:
# Collect the scraped columns into a DataFrame so the data can be displayed
# and visualized as a table.
import pandas as pd

# Column label -> list of scraped values; insertion order is the column order.
item_dict = {
    "Winary Name": winary_names,
    "Winary Description": winary_description,
    "Winary Image Link": winary_images_links,
}

df = pd.DataFrame(item_dict)

In [59]:
def replacing_empty_strings_with_nulls(data):
  """Replace empty and whitespace-only string cells with NaN, in place.

  Args:
    data: pandas DataFrame to clean. It is modified in place.

  Returns:
    The same DataFrame object that was passed in, so the function can be
    used directly as a pipeline filter.
  """
  import numpy as np
  # Anchored pattern: matches "" as well as cells made up only of whitespace
  # (" ", "   ", "\t", ...). Matching on the exact string " " alone would
  # leave genuinely empty strings in the data.
  data.replace(r"^\s*$", np.nan, regex=True, inplace=True)
  return data

In [60]:
def dropping_duplicates_by_name(data):
  """Remove rows whose 'Winary Name' repeats an earlier row, in place.

  The first occurrence of each name is kept. The DataFrame passed in is
  modified and that same object is returned.
  """
  name_columns = ['Winary Name']
  data.drop_duplicates(subset=name_columns, keep='first', inplace=True)
  return data

### <font color="lightgreen">Scraping images for displaying</font>

In [64]:
def scraping_images(images_url="https://unsplash.com/s/photos/wineries", timeout=30):
  """Scrape image source URLs from a web page.

  Args:
    images_url: Page to download; defaults to the Unsplash winery search page.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    A list with the `src` value of every second <img> tag that has a `src`
    attribute (positions 0, 2, 4, ... in document order).

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
  """
  # requests applies no timeout by default; pass one so the call cannot hang.
  response = requests.get(images_url, timeout=timeout)
  # Fail loudly instead of scraping an error page.
  response.raise_for_status()

  beautified_html = bs(response.text, 'html.parser')

  # <img> tags without a src attribute are skipped rather than raising KeyError.
  sources = [item['src'] for item in beautified_html.select("img") if item.has_attr('src')]
  # Keep every second entry. NOTE(review): presumably the page interleaves the
  # wanted photos with other images - confirm against the live markup.
  return sources[::2]

## Pipe Class

In [57]:
#Pipe class

class Pipe:
  """Minimal pipes-and-filters pipeline.

  Filters are callables taking one message and returning the (possibly
  transformed) message; they are applied in the order they were added.
  """

  def __init__(self):
      # Registered filters, in application order.
      self.filters = []

  def add(self, filter):
      """Append `filter` to the end of the pipeline."""
      self.filters.append(filter)

  def execute(self, message):
      """Pass `message` through every filter in turn and return the result.

      Progress is printed to stdout before, during and after the run.
      """
      print("Executing pipeline...")
      result = message
      for step in self.filters:
          print('Filtering with', step)
          result = step(result)
      print("Done.")
      return result

## Pipe Execution

In [62]:
# Build the cleaning pipeline; filters run in the order they are registered.
pipe = Pipe()
for cleaning_step in (replacing_empty_strings_with_nulls, dropping_duplicates_by_name):
  pipe.add(cleaning_step)

# Run the scraped frame through the pipeline and save the result to Excel.
final_dataset = pipe.execute(df)
final_dataset.to_excel("filtered.xlsx")

Executing pipeline...
Filtering with <function replacing_empty_strings_with_nulls at 0x79bdac4c8ca0>
Filtering with <function dropping_duplicates_by_name at 0x79bda9fcbbe0>
Done.
