# Scrapping seasons from a web site
We use this page to scrap seasons <a href="https://www.fruits-legumes.org/mois/#1" target="_blank">fruits-legumes.org</a>.

1. Install 'Scrapy':

In [1]:
!pip install scrapy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Scrap la page des saisons
Page à scrapper : [https://www.fruits-legumes.org/mois/](https://www.fruits-legumes.org/mois/)

In [2]:
# Import os => Library used to easily manipulate operating systems
## More info => https://docs.python.org/3/library/os.html
import os

# Import logging => Library used for logs manipulation
## More info => https://docs.python.org/3/library/logging.html
import logging

# Import scrapy and scrapy.crawler
import scrapy
from scrapy.crawler import CrawlerProcess

class SeasonsSpider(scrapy.Spider):

    # Name of your spider
    name = "seasons"

    # Url to start your spider from
    start_urls = [
        'https://www.fruits-legumes.org/mois/',
    ]

    # Callback function that will be called when starting your spider
    # It will get text, author and tags of the <div> with class="quote"
    # /html/body/div/div[2]/div[1]/div[1]/span[1]
    def parse(self, response):
        months = response.xpath('//*[@id="fruit-legume"]')   # xpath de la racine des mois
        for month in months:
            list_1 = month.xpath('div[1]/ul/li/a/text()').extract()
            for item in range(len(list_1)):
              list_1[item] = list_1[item].replace('\u00e9', 'e').replace('\u00fb', 'u').replace('\u00e2', 'a').replace('\u00ee', 'i').replace('\u00e0', 'i').replace('\u00ea', 'e').replace('\u00e8', 'e')
              #print(f"item : {item}")

            list_2 = month.xpath('div[2]/ul/li/a/text()').extract()
            for item in range(len(list_2)):
              list_2[item] = list_2[item].replace('\u00e9', 'e').replace('\u00fb', 'u').replace('\u00e2', 'a').replace('\u00ee', 'i').replace('\u00e0', 'i').replace('\u00ea', 'e').replace('\u00e8', 'e')
              #print(f"item : {item}")


            yield {
                'month' : month.xpath('h2/text()').get().replace('\n                    ', '').replace('    ', '').replace('\u00fb', 'u').replace('\u00e9', 'e'),
                month.xpath('div[1]/h3/a/text()').get().replace('\u00e9','e').lower() : list_1,
                month.xpath('div[2]/h3/a/text()').get().replace('\u00e9','e').lower() : list_2,
            }

#lists = list(map(lambda x: x.replace('Hadoop', 'MongoDB'), lists))

# Name of the file where the results will be saved
filename = "saisons.json"

# If file already exists, delete it before crawling (because Scrapy will
# concatenate the last and new results otherwise)
if filename in os.listdir('src/'):
        os.remove('src/' + filename)

# Declare a new CrawlerProcess with some settings
## USER_AGENT => Simulates a browser on an OS
## LOG_LEVEL => Minimal Level of Log
## FEEDS => Where the file will be stored
## More info on built-in settings => https://docs.scrapy.org/en/latest/topics/settings.html?highlight=settings#settings
process = CrawlerProcess(settings = {
    'USER_AGENT': 'Chrome/114.0',
    'LOG_LEVEL': logging.INFO,
    "FEEDS": {
        'src/' + filename: {"format": "json"},
    }
})

# Start the crawling using the spider you defined above
process.crawl(SeasonsSpider)
process.start()

INFO:scrapy.utils.log:Scrapy 2.9.0 started (bot: scrapybot)
2023-06-19 14:49:59 [scrapy.utils.log] INFO: Scrapy 2.9.0 started (bot: scrapybot)
INFO:scrapy.utils.log:Versions: lxml 4.9.2.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.2, Platform Linux-5.15.107+-x86_64-with-glibc2.31
2023-06-19 14:49:59 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.14, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.10.12 (main, Jun  7 2023, 12:45:35) [GCC 9.4.0], pyOpenSSL 23.2.0 (OpenSSL 3.1.0 14 Mar 2023), cryptography 40.0.2, Platform Linux-5.15.107+-x86_64-with-glibc2.31
INFO:scrapy.crawler:Overridden settings:
{'LOG_LEVEL': 20, 'USER_AGENT': 'Chrome/114.0'}
2023-06-19 14:49:59 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 20, 'USER_AGENT': 'Chrome/114.0'}


See the documentation of the 'REQUEST_FIN

# Fonctions

**Quels sont les légumes de la saison en cours ?**

Interrogation basée sur la date du jour.

In [3]:
import pandas as pd
saisons = pd.read_json('/content/src/saisons.json')
display(saisons)

INFO:numexpr.utils:NumExpr defaulting to 2 threads.
2023-06-19 14:50:00 [numexpr.utils] INFO: NumExpr defaulting to 2 threads.


Unnamed: 0,month,legumes,fruits
0,Juin,"[Ail, Artichaut, Asperge blanche, Asperge vert...","[Banane, Cerise, Citron, Fraise, Fraise des bo..."
1,Juillet,"[Ail, Artichaut, Aubergine, Bette, Betterave r...","[Abricot, Airelle, Banane, Cassis, Cerise, Cit..."
2,Aout,"[Ail, Artichaut, Aubergine, Bette, Betterave r...","[Abricot, Airelle, Amande, Banane, Cassis, Cer..."
3,Septembre,"[Artichaut, Aubergine, Bette, Betterave rouge,...","[Amande, Avocat, Banane, Chataigne, Citron, Co..."
4,Octobre,"[Bette, Betterave rouge, Brocoli, Carotte, Cat...","[Avocat, Banane, Chataigne, Citron, Coing, Dat..."
5,Novembre,"[Carotte, Catalonia, Celeri, Celeri branche, C...","[Ananas, Avocat, Banane, Chataigne, Citron, Cl..."
6,Decembre,"[Carotte, Catalonia, Celeri, Celeri branche, C...","[Ananas, Avocat, Banane, Chataigne, Citron, Cl..."
7,Janvier,"[Carotte, Celeri, Celeri branche, Celeri rave,...","[Ananas, Avocat, Banane, Citron, Clementine, D..."
8,Fevrier,"[Carotte, Celeri, Celeri branche, Celeri rave,...","[Ananas, Avocat, Banane, Citron, Clementine, D..."
9,Mars,"[Carotte, Celeri, Chou blanc, Chou frise, Chou...","[Ananas, Avocat, Banane, Citron, Datte, Fruit ..."


In [4]:
# Obtenir le mois en cours en toutes lettres pour interroger ensuite le json

from datetime import datetime

current_month = datetime.now().month
#print("Mois en cours :", current_month)
current_month = str(current_month)

months_calendar = {'1':'Janvier', '2':'Fevrier', '3':'Mars', '4':'Avril', '5':'Mai', '6':'Juin',
          '7':'Juillet', '8':'Aout', '9':'Septembre', '10':'Octobre', '11':'Novembre', '12':'Decembre'}

for month in months_calendar:
  if month == current_month:
    m_month = months_calendar[month]
    print(m_month)


Juin


# Requêtes sur le dico json

In [42]:
import json

# Opening JSON file
f = open('src/saisons.json')

# returns JSON object as
# a dictionary
calendar = json.load(f)

# Closing file
f.close()

In [43]:
calendar

[{'month': 'Juin',
  'legumes': ['Ail',
   'Artichaut',
   'Asperge blanche',
   'Asperge verte',
   'Aubergine',
   'Bette',
   'Betterave rouge',
   'Brocoli',
   'Chou blanc',
   'Chou frise',
   'Chou Romanesco',
   'Chou rouge',
   'Chou-chinois',
   'Chou-fleur',
   'Chou-rave',
   'Concombre',
   'Courgette',
   'Epinard',
   'Fenouil',
   'Haricot',
   'Laitue romaine',
   'Navet',
   'Petit oignon blanc',
   'Petit pois',
   'Pois mange-tout',
   'Poivron',
   'Pomme de terre',
   'Radis',
   'Radis long',
   'Rhubarbe'],
  'fruits': ['Banane',
   'Cerise',
   'Citron',
   'Fraise',
   'Fraise des bois',
   'Framboise',
   'Fruit de la passion',
   'Groseille',
   'Groseille i maquereau',
   'Litchi',
   'Mangue',
   'Melon',
   'Nectarine',
   'Papaye',
   'Pasteque',
   'Peche',
   'Tomate',
   'Tomate charnue',
   'Tomate Peretti']},
 {'month': 'Juillet',
  'fruits': ['Abricot',
   'Airelle',
   'Banane',
   'Cassis',
   'Cerise',
   'Citron',
   'Fraise',
   'Fraise des bo

In [76]:
# Liste des fruits de saison
fruits_de_saison = pd.json_normalize(calendar)
fruits_de_saison[fruits_de_saison['month'] == m_month]['fruits'][0]

['Banane',
 'Cerise',
 'Citron',
 'Fraise',
 'Fraise des bois',
 'Framboise',
 'Fruit de la passion',
 'Groseille',
 'Groseille i maquereau',
 'Litchi',
 'Mangue',
 'Melon',
 'Nectarine',
 'Papaye',
 'Pasteque',
 'Peche',
 'Tomate',
 'Tomate charnue',
 'Tomate Peretti']

In [77]:
# Liste des légumes de saison
legumes_de_saison = pd.json_normalize(calendar)
legumes_de_saison[legumes_de_saison['month'] == m_month]['legumes'][0]

['Ail',
 'Artichaut',
 'Asperge blanche',
 'Asperge verte',
 'Aubergine',
 'Bette',
 'Betterave rouge',
 'Brocoli',
 'Chou blanc',
 'Chou frise',
 'Chou Romanesco',
 'Chou rouge',
 'Chou-chinois',
 'Chou-fleur',
 'Chou-rave',
 'Concombre',
 'Courgette',
 'Epinard',
 'Fenouil',
 'Haricot',
 'Laitue romaine',
 'Navet',
 'Petit oignon blanc',
 'Petit pois',
 'Pois mange-tout',
 'Poivron',
 'Pomme de terre',
 'Radis',
 'Radis long',
 'Rhubarbe']

In [82]:
# Est-ce qu'un fruit est de saison en ce mois de juin ?
#Tests : Pamplemousse/Cassis qui n'est pas de saison OU Concombre qui est de saison

def saison(fruit):
  season = False

  legumes_saison = legumes_de_saison[legumes_de_saison['month'] == m_month]['legumes']
  fruits_saison = fruits_de_saison[legumes_de_saison['month'] == m_month]['fruits']

  for ingredient in legumes_saison:
    if fruit in ingredient:
      #print(f'{fruit} est de saison')
      season = True

  for ingredient in fruits_saison:
    if fruit in ingredient:
      #print(f'{fruit} est de saison')
      season = True

  return season

saison('Courge')

False