# Complete data scraping notebook

This notebook gathers the code and functions used for the web scraping that produced all the haiku data used in this project. 

This is an important part for me, as I had never done web scraping before this project, and it was an opportunity to discover and learn some very interesting new things!

This is the part of the project that could most easily be improved: gaining more experience in this domain, which is new to me, would be beneficial, and having more data would help my models learn better and give better predictions.

In [None]:
import time

import bs4
from bs4 import BeautifulSoup
from langdetect import detect
import pandas as pd
import requests
import re

## Herons nest

In this section, we are scraping data from the Herons Nest website.

This step parses each archive page of the site and collects the haikus it finds, storing them in a list before saving them in a csv file. 

This website was found in the list from this GitHub repository during my research on the subject: [haiku-scraper's GitHub](https://github.com/ytixu/haiku-scraper/tree/master).

This is the website used here : [Herons Nest](https://theheronsnest.com/June2024/index.html). 

In [None]:
def parsing_archive_page(archive_page_link):
    """Collect the three-line haikus from every numbered page of one archive.

    Pages are requested as ``<archive_page_link>1.html``, ``...2.html`` and so
    on. The walk ends at the first page that answers with an error status or
    that is served from a different URL than the one requested (a redirect).

    Args:
        archive_page_link: URL prefix to which the page number and ``.html``
            are appended.

    Returns:
        A list with one ``[line_1, line_2, line_3]`` list per haiku found.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level, including when the timeout expires.
    """
    request_headers = {"User-Agent": "XY"}
    archive_haikus_list = []
    page_index = 1

    while True:
        archive_link = archive_page_link + str(page_index) + ".html"
        # print(f"Parsing page {archive_link}...")
        # Without a timeout a single stalled connection would block the scrape forever.
        response_archive = requests.get(archive_link, headers=request_headers, timeout=30)

        # Any error status ends the walk, not only 404; otherwise a server that
        # answers every page with e.g. 403 or 500 would keep this loop running
        # forever. A URL mismatch means we were redirected away from the page.
        redirected = response_archive.request.url != archive_link
        if not response_archive.ok or redirected:
            break

        archive_soup = bs4.BeautifulSoup(response_archive.text)
        for haiku in archive_soup.body.find_all('p', {'class': "haiku"}):
            haiku_corps = haiku.text.split('\n')
            # Only paragraphs made of exactly three lines are kept.
            if len(haiku_corps) == 3:
                # Drop non-breaking spaces, then trim the surrounding whitespace.
                archive_haikus_list.append(
                    [haiku_text.replace('\xa0', '').strip() for haiku_text in haiku_corps]
                )

        page_index += 1

    return archive_haikus_list


def parse_heronsnest_haikus(verbose=False):
    """Scrape the haikus of every issue linked from the Herons Nest archives page.

    Each link found in the first ``<section>`` of the archives page is split on
    ``/``; its fourth component is used as the issue name and as the folder in
    which the ``haiku-p<n>.html`` pages are looked up.

    Args:
        verbose: when true, print the issue name and the number of haikus
            parsed for each issue.

    Returns:
        A dict mapping each issue name to its list of three-line haikus.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level, including when the timeout expires.
    """
    response = requests.get(
        "https://theheronsnest.com/archives.html",
        headers={"User-Agent": "XY"},
        timeout=30,  # never wait indefinitely on a stalled connection
    )
    soup = bs4.BeautifulSoup(response.text)
    archives_href = [i['href'] for i in soup.body.section.find_all('a', href=True)]

    haikus_dict = {}

    for href_month in archives_href:
        splitted_link = href_month.split("/")

        # An absolute link splits into scheme, '', host, issue folder, ...
        # Shorter links (relative paths, anchors) have no issue folder to
        # read, so they are skipped instead of raising IndexError.
        if len(splitted_link) < 4:
            continue

        issue_name = splitted_link[3]
        if issue_name == "Archives":
            continue

        # Setting up the link prefix for the paginated requests
        archive_page_link = splitted_link[0] + "//" + splitted_link[2] + "/" + issue_name + "/haiku-p"
        archive_haikus_list = parsing_archive_page(archive_page_link)

        if not archive_haikus_list:
            print("Warning : Must do a different parsing for this page !")

        haikus_dict[issue_name] = archive_haikus_list

        if verbose:
            print("Paring haikus of : " + issue_name)
            print(f"Parsed {len(archive_haikus_list)} haikus !\n")

    return haikus_dict

# Run the Herons Nest scrape; this performs the HTTP requests when the cell is executed.
res_parsing_heronsnest = parse_heronsnest_haikus()


And now, we will be saving the resulting data in the corresponding csv file !

In [None]:
def heronsnest_to_csv(parsed_haikus):
    """Flatten the per-issue haikus into one table tagged with their source.

    Args:
        parsed_haikus: mapping whose values are lists of three-item haikus;
            the haikus are concatenated in the mapping's iteration order.

    Returns:
        A DataFrame with the columns ``line_1``, ``line_2``, ``line_3`` and a
        ``source`` column set to ``"herons_nest"`` on every row.
    """
    all_haikus = [
        haiku
        for issue in parsed_haikus
        for haiku in parsed_haikus[issue]
    ]
    table = pd.DataFrame(all_haikus, columns=["line_1", "line_2", "line_3"])
    return table.assign(source="herons_nest")

# Build the Herons Nest table and write it to heronsnest_csv.csv without the index column.
heronsnest_csv = heronsnest_to_csv(res_parsing_heronsnest)
heronsnest_csv.to_csv('heronsnest_csv.csv', index=False)

## Temps libre

In this section, we are scraping data from the Temps Libre website.

This step parses each archive page of the site and collects the haikus it finds, storing them in a list before saving them in a csv file. 

This website was found in the list from this GitHub repository during my research on the subject: [haiku-scraper's GitHub](https://github.com/ytixu/haiku-scraper/tree/master).

This is the website used here : [Temps libre](https://www.tempslibres.org/tl/tlphp/dblang.php?lg=e). 

### French Haikus

In [None]:
def french_haikus_tempslibre_parsing(verbose=False):
    """Scrape the three-line haikus from the French authors listing of Temps Libre.

    Every link found in the ``<article>`` of the listing page is fetched, and
    each ``<p class="haiku">`` whose text splits into exactly three lines is
    cleaned. A haiku is kept only if all three lines are still non-empty
    after cleaning.

    Args:
        verbose: when true, print every kept haiku and every skipped one.

    Returns:
        A list with one ``[line_1, line_2, line_3]`` list per haiku kept.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level, including when the timeout expires.
    """
    request_headers = {"User-Agent": "XY"}
    haikus_list = []
    # A timeout keeps one stalled connection from blocking the scrape forever.
    response = requests.get(
        "https://www.tempslibres.org/tl/tlphp/dbauteursl.php?lang=fr&lg=e",
        headers=request_headers,
        timeout=30,
    )
    soup = bs4.BeautifulSoup(response.text)
    archives_href = [i['href'] for i in soup.article.find_all('a', href=True)]

    for href in archives_href:
        link = "https://www.tempslibres.org/tl/tlphp/" + href
        href_response = requests.get(link, headers=request_headers, timeout=30)
        href_soup = bs4.BeautifulSoup(href_response.text)

        for haikus_href in href_soup.body.find_all('p', {"class": "haiku"}):
            haiku_corps = haikus_href.text.split('\n')
            if len(haiku_corps) != 3:
                continue

            # Remove non-breaking spaces and carriage returns.
            haiku_corps = [re.sub(r'[\xa0\r]', '', haiku_text) for haiku_text in haiku_corps]
            # NOTE(review): this pattern holds an invisible character after the
            # backslash (it looks like U+200F RIGHT-TO-LEFT MARK) - confirm and
            # consider spelling it as an explicit escape.
            haiku_corps = [re.sub(r'\‏', '', haiku_text) for haiku_text in haiku_corps]
            # Trim like the other scrapers do, so whitespace-only lines count as empty.
            haiku_corps = [haiku_text.strip() for haiku_text in haiku_corps]
            haiku_corps = list(filter(None, haiku_corps))

            # Anything other than three remaining lines is incomplete; testing
            # only for "== 2" would let haikus with 0 or 1 line through.
            if len(haiku_corps) != 3:
                if verbose:
                    print("Error with haiku : ")
                    print(haiku_corps)
                    print("SKIPPING\n")
                continue

            haikus_list.append(haiku_corps)
            if verbose:
                print(haiku_corps)

    return haikus_list

# Run the French Temps Libre scrape; this performs the HTTP requests when the cell is executed.
res_parsing_tempslibre_french = french_haikus_tempslibre_parsing()

And now, we will be saving the resulting data in the corresponding csv file !

In [None]:
def french_tempslibre_to_csv(parsed_haikus):
    """Turn the scraped French haikus into a table tagged with their source.

    Args:
        parsed_haikus: list of three-item haikus.

    Returns:
        A DataFrame with the columns ``line_1``, ``line_2``, ``line_3`` and a
        ``source`` column set to ``"tempslibre"`` on every row.
    """
    line_columns = ["line_1", "line_2", "line_3"]
    table = pd.DataFrame(parsed_haikus, columns=line_columns)
    return table.assign(source="tempslibre")

# Build the French Temps Libre table and write it to french_tempslibre_csv.csv without the index column.
french_tempslibre_csv = french_tempslibre_to_csv(res_parsing_tempslibre_french)
french_tempslibre_csv.to_csv('french_tempslibre_csv.csv', index=False)

### English haikus

In [None]:
def parse_languages_haikus(texts_list):
    """Split a list of text lines into French and English three-line haikus.

    Empty strings act as separators between groups of lines. Each group of
    exactly three lines is passed to ``detect`` and stored in the French or
    the English list; groups of any other size, and groups detected as
    another language, are dropped.

    Args:
        texts_list: iterable of text lines, with ``''`` between haikus.

    Returns:
        A ``(fr_list, en_list)`` tuple; each list holds three-item haikus.
    """
    fr_list, en_list = [], []
    haiku_list = []

    for text in texts_list:
        if text == '':
            # A separator closes the current group.
            _store_haiku_by_language(haiku_list, fr_list, en_list)
            haiku_list = []
        else:
            haiku_list.append(text)

    # The last group is not followed by a separator.
    _store_haiku_by_language(haiku_list, fr_list, en_list)

    return fr_list, en_list


def _store_haiku_by_language(haiku_lines, fr_list, en_list):
    """Append a three-line haiku to the list matching its detected language."""
    # Empty or incomplete groups are ignored before any detection: langdetect
    # raises on text it cannot extract features from, such as an empty string
    # produced by a leading or doubled separator.
    if len(haiku_lines) != 3:
        return

    # Imported here because only this branch needs the exception type.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        lang = detect(''.join(line + '\n' for line in haiku_lines))
    except LangDetectException:
        # Undetectable text (e.g. only digits or punctuation) must not abort
        # the whole scrape; the haiku is simply not classified.
        return

    if lang == "fr":
        fr_list.append(haiku_lines)
    elif lang == "en":
        en_list.append(haiku_lines)

def english_haikus_tempslibre_parsing(verbose=False):
    """Scrape the haikus from the English authors listing of Temps Libre.

    Every link found in the ``<article>`` of the listing page is fetched. The
    text of each ``<p class="haiku">`` is split into lines and cleaned; when
    it yields 3 or 7 lines it is handed to ``parse_languages_haikus``, and the
    first French and the first English haiku it returns are kept.

    Args:
        verbose: when true, print the cleaned lines of every paragraph.

    Returns:
        A ``(french_haikus_list, english_haikus_list)`` tuple.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level, including when the timeout expires.
    """
    request_headers = {"User-Agent": "XY"}
    french_haikus_list = []
    english_haikus_list = []
    # A timeout keeps one stalled connection from blocking the scrape forever.
    response = requests.get(
        "https://www.tempslibres.org/tl/tlphp/dbauteursl.php?lang=en&lg=e",
        headers=request_headers,
        timeout=30,
    )
    soup = bs4.BeautifulSoup(response.text)
    archives_href = [i['href'] for i in soup.article.find_all('a', href=True)]

    for href in archives_href:
        link = "https://www.tempslibres.org/tl/tlphp/" + href
        href_response = requests.get(link, headers=request_headers, timeout=30)
        href_soup = bs4.BeautifulSoup(href_response.text)

        for haikus_href in href_soup.body.find_all('p', {"class": "haiku"}):
            # One pass removes non-breaking spaces, carriage returns and parentheses.
            haiku_corps = [
                re.sub(r'[\xa0\r()]', '', haiku_text)
                for haiku_text in haikus_href.text.split('\n')
            ]

            # 3 lines: a single haiku; 7 lines: two haikus around one separator line.
            if len(haiku_corps) in (3, 7):
                french_found, english_found = parse_languages_haikus(haiku_corps)
                if french_found:
                    french_haikus_list.append(french_found[0])
                if english_found:
                    english_haikus_list.append(english_found[0])

            if verbose:
                print(haiku_corps)

    return french_haikus_list, english_haikus_list

# Run the English Temps Libre scrape (verbose off); the result is a (french, english) pair of lists.
res_parsing_tempslibre_english = english_haikus_tempslibre_parsing(False)

And now, we will be saving the resulting data in the corresponding csv file !

In [None]:
def english_tempslibre_to_csv(parsed_haikus):
    """Build one source-tagged table per language from the scraped haikus.

    Args:
        parsed_haikus: sequence whose item 0 is the list of French haikus and
            whose item 1 is the list of English haikus.

    Returns:
        A ``(csv_french, csv_english)`` tuple of DataFrames, each with the
        columns ``line_1``, ``line_2``, ``line_3`` and a ``source`` column set
        to ``"tempslibre"``.
    """
    french_haikus = parsed_haikus[0]
    english_haikus = parsed_haikus[1]

    tables = tuple(
        pd.DataFrame(haikus, columns=["line_1", "line_2", "line_3"]).assign(source="tempslibre")
        for haikus in (french_haikus, english_haikus)
    )
    return tables

# Build the French and English tables from the English listing and write each to its own csv file without the index column.
french_2_tempslibre_csv, english_tempslibre_csv = english_tempslibre_to_csv(res_parsing_tempslibre_english)
french_2_tempslibre_csv.to_csv('french_2_tempslibre_csv.csv', index=False)
english_tempslibre_csv.to_csv('english_tempslibre_csv.csv', index=False)

## Modern Haikus

In this section, we are scraping data from the Modern Haiku website.

This step parses each archive page of the site and collects the haikus it finds, storing them in a list before saving them in a csv file. 

This website was found in the list from this GitHub repository during my research on the subject: [haiku-scraper's GitHub](https://github.com/ytixu/haiku-scraper/tree/master).

This is the website used here : [Modern Haiku](http://www.modernhaiku.org/previousissue.html). 

In [None]:
def parse_modern_haikus():
    """Scrape the three-line haikus from the previous issues of Modern Haiku.

    The links of the first nested table of the "previous issues" page are
    read; links containing ``MH-Archive`` are ignored. For every other link,
    the part of the href before the first ``/`` is taken as the issue folder
    and its ``haiku.html`` page is fetched.

    Returns:
        A list with one ``[line_1, line_2, line_3]`` list per haiku found.

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level, including when the timeout expires.
    """
    request_headers = {"User-Agent": "XY"}
    # A timeout keeps one stalled connection from blocking the scrape forever.
    response = requests.get(
        "https://www.modernhaiku.org/previousissue.html",
        headers=request_headers,
        timeout=30,
    )
    soup = bs4.BeautifulSoup(response.text)

    # Keep each issue folder once, in page order, so an issue that is linked
    # several times is neither downloaded nor appended more than once.
    issue_folders = list(dict.fromkeys(
        href_link['href'].split("/")[0]
        for href_link in soup.table.table.find_all('a', href=True)  # Can also be : soup.select('table table td p a')
        if "MH-Archive" not in href_link['href']
    ))

    haikus_list = []
    for issue_folder in issue_folders:
        href_response = requests.get(
            "https://www.modernhaiku.org/" + issue_folder + "/haiku.html",
            headers=request_headers,
            timeout=30,
        )
        href_soup = bs4.BeautifulSoup(href_response.text)

        for haiku in href_soup.select('table table table p'):
            # Trim every line and drop the empty ones before counting.
            haiku_corps = [haiku_text.strip() for haiku_text in haiku.text.split('\n')]
            haiku_corps = list(filter(None, haiku_corps))

            if len(haiku_corps) == 3:
                # One pass removes non-breaking spaces, carriage returns and parentheses.
                haikus_list.append(
                    [re.sub(r'[\xa0\r()]', '', haiku_text) for haiku_text in haiku_corps]
                )

    return haikus_list

# Run the Modern Haiku scrape; this performs the HTTP requests when the cell is executed.
res_parsing_modern_haikus = parse_modern_haikus()

And now, we will be saving the resulting data in the corresponding csv file !

In [None]:
def modern_haikus_to_csv(parsed_haikus):
    """Turn the scraped Modern Haiku entries into a table tagged with their source.

    Args:
        parsed_haikus: list of three-item haikus.

    Returns:
        A DataFrame with the columns ``line_1``, ``line_2``, ``line_3`` and a
        ``source`` column set to ``"modern_haikus"`` on every row.
    """
    line_columns = ["line_1", "line_2", "line_3"]
    return pd.DataFrame(parsed_haikus, columns=line_columns).assign(source="modern_haikus")

# Build the Modern Haiku table and write it to modern_haikus_csv.csv without the index column.
modern_haikus_csv = modern_haikus_to_csv(res_parsing_modern_haikus)
modern_haikus_csv.to_csv('modern_haikus_csv.csv', index=False)