In [70]:
# Import necessary libraries
import requests 
from bs4 import BeautifulSoup 
import csv
import pandas as pd
import PyPDF2
import io

In [71]:
def extpdf(pdf_url, timeout=30):
  """
  Download a PDF and return the text of all its pages concatenated.

  Parameters:
      pdf_url (str): The URL of the PDF file to extract text from.
      timeout (float): Seconds to wait for the server before giving up.

  Returns:
      str: The extracted text, or the sentinel string "Not pdf" when the
      URL cannot be downloaded or its content cannot be read as a PDF.

  Raises:
      Download and parsing errors are not raised; they are reported through
      the "Not pdf" return value. KeyboardInterrupt and SystemExit still
      propagate so a long-running scrape can be stopped.

  Example:
      >>> text = extpdf("https://www.example.com/sample.pdf")
      >>> print(text)
      This is a sample PDF file.
  """
  try:
    # Make a request to the PDF file URL and get the PDF content
    response = requests.get(pdf_url, timeout=timeout)
    pdf_file = io.BytesIO(response.content)
    # Create a PDF reader object and extract text from each page
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # A page that yields no text (None) contributes an empty string instead
    # of making the whole document fall back to "Not pdf".
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)
  except Exception:
    # Return "Not pdf" if the URL could not be fetched or parsed as a PDF
    return "Not pdf"

In [72]:
# Base URL prepended to the site-relative hrefs found on the listing pages
url2 = 'https://www.bancaditalia.it'
# Create empty lists to store the speeches information
speaker_names = []
dates = []
contents = []
titles = []
urls = []

# Open a CSV file to store the speeches information.
# newline='' is what the csv module expects for the file objects it writes to;
# without it, line breaks inside the extracted PDF text can be written wrongly.
with open('speeches.csv', 'w', encoding='utf-8', newline='') as file:
    writer = csv.writer(file)

    # Write the header row to the CSV file
    writer.writerow(['speaker', 'date', 'content', 'title','url'])

    # Loop through listing pages 1..35 of the speeches section
    for page_number in range(1,36):
        url = f'https://www.bancaditalia.it/chi-siamo/funzioni-governance/direttorio/ignazio-visco/interventi/index.html?com.dotmarketing.htmlpage.language=1&dotcache=refresh&page={page_number}&dotcache=refresh'

        # Make a request to the speeches page and parse the HTML content
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        container = soup.find(id="listacom")
        if container is None:
            # soup.find found no such element; skip the page rather than
            # failing with AttributeError on None below.
            print(f'No "listacom" element found on page {page_number}; skipping')
            continue

        # Extract the speaker names, dates, titles, and PDF URLs from the HTML content
        speaker_name = container.find_all("span", class_="link-int link-int-bold")
        date = container.find_all("span", class_="link-date")
        title = container.find_all("span",class_="link-title")
        pdf_urls = container.find_all("a")

        # Loop through the speeches on the current page and extract their information.
        # A separate index name is used so the page counter is not shadowed.
        for idx in range(len(speaker_name)):
            speaker_text = speaker_name[idx].text
            title_text = title[idx].text
            date_text = date[idx].text

            # Extract the speaker name, date, and title
            speaker_names.append(speaker_text)
            titles.append(title_text)
            dates.append(date_text)

            # Extract the PDF URL; the last character of the href is dropped
            # before it is joined to the base URL.
            pdf_url = pdf_urls[idx].get('href')
            pdf_link = url2 + pdf_url[:-1]

            # Download and parse the PDF once and reuse the text for both the
            # in-memory list and the CSV row.
            content = extpdf(pdf_link)
            contents.append(content)

            # NOTE(review): this list keeps the untrimmed href while the CSV
            # row below stores the trimmed link -- confirm which is intended.
            urls.append(url2+pdf_url)

            # Write the speeches information to the CSV file
            writer.writerow([speaker_text, date_text, content, title_text, pdf_link])

In [73]:
# Read the scraped data into a DataFrame.
# The same relative path is used as when the file was written, so this does
# not depend on the working directory being /content.
df = pd.read_csv('speeches.csv')
df.head()

Unnamed: 0,speaker,date,content,title,url
0,"by Ignazio Visco, Governor of the Bank of Italy",Data di pubblicazione:20-04-2023,Monetary Policy and the Return of Inflation. \...,Monetary Policy and the Return of Inflation. Q...,https://www.bancaditalia.it/pubblicazioni/inte...
1,"by Ignazio Visco, Governor of the Bank of Italy",Data di pubblicazione:18-04-2023,TMEuropa e Italia: prosperità nell’unione e ne...,Europa e Italia: prosperità nell'unione e nell...,https://www.bancaditalia.it/pubblicazioni/inte...
2,"by Ignazio Visco, Governor of the Bank of Italy",Data di pubblicazione:12-04-2023,DEVELOPMENT COMMITTEE \n(Joint Ministerial Co...,Statement by Ignazio Visco at the 107th Meetin...,https://www.bancaditalia.it/pubblicazioni/inte...
3,"by Ignazio Visco, Governor of the Bank of Italy",Data di pubblicazione:31-03-2023,TMThe return of inflation is severely affectin...,"Inflation, Monetary Policy and Inequalities. S...",https://www.bancaditalia.it/pubblicazioni/inte...
4,"by Ignazio Visco, Governor of the Bank of Italy",Data di pubblicazione:31-03-2023,Address by the GovernorOrdinary General Meetin...,Address by the Governor Ignazio Visco,https://www.bancaditalia.it/pubblicazioni/inte...


In [74]:
# Remove the rows whose content is the "Not pdf" sentinel: each one duplicates
# another row, just in a different (non-PDF) format.
not_pdf_rows = df.index[df['content'] == 'Not pdf']
df = df.drop(index=not_pdf_rows)

In [75]:
# Summarise the frame (row count, non-null counts, dtypes) after dropping the "Not pdf" rows
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 0 to 346
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   speaker  255 non-null    object
 1   date     255 non-null    object
 2   content  255 non-null    object
 3   title    255 non-null    object
 4   url      255 non-null    object
dtypes: object(5)
memory usage: 12.0+ KB


In [76]:
# Convert 'date' to datetime: strip the "Data di pubblicazione:" label first,
# then parse what is left as day-month-year.
date_text = df['date'].str.replace('Data di pubblicazione:', '')
df['date'] = pd.to_datetime(date_text, format='%d-%m-%Y')

In [77]:
# Summarise the frame again to check the dtype of the 'date' column after conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255 entries, 0 to 346
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   speaker  255 non-null    object        
 1   date     255 non-null    datetime64[ns]
 2   content  255 non-null    object        
 3   title    255 non-null    object        
 4   url      255 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 12.0+ KB


In [78]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,speaker,date,content,title,url
0,"by Ignazio Visco, Governor of the Bank of Italy",2023-04-20,Monetary Policy and the Return of Inflation. \...,Monetary Policy and the Return of Inflation. Q...,https://www.bancaditalia.it/pubblicazioni/inte...
1,"by Ignazio Visco, Governor of the Bank of Italy",2023-04-18,TMEuropa e Italia: prosperità nell’unione e ne...,Europa e Italia: prosperità nell'unione e nell...,https://www.bancaditalia.it/pubblicazioni/inte...
2,"by Ignazio Visco, Governor of the Bank of Italy",2023-04-12,DEVELOPMENT COMMITTEE \n(Joint Ministerial Co...,Statement by Ignazio Visco at the 107th Meetin...,https://www.bancaditalia.it/pubblicazioni/inte...
3,"by Ignazio Visco, Governor of the Bank of Italy",2023-03-31,TMThe return of inflation is severely affectin...,"Inflation, Monetary Policy and Inequalities. S...",https://www.bancaditalia.it/pubblicazioni/inte...
4,"by Ignazio Visco, Governor of the Bank of Italy",2023-03-31,Address by the GovernorOrdinary General Meetin...,Address by the Governor Ignazio Visco,https://www.bancaditalia.it/pubblicazioni/inte...


In [79]:
# Save the DataFrame to a CSV file; index=False leaves the row index out of the file
df.to_csv('cleaned.csv', index=False)