In [24]:
import configparser
import os
import json
import pandas as pd
import wikipediaapi
import requests
from bs4 import BeautifulSoup

In [36]:
from langchain.llms import OpenAI
from langchain import LLMChain
from langchain import PromptTemplate
from langchain.document_loaders import TextLoader

In [3]:
config = configparser.ConfigParser()
config.read('config.ini')

# Define the API Key.
API_KEY = config['OPENAI_API']['API_KEY']
os.environ['OPENAI_API_KEY'] = API_KEY

In [21]:
from sqlalchemy import create_engine

# Read the MySQL configuration from the JSON file
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Extract MySQL connection details
mysql_config = config.get('mysql', {})
username = mysql_config.get('username', 'default_username')
password = mysql_config.get('password', 'default_password')
host = mysql_config.get('host', 'localhost')
database_name = mysql_config.get('database_name', 'your_database')

# Create the MySQL database connection string
db_url = f"mysql+mysqlconnector://{username}:{password}@{host}/{database_name}"

# Create an SQLAlchemy engine
engine = create_engine(db_url)

# Use the engine to connect to the database
connection = engine.connect()

# Specify the SQL query to retrieve data from a table
query = "SELECT * FROM rome_geo_tags"

# Use Pandas to read data from the database into a DataFrame
df = pd.read_sql(query, connection)

# Close the database connection
connection.close()

df.head()

Unnamed: 0,gt_id,gt_lat,gt_lon,gt_page_id,url
0,306683262,41.884,12.491,48234033,http://en.wikipedia.org/?curid=48234033
1,306683263,41.886,12.495,48234033,http://en.wikipedia.org/?curid=48234033
2,306683264,41.891389,12.480278,48234033,http://en.wikipedia.org/?curid=48234033
3,306683265,41.89385,12.48194,48234033,http://en.wikipedia.org/?curid=48234033
4,306683266,41.907222,12.498611,48234033,http://en.wikipedia.org/?curid=48234033


In [23]:
wiki = wikipediaapi.Wikipedia(user_agent="krystek.pietrzak@gmail.com", language='en', extract_format=wikipediaapi.ExtractFormat.WIKI)

In [40]:
def getText(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract data from the webpage and append it to scraped_data
        title = soup.find_all('title')[0].text[:-12]
        print(f'Title of the page: {title}')
        
        page_py = wiki.page(title)
        if page_py.exists():
            print(f"Page ID; {page_py.pageid}")
        else:
            print('Page does not exist')
    else:
        print(f"Failed to fetch data from {url}")
    
    with open('text.txt', 'w') as f:
        f.write(page_py.text)


In [41]:
getText(df.iloc[766]['url'])

Title of the page: Line B (Rome Metro)
Page ID; 20974602


In [42]:
# Document Loader
loader = TextLoader('text.txt')
documents = loader.load()

In [43]:
import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

In [44]:
print(wrap_text_preserve_newlines(str(documents[0])))

page_content="Line B is a metro line serving Rome, Italy, and part of the Rome Metro. Despite its name, Line B
was the first line to be built in the city. It crosses Rome diagonally from north-east, starting at Rebibbia
and at Jonio stations, to south, terminating at Laurentina, in the EUR district. It crosses Line A at Termini
station. The line has 26 stations and is shown in blue on Metro maps.\n\nOverview\nIts first service runs at
05:30 and its last at 23:30. From 18 January 2008, the last Friday and Saturday service runs at 1:30. It
carries 345,000 passengers a day and runs 377 trains a day, with a peak time frequency of one train every 3
minutes in the shared section and 4,5 minutes in the branches. Every 6 minutes at other times, at a maximum
frequency of 9 minutes at the most off-peak times.\n\nHistory\nDespite its name, Line B was the first metro
line in Rome.  The line was planned during the 1930s by the Fascist government in search of a rapid connection
between the main trai

In [45]:
# Text Splitter
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

Created a chunk of size 1537, which is longer than the specified 1000


In [46]:
len(docs)

4