# **Retrieve Company and Country Information**

In [2]:
!pip install openai langchain langchain-community langchain-openai langgraph python-dotenv pypdf requests pandas semantic-router pinecone-client serpapi google-search-results pycountry wikipedia-api

Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langgraph
  Downloading langgraph-0.2.70-py3-none-any.whl.metadata (17 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting semantic-router
  Downloading semantic_router-0.0.72-py3-none-any.whl.metadata (10 kB)
Collecting pinecone-client
  Downloading pinecone_client-5.0.1-py3-none-any.whl.metadata (19 kB)
Collecting serpapi
  Downloading serpapi-0.1.5-py2.py3-none-any.whl.metadata (10 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langchain-core<0.4.0,>=0.3.33 (from langchain)
  Downloading langchain_core-0.3.34-py3-none-any.whl.me

In [10]:
import os
import json
import wikipediaapi
import pycountry


USER_AGENT = "MyCountryInfoScript/1.0 (habtamufeyera95@gmail.com)"


def _title_has_word_starting_with(title, keyword):
    """Returns True when `keyword` occurs in `title` at the start of a word.

    Plain substring matching would let "culture" hit "agriculture"; requiring a
    non-alphanumeric character (or the start of the title) before the match avoids that.
    """
    position = title.find(keyword)
    while position != -1:
        if position == 0 or not title[position - 1].isalnum():
            return True
        position = title.find(keyword, position + 1)
    return False


def get_country_info(country_name):
    """
    Fetches key facts about a country from Wikipedia, first by looking for structured sections,
    and then scanning the full text if necessary.

    Top-level section titles are matched case-insensitively against keywords that must
    begin a word of the title. When several sections match the same field, the last one
    in page order is kept. Fields that are still empty afterwards are filled by
    `search_full_text_for_section` on the full page text.

    Args:
        country_name (str): The name of the country to search for.

    Returns:
        dict: Dictionary with the keys "Country", "Summary", "Economy", "Culture" and
            "Business Environment", or None when the Wikipedia page does not exist
            (a message is printed in that case).
    """
    # Pass the user agent and the language by keyword so that neither can land in the
    # wrong positional slot of the client constructor.
    wiki_wiki = wikipediaapi.Wikipedia(user_agent=USER_AGENT, language='en')

    # Fetch the page for the country
    page = wiki_wiki.page(country_name)

    if not page.exists():
        print(f"Page for '{country_name}' does not exist on Wikipedia.")
        return None

    # Extract general country info; section-based fields start empty
    country_info = {
        "Country": country_name,
        "Summary": page.summary[:2000],
        "Economy": None,
        "Culture": None,
        "Business Environment": None
    }

    business_keywords = ("business", "investment", "trade", "corporate")
    for section in page.sections:
        title = section.title.lower()

        if _title_has_word_starting_with(title, "economy"):
            country_info["Economy"] = section.text[:2000]
        elif _title_has_word_starting_with(title, "culture"):
            country_info["Culture"] = section.text[:2000]
        elif any(_title_has_word_starting_with(title, keyword) for keyword in business_keywords):
            country_info["Business Environment"] = section.text[:2000]

    # Fall back to a keyword scan of the whole page for anything still empty.
    # The page text is read at most once and shared by all fallbacks.
    full_text = None
    fallback_keywords = (
        ("Economy", "economy"),
        ("Culture", "culture"),
        ("Business Environment", "business"),
    )
    for key, keyword in fallback_keywords:
        if not country_info[key]:
            if full_text is None:
                full_text = page.text
            country_info[key] = search_full_text_for_section(full_text, keyword)

    # The fallback helper may report "nothing found" as None; normalise that to a message.
    for key in ("Economy", "Culture", "Business Environment"):
        if country_info[key] is None:
            country_info[key] = "No relevant section found."

    return country_info

def search_full_text_for_section(full_text, keyword):
    """
    Scans the full text for the first occurrence of a keyword and returns the passage starting there.

    The match is case-insensitive. The passage runs from the keyword up to the first line
    break located at least 1000 characters after it, or to the end of the text when no
    such line break exists.

    Args:
        full_text (str): The full Wikipedia page text.
        keyword (str): The keyword to search for in the text.

    Returns:
        str: The extracted passage with surrounding whitespace stripped, or a fixed
            "not found" message when the keyword does not occur.
    """
    start_index = full_text.lower().find(keyword.lower())
    if start_index == -1:
        return "Relevant information not found in full text."

    end_index = full_text.find("\n", start_index + 1000)
    if end_index == -1:
        # No later line break: -1 used as a slice bound would silently cut off the
        # final character, so extend the passage to the end of the text instead.
        end_index = len(full_text)
    return full_text[start_index:end_index].strip()

def get_all_countries_info(json_file_path='files/all_countries_info.json'):
    """
    Fetches and saves information for all countries from Wikipedia into a single JSON file.

    Every entry of `pycountry.countries` is looked up by its `name` through
    `get_country_info`; countries for which that lookup returns nothing are skipped.

    Args:
        json_file_path (str): The file path where the JSON data will be saved. A bare
            file name (no directory part) is written to the current working directory.

    Returns:
        list: List of dictionaries containing country information. The list is returned
            even when saving fails; the failure is only reported on stdout.
    """
    # os.makedirs("") raises, so only create a directory when the path names one.
    output_dir = os.path.dirname(json_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    all_country_info = []

    for country in pycountry.countries:
        country_name = country.name
        print(f"Fetching data for {country_name}...")
        country_info = get_country_info(country_name)

        if country_info:
            all_country_info.append(country_info)

    # Save all data in one JSON file. Saving is best-effort: a failure is printed and
    # the collected records are still returned to the caller.
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            json.dump(all_country_info, f, ensure_ascii=False, indent=4)
        print(f"All country data saved to {json_file_path}")
    except Exception as e:
        print(f"Error saving data: {e}")

    return all_country_info

# Entry point of this cell: builds the country dataset and keeps the resulting list
# of records in the module-level name `all_countries_data`.
if __name__ == "__main__":
    # Fetch and store all countries' info in one document (no path is passed, so the
    # function's default output path is used)
    all_countries_data = get_all_countries_info()


Fetching data for Aruba...
Fetching data for Afghanistan...
Fetching data for Angola...
Fetching data for Anguilla...
Fetching data for Åland Islands...
Fetching data for Albania...
Fetching data for Andorra...
Fetching data for United Arab Emirates...
Fetching data for Argentina...
Fetching data for Armenia...
Fetching data for American Samoa...
Fetching data for Antarctica...
Fetching data for French Southern Territories...
Fetching data for Antigua and Barbuda...
Fetching data for Australia...
Fetching data for Austria...
Fetching data for Azerbaijan...
Fetching data for Burundi...
Fetching data for Belgium...
Fetching data for Benin...
Fetching data for Bonaire, Sint Eustatius and Saba...
Fetching data for Burkina Faso...
Fetching data for Bangladesh...
Fetching data for Bulgaria...
Fetching data for Bahrain...
Fetching data for Bahamas...
Fetching data for Bosnia and Herzegovina...
Fetching data for Saint Barthélemy...
Fetching data for Belarus...
Fetching data for Belize...
Fetch

In [31]:
import wikipediaapi
import concurrent.futures
import json
import re

# Custom User-Agent string for Wikipedia API request
USER_AGENT = "MyCompanyInfoScript/1.0 (habtamufeyera95@gmail.com)"

def search_full_text_for_section(text, keyword):
    """
    Returns an excerpt of the page text beginning at the first keyword hit.

    The lookup ignores case on both the text and the keyword.

    Args:
        text (str): The full text of the Wikipedia page.
        keyword (str): The keyword to look for.

    Returns:
        str: Up to 1000 characters of `text` starting at the first occurrence of the
            keyword, or None when the keyword is absent.
    """
    position = text.lower().find(keyword.lower())
    if position < 0:
        return None
    # Cap the excerpt at 1000 characters from the match
    return text[position:position + 1000]

def extract_industry_from_summary(summary):
    """
    Attempts to extract an industry phrase from the page summary.
    Looks for a pattern like "is a ... company" or "is an ... firm".

    Args:
        summary (str): The summary text of the Wikipedia page.

    Returns:
        str: The words between "is a"/"is an" and the first following "company" or
            "firm" (stripped), or None when the summary is empty or has no such phrase.
    """
    if not summary:
        return None

    # Pattern examples:
    #   "Acme is a technology company"        -> "technology"
    #   "Acme is an American software firm"   -> "American software"
    # "an?" accepts both articles; the word boundaries keep "is" and "company"/"firm"
    # from matching inside longer words.
    match = re.search(
        r"\bis an? ([\w\s,&-]+?) (?:company|firm)\b", summary, re.IGNORECASE
    )
    if match:
        return match.group(1).strip()
    return None

def _first_section_text(sections, keywords, limit=2000):
    """Returns the first `limit` characters of the first section whose lowercased title
    contains any of `keywords`, or None when no title matches."""
    for section in sections:
        title = section.title.lower()
        if any(keyword in title for keyword in keywords):
            return section.text[:limit]
    return None


def get_company_info(company_name):
    """
    Fetches key facts about a company from Wikipedia.
    It extracts:
      - Background (from a section like "History", else the start of the page text)
      - Industry (from a dedicated section, a full-text search, or from the summary)
      - Achievements (from sections like "Business strategy", "Technology", or "Achievements")
      - Impact (from sections like "Lawsuits and controversies", "Corporate affairs", or "Impact")

    For every field the first top-level section with a matching title is used; if that
    yields nothing, the field falls back to a keyword search over the full page text.

    Args:
        company_name (str): The name of the company to search for.

    Returns:
        dict: A dictionary with the keys "Company", "Background", "Industry",
            "Achievements" and "Impact". When the page does not exist, every field is
            "No relevant section found." and an extra "Error" key is set to "Page not found".
    """
    not_found = "No relevant section found."

    # Initialize Wikipedia API with language and user agent
    wiki_wiki = wikipediaapi.Wikipedia(language='en', user_agent=USER_AGENT)
    page = wiki_wiki.page(company_name)

    if not page.exists():
        return {
            "Company": company_name,
            "Background": not_found,
            "Industry": not_found,
            "Achievements": not_found,
            "Impact": not_found,
            "Error": "Page not found"
        }

    sections = page.sections
    # Read the page text once; every fallback below reuses it.
    full_text = page.text

    # --- Background: "History" section, else the first part of the page text ---
    background_text = _first_section_text(sections, ["history"]) or full_text[:2000]

    # --- Industry: section title, then full-text search, then the summary sentence ---
    industry_text = (
        _first_section_text(sections, ["industry", "sector", "products", "services", "automotive"])
        or search_full_text_for_section(full_text, "industry")
        or extract_industry_from_summary(page.summary)
    )

    # --- Achievements ---
    achievements_text = (
        _first_section_text(sections, ["business strategy", "technology", "achievements"])
        or search_full_text_for_section(full_text, "achievements")
    )

    # --- Impact ---
    impact_text = (
        _first_section_text(sections, ["lawsuits and controversies", "corporate affairs", "impact"])
        or search_full_text_for_section(full_text, "impact")
    )

    return {
        "Company": company_name,
        "Background": background_text,
        "Industry": industry_text or not_found,
        "Achievements": achievements_text or not_found,
        "Impact": impact_text or not_found
    }

def fetch_company_data_concurrently(company_names, max_workers=10):
    """
    Fetches company data concurrently for multiple companies using threading.

    Args:
        company_names (list): List of company names.
        max_workers (int): Maximum number of worker threads. Defaults to 10.

    Returns:
        list: List of dictionaries with company information, in the same order as
            `company_names` regardless of which lookup finishes first.

    Raises:
        Exception: Whatever `get_company_info` raises for a company is re-raised here
            when that company's result is collected.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Executor.map runs the calls concurrently but yields results in input order,
        # which keeps the saved file stable from run to run.
        return list(executor.map(get_company_info, company_names))

# Entry point of this cell: looks up every company below and writes the collected
# records to company_info.json in the current working directory.
if __name__ == "__main__":
    # NOTE(review): each name is used as-is as the Wikipedia page title. Several bare
    # names (e.g. "Apple", "Amazon", "Square") may resolve to a non-company or
    # disambiguation article — verify the fetched records.
    company_names = [
        "Apple", "Tesla", "Microsoft", "Google", "Amazon", "Facebook", "Netflix",
        "IBM", "Oracle", "Adobe", "Twitter", "Zoom", "Uber", "Airbnb", "Spotify",
        "Intel", "Nvidia", "Salesforce", "Snapchat", "LinkedIn", "Pinterest",
        "Lyft", "Slack", "Square", "Atlassian", "Shopify", "PayPal", "Stripe",
        "Dropbox", "Reddit", "Instacart", "Postmates", "Figma", "Robinhood",
        "Plaid", "ByteDance", "TikTok", "Alibaba", "Tencent", "Huawei", "Xiaomi",
        "Baidu", "JD.com", "Meituan", "Didi", "Zebra Technologies", "Palantir",
        "Databricks", "Snowflake", "Elastic", "MongoDB", "HashiCorp", "Twilio",
        "T-Mobile", "ZoomInfo", "Veeva", "Coupa", "Okta", "ServiceTitan",
        "Qualys", "Datadog", "Coupang", "Cognizant", "Accenture", "DXC Technology",
        "Intuit", "Cigna", "Aetna", "UnitedHealth", "CVS Health", "Merck", "Pfizer",
        "Eli Lilly", "Bristol-Myers Squibb", "AbbVie", "Johnson & Johnson",
        "Procter & Gamble", "Unilever", "Nestlé", "Coca-Cola", "PepsiCo",
        "General Electric", "Lockheed Martin", "Raytheon", "Ethio Telecom",
        "Ethiopian Airlines", "DHL Ethiopia", "Bunna International Bank",
        "Commercial Bank of Ethiopia", "Zemen Bank", "Awash International Bank",
        "Dashen Bank", "Addis Ababa Light Rail", "Mulugeta Teshome Group"
    ]

    company_data = fetch_company_data_concurrently(company_names)

    # Save data to a JSON file. Written as UTF-8 with non-ASCII characters kept
    # readable, the same way the country data is saved; the loader in this notebook
    # reads the file back as UTF-8.
    with open("company_info.json", "w", encoding="utf-8") as json_file:
        json.dump(company_data, json_file, ensure_ascii=False, indent=4)


In [32]:
import json

def load_json_file(path):
    """
    Reads a UTF-8 encoded JSON document from disk and parses it.

    Args:
        path (str): Location of the JSON file.

    Returns:
        list/dict: The decoded JSON content.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        payload = handle.read()
    return json.loads(payload)

def convert_record_to_text(record):
    """
    Renders one extracted record as a block of "Label: value" lines.

    The record type is decided by which identifying key is present; "Company" is
    checked before "Country". Missing fields are rendered as "N/A".

    Company records produce, in order:
      Entity Type, Name, Overview (from "Background"), Industry, Achievements, Impact

    Country records produce, in order:
      Entity Type, Name, Overview (from "Summary"), Economy, Culture, Business Environment

    Args:
        record (dict): An extracted country or company record.

    Returns:
        str: The newline-joined text block, or a fixed message when the record has
            neither a "Company" nor a "Country" key.
    """
    # (source key in the record, label in the output) for each entity type
    layouts = {
        "Company": (
            ("Background", "Overview"),
            ("Industry", "Industry"),
            ("Achievements", "Achievements"),
            ("Impact", "Impact"),
        ),
        "Country": (
            ("Summary", "Overview"),
            ("Economy", "Economy"),
            ("Culture", "Culture"),
            ("Business Environment", "Business Environment"),
        ),
    }

    if "Company" in record:
        kind = "Company"
    elif "Country" in record:
        kind = "Country"
    else:
        return "Unknown entity type or invalid record structure."

    # The identifying key doubles as the entity type label and holds the name
    lines = [f"Entity Type: {kind}", f"Name: {record.get(kind, 'N/A')}"]
    for source_key, label in layouts[kind]:
        lines.append(f"{label}: {record.get(source_key, 'N/A')}")
    return "\n".join(lines)

def convert_records(records):
    """
    Applies `convert_record_to_text` to every record, preserving order.

    Args:
        records (list): Country or company dictionaries.

    Returns:
        list: One unified text string per input record.
    """
    return list(map(convert_record_to_text, records))

def main(countries_path="/content/files/all_countries_info.json",
         companies_path="/content/company_info.json",
         output_path="unified_corpus.txt",
         preview_count=3):
    """
    Builds the unified text corpus from the extracted country and company JSON files.

    Each record is converted to its text form; documents are written to `output_path`,
    each followed by a line of 80 "=" characters, and the first few are printed.

    Args:
        countries_path (str): JSON file holding the country records.
        companies_path (str): JSON file holding the company records.
        output_path (str): Text file the corpus is written to.
        preview_count (int): How many documents to print as a preview.

    Returns:
        list: The unified documents, countries first and companies after them.
    """
    # Load the extracted country and company JSON files.
    countries = load_json_file(countries_path)
    companies = load_json_file(companies_path)

    # Convert each record into a unified text representation and combine both corpora.
    unified_corpus = convert_records(countries) + convert_records(companies)

    # This separator line is what a reader of the file has to split documents on.
    separator = "=" * 80
    with open(output_path, "w", encoding="utf-8") as f:
        for doc in unified_corpus:
            f.write(doc + "\n" + separator + "\n")

    for doc in unified_corpus[:preview_count]:
        print(separator)
        print(doc)
        print(separator)

    return unified_corpus

# Entry point of this cell: runs the corpus-building routine defined above.
if __name__ == "__main__":
    main()


Entity Type: Country
Name: Aruba
Overview: Aruba ( ə-ROO-bə, Dutch pronunciation: [aːˈrubaː] or [aːˈrybaː] , Papiamento pronunciation: [aˈruba]), officially the Country of Aruba (Dutch: Land Aruba; Papiamento: Pais Aruba), is a constituent island country within the Kingdom of the Netherlands, in the southern Caribbean Sea 29 kilometres (18 mi) north of the Venezuelan peninsula of Paraguaná and 80 kilometres (50 mi) northwest of Curaçao. In 1986, Aruba became a constituent country within the Kingdom of the Netherlands and acquired the formal name the Country of Aruba.
Aruba has an area of 179 km2 (69.1 sq mi). Aruba measures 32 kilometres (20 mi) in length from its northwestern to its southeastern end and is 10 kilometres (6 mi) across at its widest point. Aruba is geologically located in South-America, lying on the South-American continental shelf. Alongside Bonaire and Curaçao, Aruba forms a group referred to as the ABC islands. The Dutch Caribbean encompasses the ABC islands along wi

In [34]:
import json
import pandas as pd

# --- Functions for Loading and Converting Records ---

def load_json_file(path):
    """
    Reads a UTF-8 encoded JSON document from disk and parses it.

    Args:
        path (str): Location of the JSON file.

    Returns:
        list: The decoded JSON content (a list of records).
    """
    with open(path, mode="r", encoding="utf-8") as source:
        raw_json = source.read()
    return json.loads(raw_json)

def convert_record_to_text(record):
    """
    Renders one extracted record as a block of "Label: value" lines.

    The record type is decided by which identifying key is present; "Company" is
    checked before "Country". Missing fields are rendered as "N/A".

    Company records produce, in order:
      Entity Type, Name, Overview (from "Background"), Industry, Achievements, Impact

    Country records produce, in order:
      Entity Type, Name, Overview (from "Summary"), Economy, Culture, Business Environment

    Args:
        record (dict): An extracted country or company record.

    Returns:
        str: The newline-joined text block, or a fixed message when the record has
            neither a "Company" nor a "Country" key.
    """
    # (source key in the record, label in the output) for each entity type
    layouts = {
        "Company": (
            ("Background", "Overview"),
            ("Industry", "Industry"),
            ("Achievements", "Achievements"),
            ("Impact", "Impact"),
        ),
        "Country": (
            ("Summary", "Overview"),
            ("Economy", "Economy"),
            ("Culture", "Culture"),
            ("Business Environment", "Business Environment"),
        ),
    }

    if "Company" in record:
        kind = "Company"
    elif "Country" in record:
        kind = "Country"
    else:
        # Fallback for unknown record structure.
        return "Unknown entity type or invalid record structure."

    # The identifying key doubles as the entity type label and holds the name
    lines = [f"Entity Type: {kind}", f"Name: {record.get(kind, 'N/A')}"]
    for source_key, label in layouts[kind]:
        lines.append(f"{label}: {record.get(source_key, 'N/A')}")
    return "\n".join(lines)

def convert_records(records):
    """
    Applies `convert_record_to_text` to every record, preserving order.

    Args:
        records (list): Country or company dictionaries.

    Returns:
        list: One unified text string per input record.
    """
    unified = []
    for record in records:
        unified.append(convert_record_to_text(record))
    return unified

# --- Main Routine to Load, Convert, and Display as a DataFrame ---

def main(countries_path="/content/files/all_countries_info.json",
         companies_path="/content/company_info.json",
         preview_rows=20):
    """
    Loads the country and company records, converts them to unified documents and
    shows them as a one-column DataFrame.

    Args:
        countries_path (str): JSON file holding the country records.
        companies_path (str): JSON file holding the company records.
        preview_rows (int): Number of leading rows to print.

    Returns:
        pandas.DataFrame: One row per record with a single "Unified Document" column,
            countries first and companies after them.
    """
    # Load your extracted JSON files for countries and companies.
    countries = load_json_file(countries_path)
    companies = load_json_file(companies_path)

    all_records = countries + companies

    # Convert each record into its unified text representation.
    unified_documents = convert_records(all_records)

    # Create a pandas DataFrame where each row represents one unified document.
    df = pd.DataFrame(unified_documents, columns=["Unified Document"])

    print(df.head(preview_rows))

    return df

# Entry point of this cell: runs the load/convert/preview routine defined above.
if __name__ == "__main__":
    main()


                                     Unified Document
0   Entity Type: Country\nName: Aruba\nOverview: A...
1   Entity Type: Country\nName: Afghanistan\nOverv...
2   Entity Type: Country\nName: Angola\nOverview: ...
3   Entity Type: Country\nName: Anguilla\nOverview...
4   Entity Type: Country\nName: Åland Islands\nOve...
5   Entity Type: Country\nName: Albania\nOverview:...
6   Entity Type: Country\nName: Andorra\nOverview:...
7   Entity Type: Country\nName: United Arab Emirat...
8   Entity Type: Country\nName: Argentina\nOvervie...
9   Entity Type: Country\nName: Armenia\nOverview:...
10  Entity Type: Country\nName: American Samoa\nOv...
11  Entity Type: Country\nName: Antarctica\nOvervi...
12  Entity Type: Country\nName: French Southern Te...
13  Entity Type: Country\nName: Antigua and Barbud...
14  Entity Type: Country\nName: Australia\nOvervie...
15  Entity Type: Country\nName: Austria\nOverview:...
16  Entity Type: Country\nName: Azerbaijan\nOvervi...
17  Entity Type: Country\nNa

In [35]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_unified_corpus(file_path):
    """
    Reads the whole unified corpus file into memory as UTF-8 text.

    Args:
        file_path (str): Location of the unified corpus text file.

    Returns:
        str: Everything the file contains.
    """
    corpus_file = open(file_path, "r", encoding="utf-8")
    try:
        return corpus_file.read()
    finally:
        corpus_file.close()

def split_into_documents(unified_text, delimiter="=" * 80):
    """
    Cuts the corpus text at every delimiter and returns the non-blank pieces.

    Args:
        unified_text (str): The full corpus text holding all documents.
        delimiter (str): The marker that separates one document from the next.

    Returns:
        list: The documents, each stripped of surrounding whitespace; pieces that are
            empty after stripping are dropped.
    """
    documents = []
    for piece in unified_text.split(delimiter):
        trimmed = piece.strip()
        if trimmed:
            documents.append(trimmed)
    return documents

def split_document_into_chunks(document, chunk_size=500, chunk_overlap=50):
    """
    Breaks one document into chunks with LangChain's RecursiveCharacterTextSplitter.

    Args:
        document (str): The document text.
        chunk_size (int): Passed to the splitter as its `chunk_size`.
        chunk_overlap (int): Passed to the splitter as its `chunk_overlap`.

    Returns:
        list: Whatever the splitter's `split_text` returns for the document
            (a list of chunk strings).
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Splitting hierarchy handed to the splitter, listed from paragraph breaks
        # down to the empty string
        "separators": ["\n\n", "\n", " ", ""],
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(document)

def process_unified_corpus(file_path, chunk_size=500, chunk_overlap=50, delimiter="=" * 80):
    """
    Turns the unified corpus file into a flat list of chunk records.

    The file is loaded, cut into documents at `delimiter`, and every document is split
    into chunks. Documents and chunks are both numbered from 0 in the order they appear.

    Args:
        file_path (str): Location of the unified corpus text file.
        chunk_size (int): Chunk size forwarded to the chunk splitter.
        chunk_overlap (int): Chunk overlap forwarded to the chunk splitter.
        delimiter (str): The document delimiter.

    Returns:
        list: Dictionaries with the keys 'doc_id', 'chunk_id', and 'chunk_text'.
    """
    documents = split_into_documents(load_unified_corpus(file_path), delimiter=delimiter)

    return [
        {"doc_id": doc_id, "chunk_id": chunk_id, "chunk_text": chunk}
        for doc_id, doc in enumerate(documents)
        for chunk_id, chunk in enumerate(
            split_document_into_chunks(doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        )
    ]

def main(file_path="/content/unified_corpus.txt",
         output_csv="/content/unified_corpus_chunks.csv",
         chunk_size=500,
         chunk_overlap=50):
    """
    Chunks the unified corpus file and saves the chunks as a CSV table.

    Args:
        file_path (str): Path to the unified corpus text file.
        output_csv (str): Path of the CSV file to write (UTF-8, without the index column).
        chunk_size (int): Chunk size forwarded to `process_unified_corpus`.
        chunk_overlap (int): Chunk overlap forwarded to `process_unified_corpus`.

    Returns:
        pandas.DataFrame: The chunk table that was written to `output_csv`.
    """
    # Process the unified corpus into chunks
    chunks = process_unified_corpus(
        file_path, chunk_size=chunk_size, chunk_overlap=chunk_overlap, delimiter="=" * 80
    )

    # Convert the list of chunk dictionaries into a DataFrame
    df = pd.DataFrame(chunks)

    # Show a preview of the DataFrame
    print("First few rows of the DataFrame:")
    print(df.head())

    # Save the DataFrame to CSV
    df.to_csv(output_csv, index=False, encoding="utf-8")
    print(f"DataFrame saved to {output_csv}")

    return df

# Entry point of this cell: runs the chunk-and-export routine defined above.
if __name__ == "__main__":
    main()


  class PointerInstance(PointerType):


First few rows of the DataFrame:
   doc_id  chunk_id                                         chunk_text
0       0         0                  Entity Type: Country\nName: Aruba
1       0         1  Overview: Aruba ( ə-ROO-bə, Dutch pronunciatio...
2       0         2  country within the Kingdom of the Netherlands ...
3       0         3  Aruba has an area of 179 km2 (69.1 sq mi). Aru...
4       0         4  islands, the SSS islands. In contrast to much ...
DataFrame saved to /content/unified_corpus_chunks.csv


In [43]:
import os
import logging
from pinecone import Pinecone, ServerlessSpec

# Ensure that the Pinecone API key is set in the environment.
# The key is read from the environment rather than written into the notebook; an
# unset or empty value stops the cell here with a ValueError.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise ValueError("PINECONE_API_KEY is not set in the environment variables.")

# Initialize Pinecone client.
# `pc` is module-level state: the index-management function below uses it.
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index this notebook creates (if missing) and connects to.
index_name = "unified-corpus-index"

def manage_pinecone_index() -> None:
    """
    Makes sure the index named by the module-level `index_name` exists.

    Uses the module-level client `pc`. If the index is already listed, nothing is
    changed. Otherwise, when five or more indexes exist, the first one in the listing
    is DELETED before the new index is created (dimension 1536, cosine metric,
    serverless on AWS us-east-1). A failure while creating the index is logged, not raised.
    """
    # List existing Pinecone indexes
    known_names = pc.list_indexes().names()
    logging.info(f"Existing Pinecone indexes: {known_names}")

    # Nothing to do when the index is already there
    already_present = index_name in known_names
    if already_present:
        logging.info(f"Index '{index_name}' already exists.")
        return

    # Destructive step: drop the first listed index once five or more exist
    if not len(known_names) < 5:
        index_to_delete = known_names[0]
        logging.info(f"Deleting index '{index_to_delete}' to free up space...")
        pc.delete_index(index_to_delete)
        logging.info(f"Index '{index_to_delete}' deleted successfully.")

    # Create the index with the spec argument
    logging.info(f"Creating index '{index_name}'...")
    try:
        serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
        pc.create_index(name=index_name, dimension=1536, metric="cosine", spec=serverless_spec)
    except Exception as e:
        logging.error(f"Error creating index: {e}")
    else:
        logging.info(f"Index '{index_name}' created successfully.")

# Call the function to manage the Pinecone index
# (creates the index when it is not listed yet; creation errors are only logged)
manage_pinecone_index()

# Connect to the index
# `index` is module-level state reused by the stats, upload and query cells below.
index = pc.Index(index_name)


In [45]:
# Show the index statistics (dimension, fullness, namespaces, total vector count).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [90]:
import pandas as pd

# Load the chunked data from the CSV file
# (same path the chunking step above uses as its CSV output path)
df_chunks = pd.read_csv('/content/unified_corpus_chunks.csv')

# Check the first few rows
df_chunks.head()

Unnamed: 0,doc_id,chunk_id,chunk_text
0,0,0,Entity Type: Country\nName: Aruba
1,0,1,"Overview: Aruba ( ə-ROO-bə, Dutch pronunciatio..."
2,0,2,country within the Kingdom of the Netherlands ...
3,0,3,Aruba has an area of 179 km2 (69.1 sq mi). Aru...
4,0,4,"islands, the SSS islands. In contrast to much ..."


In [52]:
def chunk_data(df_chunks, chunk_size=100):
    """Splits the data into smaller chunks for uploading to Pinecone.

    Args:
        df_chunks: The sequence to split. Anything that supports `len()` and slicing
            works (the notebook passes the list of prepared vector records).
        chunk_size (int): Maximum number of items per yielded batch.

    Yields:
        Consecutive slices of `df_chunks`, each holding at most `chunk_size` items.

    Raises:
        ValueError: If `chunk_size` is not positive (raised when iteration starts).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # Slice the argument itself; the body must not depend on any global variable.
    for start in range(0, len(df_chunks), chunk_size):
        yield df_chunks[start:start + chunk_size]

# Prepare the data for Pinecone
# One record per chunk row: the id is "<doc_id>_<chunk_id>", the vector is the paired
# embedding, and the chunk text travels along as metadata.
# NOTE(review): `chunk_embeddings` is not defined anywhere in the visible part of this
# notebook — presumably a list of embedding vectors, one per row of `df_chunks`, built
# in a cell that is not shown; confirm. zip() stops at the shorter input, so a length
# mismatch would silently drop rows.
pinecone_data = [
    {
        "id": f"{row.doc_id}_{row.chunk_id}",
        "values": embedding,
        "metadata": {
            "doc_id": row.doc_id,
            "chunk_id": row.chunk_id,
            "chunk_text": row.chunk_text
        }
    }
    for row, embedding in zip(df_chunks.itertuples(), chunk_embeddings)
]

# Split the data into smaller chunks and upload
# NOTE(review): `upload_to_pinecone` is also not defined in the visible part of this
# notebook; it is called here with one batch and the index handle — confirm where it
# comes from.
for batch in chunk_data(pinecone_data, chunk_size=100):
    upload_to_pinecone(batch, index)


Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
Data uploaded to Pinecone!
D

In [67]:
# Show the index statistics again to check the vector count after the upload.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5322}},
 'total_vector_count': 5322}

In [86]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Instantiate the OpenAI embedding model
# NOTE(review): the Pinecone index in this notebook is created with dimension=1536;
# presumably that matches the vector size of this model — confirm before changing either.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

def get_embedding(text: str):
    """
    Get the embedding for a given text using the OpenAI Embeddings model.

    Args:
        text (str): The text to embed.

    Returns:
        The value returned by the embeddings object's `embed_query` for `text`
        (the embedding vector).
    """
    # Use the `embeddings` instance created above in this cell; embed_query is the
    # call meant for a single piece of query text.
    return embeddings.embed_query(text)


# Embed a sample question; `query_embedding` is reused by the Pinecone query cell below.
query_embedding = get_embedding("What is Tesla known for?")

In [87]:
# Print the raw embedding for inspection (the output is one long list of floats).
print(query_embedding)

[0.006992719356403181, -0.018755868667964692, 0.005666514105738843, -0.007442391246250343, 0.0001883814461718042, -0.012173718224898093, -0.0324545587389517, 0.004447838897264812, -0.0065495648621384335, -0.02342202588174002, 0.009703783624005771, 0.02234020713293195, -0.01352925082436069, 0.005754493358165812, 0.012929688615005343, 0.00589460851562298, 0.02802301472762683, 0.020007128526044287, 0.014050608943340087, -0.019199023627890742, -0.03667756285544618, -0.009827605758168207, -0.010928975762400915, 0.012095515065844747, -0.012310575150225357, -0.0042164862552507695, 0.023369890442371123, -0.014598034782003933, -0.00498548957387764, -0.005073468826304608, 0.03383616092074395, 0.006412708402472472, -0.016540095730979653, -0.007312051716505493, -0.02888325506514927, 0.0005661626524713335, 0.013372842643608789, -0.007787790860375802, 0.03800702587257913, -0.017348200629133198, -0.0012822158572815038, 0.028726846884397368, -0.010251209462669617, -0.008432972510501539, -0.03190713476

In [75]:
def query_pinecone(query_embedding, top_k=5):
    """
    Looks up the stored vectors closest to a query embedding.

    Uses the module-level `index` handle and asks for metadata to be included.

    Args:
        query_embedding: The embedding vector to search with.
        top_k (int): How many matches to request.

    Returns:
        The 'matches' entry of the index's query response.
    """
    search_args = {
        "vector": query_embedding,
        "top_k": top_k,
        "include_metadata": True,
    }
    # Only the list of matches is handed back to the caller
    return index.query(**search_args)["matches"]


# Run the similarity search with the default number of matches.
top_matches = query_pinecone(query_embedding)

# Each match id has the form "<doc_id>_<chunk_id>", as built in the upload cell.
for match in top_matches:
    print(f"ID: {match['id']}, Score: {match['score']}")

ID: 249_0, Score: 0.843167961
ID: 249_3, Score: 0.841895878
ID: 249_1, Score: 0.836924
ID: 249_4, Score: 0.83199805
ID: 249_2, Score: 0.825293899


In [88]:
from langchain_openai import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA

# Connect to Pinecone index
# NOTE(review): `Pinecone` is called here with (index, embeddings, text_key=...). The
# only import of that name visible in this notebook is the client class from the
# `pinecone` package, which is constructed with an API key — presumably a LangChain
# vector-store class of the same name was imported in a cell that is not shown.
# Confirm and import it explicitly, otherwise this cell depends on hidden kernel state.
# text_key="chunk_text" is the metadata field the upload cell stores the chunk text under.
vectorstore = Pinecone(index, embeddings, text_key="chunk_text")
# Set up retriever
retriever = vectorstore.as_retriever()

# Initialize OpenAI model
llm = ChatOpenAI(model="gpt-4", temperature=0.7)

# Create RAG pipeline
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Ask one sample question; the printed response below is a dict with the keys
# 'query' and 'result'.
query = "Tell me about the Ethiopia Airlines"
response = qa_chain.invoke(query)

print(response)


{'query': 'Tell me about the Ethiopia Airlines', 'result': "Ethiopian Airlines, formerly known as Ethiopian Air Lines (EAL), is the flag carrier of Ethiopia and is owned by the country's government. It was founded on 21 December 1945 and started operations on 8 April 1946, expanding to international flights in 1951. The company became a share company in 1965 and changed its name to Ethiopian Airlines. \n\nThe airline is a member of the International Air Transport Association and the African Airlines Association. It also joined the Star Alliance in December 2011. Its slogan is 'The New Spirit of Africa.' The airline has its hub and headquarters at Bole International Airport in Addis Ababa, from where it serves a network of 155 passenger destinations—22 of them domestic—and 68 freighter destinations. The airline also has secondary hubs in Togo and Malawi.\n\nEthiopian Airlines offers two classes on most of its flights - Cloud Nine and Economy Class. It is the largest airline in Africa in