In [1]:
from langchain.llms.base import LLM
import requests
from typing import Optional, List

class LocalAPILLM(LLM):
    def __init__(self, api_base: str, model_name: str):
        self.api_base = api_base
        self.model_name = model_name

    @property
    def _llm_type(self) -> str:
        return "local_api"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        # Construct the API request payload
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "max_tokens": 1024,         # Adjust as needed
            "temperature": 0.1,         # Adjust as needed
            "stop": stop
        }

        # Send the request to the local model API
        response = requests.post(f"{self.api_base}/generate", json=payload)

        # Check if the request was successful
        if response.status_code != 200:
            raise Exception(f"Request failed with status {response.status_code}: {response.text}")

        # Extract the generated text from the response
        return response.json().get("generated_text", "")

# Instantiate the custom LLM with your local API details
local_llm = LocalAPILLM(api_base="http://localhost:11434/v1", model_name="llama3")

# Use the LLM with a prompt
response = local_llm("What is the capital of France?")
print(response)


ValueError: "LocalAPILLM" object has no field "api_base"

In [4]:
# Import necessary modules from LangChain and Ollama
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Initialize OllamaFunctions with your local Ollama API and model name
ollama_llm = OllamaFunctions(api_base="http://localhost:11434/v1", model="llama3.2")

# Define a prompt template for summarization
summarization_prompt = PromptTemplate(
    input_variables=["text"],  # Variable to be used in the prompt
    template="Summarize the following text: {text}"  # Template for the summarization task
)

# Set up the Summarization Chain
summarization_chain = LLMChain(
    llm=ollama_llm,  # The LLM used in the chain
    prompt=summarization_prompt  # The prompt template to use
)

# Example document to summarize
docs = """
Artificial intelligence (AI) is intelligence demonstrated by machines, in contrast to the natural intelligence displayed by humans and animals.
Leading AI textbooks define the field as the study of "intelligent agents": any device that perceives its environment and takes actions that maximize its chance of successfully achieving its goals.
"""

# Run the summarization chain with the provided document
summary = summarization_chain.run(text=docs)
print("Summary:", summary)

# --- Direct invocation of the Ollama model ---
# Use the invoke method to directly query the model with a specific prompt

prompt = "What is the capital of France?"
response = ollama_llm.invoke(prompt)
print("Response to direct invocation:", response)


Summary: Artificial intelligence is intelligence shown by machines, focusing on intelligent agents that perceive their environment and achieve their goals.


ValueError: 'llama3.2' did not respond with valid JSON. 
                Please try again. 
                Response: {
  "tool": "__conversational_response",
  "response": "The capital of France is Paris."

In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the embeddings with the dunzhang/stella_en_400M_v5 model
embeddings = HuggingFaceEmbeddings(model_name='dunzhang/stella_en_400M_v5', model_kwargs={'trust_remote_code': True})

# Example text to verify the embeddings
text = ["Hello, how are you?", "This is a test sentence."]

# Embed the documents and print the embeddings
embedding_vectors = embeddings.embed_documents(text)

# Print out the shape and type to verify if embeddings are generated
print(type(embedding_vectors))  # Should be a list of lists (or numpy array if you use one)
print(len(embedding_vectors))   # Number of sentences processed
print(len(embedding_vectors[0]))  # Length of embedding vector


A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/dunzhang/stella_en_400M_v5:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


AssertionError: please install xformers

In [12]:
import pandas as pd
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    return set(df.columns.str.lower())  # Standardize to lowercase for consistent matching

# Dynamically match query terms to column names
def interpret_question(query, file_path):
    # Get the column names from the CSV
    column_names = get_column_names_from_csv(file_path)
    
    # Parse the question with spaCy
    doc = nlp(query.lower())  # Convert query to lowercase for case-insensitive matching
    
    # Collect relevant keywords
    keywords = set()
    
    # Function to match terms dynamically with column names
    def match_term_to_column(term):
        # First, check for an exact match or a close match
        for col in column_names:
            if term == col:
                return col
        
        # Check for partial match within column names
        for col in column_names:
            if term in col or col in term:  # Allows "gdp" to match "gdp_per_capita"
                return col
        
        return None
    
    # Match entities and noun chunks dynamically to column names
    for ent in doc.ents:
        matched_col = match_term_to_column(ent.text)
        if matched_col:
            keywords.add(matched_col)
    
    for chunk in doc.noun_chunks:
        matched_col = match_term_to_column(chunk.text)
        if matched_col:
            keywords.add(matched_col)
    
    # Add individual nouns and important terms if not already matched
    if not keywords:
        for token in doc:
            if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
                matched_col = match_term_to_column(token.text)
                if matched_col:
                    keywords.add(matched_col)
    
    # Ensure time-based terms like "year" are included if present in column names
    time_terms = ["year", "date", "time"]
    for term in time_terms:
        matched_col = match_term_to_column(term)
        if matched_col:
            keywords.add(matched_col)
    
    # Join keywords into a refined query
    refined_query = " ".join(keywords)
    return refined_query

# Example usage
file_path = "data/world_bank_dataset.csv"
query = "Which country has the highest population and low gdp in the year 2010?"
print(interpret_question(query, file_path))


country population year


In [16]:
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model for named entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Load a pre-trained Word2Vec model (e.g., Google News embeddings or other pre-trained model)
# You can download the Google Word2Vec model from https://code.google.com/archive/p/word2vec/
# For the sake of this example, we will assume you have a word2vec model file loaded.

# Assuming 'word_vectors' is the pre-trained model (for example, Google's word2vec)
# word_vectors = KeyedVectors.load_word2vec_format('path_to_pretrained_model', binary=True)

# Load CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    return list(df.columns)

# Extract terms dynamically from query using NER and spaCy
def interpret_question(query, file_path):
    column_names = get_column_names_from_csv(file_path)
    doc = nlp(query.lower())  # Process the query text
    
    # Extracting entities and important words from the query
    keywords = set()
    for ent in doc.ents:
        keywords.add(ent.text)

    # Also extract noun chunks, as they could contain important terms like "gdp" or "population"
    for chunk in doc.noun_chunks:
        keywords.add(chunk.text)

    # If no matches found, consider individual noun tokens
    if not keywords:
        for token in doc:
            if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
                keywords.add(token.text)
    
    # We need to convert both the query and column names to vectors for comparison
    query_vector = None
    for word in keywords:
        try:
            word_vector = word_vectors[word]
            if query_vector is None:
                query_vector = word_vector
            else:
                query_vector += word_vector  # Summing up word vectors
        except KeyError:
            continue  # Skip words that don't have embeddings
    
    # If no embeddings were found for the query, return an empty result
    if query_vector is None:
        return "No match found in query."

    # Compare the vectorized query with column names to find the most relevant column
    max_similarity = -1
    best_match_column = None
    for column in column_names:
        column_vector = None
        for word in column.split():
            try:
                word_vector = word_vectors[word]
                if column_vector is None:
                    column_vector = word_vector
                else:
                    column_vector += word_vector
            except KeyError:
                continue
        
        if column_vector is not None:
            similarity = cosine_similarity([query_vector], [column_vector])[0][0]
            if similarity > max_similarity:
                max_similarity = similarity
                best_match_column = column
    
    return best_match_column

# Example usage
file_path = "data/world_bank_dataset.csv"
query = "Which country has the highest population and low gdp in the year 2010?"
result = interpret_question(query, file_path)
print(result)


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [22]:
import pandas as pd
import spacy
from fuzzywuzzy import process
import re

# Load spaCy model for named entity recognition (NER)
nlp = spacy.load("en_core_web_sm")

# Load CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    # Normalize column names by removing special characters and converting to lowercase
    column_names = [re.sub(r'[^\w\s]', '', col.lower()) for col in df.columns]
    return column_names

# Extract relevant query terms using spaCy
def extract_query_terms(query):
    doc = nlp(query.lower())  # Lowercase for case-insensitive matching
    
    # Extract entities and noun chunks
    keywords = []
    for ent in doc.ents:
        keywords.append(ent.text)
    
    # Extract noun chunks (like 'highest population', 'low gdp')
    for chunk in doc.noun_chunks:
        keywords.append(chunk.text)
    
    # Remove duplicates and return the list of query terms
    return list(set(keywords))

# Fuzzy match the query terms with column names
def match_terms_to_columns(query_terms, column_names, threshold=60):
    matched_columns = []
    for term in query_terms:
        # Perform fuzzy matching for each term
        matches = process.extractOne(term, column_names, score_cutoff=threshold)
        if matches:
            matched_columns.append(matches[0])
    return matched_columns

# Main function to interpret the question and match with columns
def interpret_question(query, file_path):
    # Get the column names from the CSV file
    column_names = get_column_names_from_csv(file_path)
    
    # Extract relevant terms from the query
    query_terms = extract_query_terms(query)
    
    # Perform fuzzy matching for the extracted query terms
    matched_columns = match_terms_to_columns(query_terms, column_names)
    
    return matched_columns

# Example usage
file_path = "data/world_bank_dataset.csv"  # Path to your dataset
query = "Which country has the highest population and low gdp in the year 2010?"

# Get matched columns
matched_columns = interpret_question(query, file_path)
print("Matched Columns:", matched_columns)


Matched Columns: ['population', 'year', 'year', 'country']


In [40]:
import pandas as pd
import spacy
from difflib import get_close_matches

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    return set(df.columns.str.lower())  # Standardize to lowercase for consistent matching

# Dynamically match query terms to column names
def interpret_question(query, file_path):
    # Get the column names from the CSV
    column_names = get_column_names_from_csv(file_path)
    
    # Abbreviation mappings to expand terms dynamically
    abbreviation_mapping = {
        "gdp": "gross domestic product",
        "pop": "population",
        # Add other abbreviations as needed
    }
    
    # Parse the question with spaCy
    doc = nlp(query.lower())  # Convert query to lowercase for case-insensitive matching
    
    # Collect relevant keywords
    keywords = set()
    
    # Function to find closest matching column names for a term
    def find_closest_column_name(term):
        # Try finding the closest exact or partial match
        matches = get_close_matches(term, column_names, n=1, cutoff=0.3)
        if matches:
            return matches[0]
        
        # If no close match, attempt a substring search in column names
        for col in column_names:
            if term in col:
                return col
        
        # If still no match, check the abbreviation mapping
        if term in abbreviation_mapping:
            expanded_term = abbreviation_mapping[term]
            return find_closest_column_name(expanded_term)
        
        return None
    
    # Match entities and noun phrases to column names dynamically
    for ent in doc.ents:
        closest_match = find_closest_column_name(ent.text)
        if closest_match:
            keywords.add(closest_match)
    
    for chunk in doc.noun_chunks:
        closest_match = find_closest_column_name(chunk.text)
        if closest_match:
            keywords.add(closest_match)
    
    # Extract additional nouns if no phrases or entities are matched
    if not keywords:
        for token in doc:
            if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
                closest_match = find_closest_column_name(token.text)
                if closest_match:
                    keywords.add(closest_match)
    
    # Ensure essential keywords (e.g., time indicators) are added if they exist in column names
    essential_terms = ["year", "date", "time"]
    for term in essential_terms:
        match = find_closest_column_name(term)
        if match:
            keywords.add(match)
    
    # Join keywords into a refined query
    refined_query = " ".join(keywords)
    return refined_query

# Example usage
file_path = "data/world_bank_dataset.csv"
query = "Which country has the highest population and low gdp in the year 2010?"
print(interpret_question(query, file_path))


gdp (usd) country population year


working solutions for refined query

In [41]:
import pandas as pd
import spacy
from difflib import get_close_matches

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Load the CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    return set(df.columns.str.lower())  # Standardize to lowercase for consistent matching

# Function to extract entities based on syntactic dependencies
def extract_entities_with_dependencies(doc):
    entities = []
    for token in doc:
        # We collect tokens with certain syntactic dependencies
        if token.dep_ in {"nsubj", "dobj", "prep", "amod"}:  # Subject, object, prepositions, adjectives
            entities.append(token.text)
    return entities

# Dynamically match query terms to column names
def interpret_question(query, file_path):
    # Get the column names from the CSV
    column_names = get_column_names_from_csv(file_path)
    
    # Parse the question with spaCy
    doc = nlp(query.lower())  # Convert query to lowercase for case-insensitive matching
    
    # Collect relevant keywords based on entity extraction and dependencies
    keywords = set()
    
    # Extract entities using dependencies
    dependency_entities = extract_entities_with_dependencies(doc)
    for dep_entity in dependency_entities:
        closest_match = find_closest_column_name(dep_entity, column_names)
        if closest_match:
            keywords.add(closest_match)
    
    # Match named entities to column names dynamically
    for ent in doc.ents:
        closest_match = find_closest_column_name(ent.text, column_names)
        if closest_match:
            keywords.add(closest_match)
    
    # Match noun chunks to column names dynamically
    for chunk in doc.noun_chunks:
        closest_match = find_closest_column_name(chunk.text, column_names)
        if closest_match:
            keywords.add(closest_match)
    
    # Extract additional nouns if no phrases or entities are matched
    if not keywords:
        for token in doc:
            if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
                closest_match = find_closest_column_name(token.text, column_names)
                if closest_match:
                    keywords.add(closest_match)
    
    # Ensure essential keywords (e.g., time indicators) are added if they exist in column names
    essential_terms = ["year", "date", "time"]
    for term in essential_terms:
        match = find_closest_column_name(term, column_names)
        if match:
            keywords.add(match)
    
    # Join keywords into a refined query
    refined_query = " ".join(keywords)
    return refined_query

# Function to find the closest column name from the CSV columns
def find_closest_column_name(term, column_names):
    # Try finding the closest exact or partial match
    matches = get_close_matches(term, column_names, n=1, cutoff=0.3)
    if matches:
        return matches[0]
    
    # If no close match, attempt a substring search in column names
    for col in column_names:
        if term in col:
            return col
    
    return None

# Example usage
file_path = "data/world_bank_dataset.csv"
query = "Which country has the highest population and low gdp in the year 2010?"
print(interpret_question(query, file_path))


gdp (usd) country population year


In [49]:
from sentence_transformers import SentenceTransformer, util
import spacy

# Load pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load spaCy model for NLP processing
nlp = spacy.load("en_core_web_sm")

# Load CSV column names (example)
column_names = ['GDP (USD)', 'Population', 'Country', 'Year']

# Function to extract relevant terms dynamically from the query
def extract_relevant_terms(query):
    doc = nlp(query.lower())  # Process the query text with spaCy
    terms = set()

    # Extract entities and key tokens
    for ent in doc.ents:
        terms.add(ent.text)
    
    # Also, extract important nouns or keywords that are not entities
    for token in doc:
        if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
            terms.add(token.text)

    return terms

# Function to find the best matching column using semantic similarity
def find_best_column_match(query, column_names):
    # Encode the query and column names to embeddings
    query_embedding = model.encode(query, convert_to_tensor=True)
    column_embeddings = model.encode(column_names, convert_to_tensor=True)

    # Compute cosine similarities between the query and column names
    cosine_scores = util.pytorch_cos_sim(query_embedding, column_embeddings)

    # Find best match for the query
    best_match_idx = cosine_scores.argmax()
    return column_names[best_match_idx]

# Example query
query = "What country has the highest population and lowest GDP in 2010?"

# Extract relevant terms from the query
relevant_terms = extract_relevant_terms(query)

# Find best matches for each term in the column names
matches = {}
for term in relevant_terms:
    best_match = find_best_column_match(term, column_names)
    matches[term] = best_match

# Output the extracted terms and their matched columns
print(f"Extracted relevant terms: {relevant_terms}")
for term, match in matches.items():
    print(f"Term '{term}' matched to column '{match}'")


Extracted relevant terms: {'country', '2010', 'gdp', 'population'}
Term 'country' matched to column 'Country'
Term '2010' matched to column 'Year'
Term 'gdp' matched to column 'GDP (USD)'
Term 'population' matched to column 'Population'


In [51]:
import spacy
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load spaCy model and Sentence Transformer model
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load CSV and get column names
def get_column_names_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df.columns.str.lower().tolist()  # Standardize to lowercase

# Function to process the query and extract relevant terms dynamically
def process_query(query):
    doc = nlp(query.lower())  # Process query with spaCy
    terms = set()
    comparison_action = None
    metrics = set()

    # Extract named entities (countries, years, etc.)
    for ent in doc.ents:
        terms.add(ent.text)

    # Detect comparison actions (highest, lowest, etc.) using dependency parsing
    for token in doc:
        if token.dep_ in {"amod", "superlative", "comparative"}:  # Capturing adjectives like highest, lowest, more, etc.
            comparison_action = token.text

    # Extract relevant metrics (e.g., GDP, population)
    for token in doc:
        if token.pos_ in {"NOUN", "PROPN", "NUM"} and not token.is_stop:
            metrics.add(token.text)

    return terms, comparison_action, metrics

# Function to refine the query dynamically
def refine_query(query, file_path):
    column_names = get_column_names_from_csv(file_path)
    relevant_terms, comparison_action, relevant_metrics = process_query(query)
    
    # Remove any terms that are not found in the columns
    refined_terms = [term for term in relevant_terms if term in column_names]
    refined_metrics = [metric for metric in relevant_metrics if metric in column_names]

    # If a comparison action is found, include it in the refined query
    if comparison_action:
        refined_terms.append(comparison_action)

    # Include relevant metrics in the refined query
    refined_terms.extend(refined_metrics)

    # Return a refined query based on dynamic extraction
    return " ".join(refined_terms)

# Example usage:
file_path = "data/world_bank_dataset.csv"  # Update with your file path
query = "What country has the highest population and lowest GDP in 2010?"

# Refine the query based on dynamic extraction
refined_query = refine_query(query, file_path)
print("Refined Query:", refined_query)

# Perform document-specific retrieval based on the refined query
# docs = docsearch.similarity_search(refined_query, k=5)  # You would use this for actual search


Refined Query: lowest country population


In [None]:
import re
def extract_code_from_response(response):
    """
    This function uses regex to extract code block between triple backticks from the LLM response.
    """
    code_pattern = re.compile(r'```python(.*?)```', re.DOTALL)
    match = code_pattern.search(response)
    if match:
        python_code= match.group(1).strip()
        # Remove any line that loads the DataFrame (i.e., pd.read_csv)
        #python_code = re.sub(r"pd\.read_csv\([^\)]*\)\s*", "", python_code)
        #python_code = re.sub(r"^(\s*pd\.read_csv\([^\)]*\))", r"# \1", python_code, flags=re.MULTILINE)
         # Add a '#' to the beginning of lines with pd.read_csv to comment them out
        # This regex will match the line starting with any amount of spaces followed by pd.read_csv
        # Remove any lines that contain pd.read_csv
        
        python_code = re.sub(r"^\s*df\s*=\s*pd\.read_csv\([^\)]*\)\s*$", "", python_code, flags=re.MULTILINE)
        print(python_code)

        return python_code
    
    else:
        return None
    

In [6]:
def execute_python_code(python_code):
    """
    This function takes the extracted Python code and executes it in the current environment.
    It replaces any placeholders like 'your_data_file.csv' with the actual file path.
    """
    try:
        # Replace the placeholder 'your_data_file.csv' with the actual file path
        #python_code = re.sub(r"pd\.read_csv\([\'\"]([^\'\"]+)[\'\"]\)", 
        #                     lambda match: f"pd.read_csv('{file_path}')", python_code)

        # Execute the extracted Python code
        exec(python_code)  # Execute the Python code generated by the model
        print("Code executed successfully.")
    except Exception as e:
        print(f"Error executing code: {e}")

In [27]:
python_code= """import pandas as pd

# Load your data into a DataFrame
df = pd.read_csv('data/world_bank_dataset.csv')

# Filter the DataFrame to include only the year 2010
df_2010 = df[df['Year'] == 2010]

# Find the country with the least life expectancy
country_with_least_life_expectancy = df_2010[df_2010['Life Expectancy'] == df_2010['Life Expectancy'].min()]['Country'].values[0]       

# Print the result
print(f"The country with the least life expectancy in the year 2010 is: {country_with_least_life_expectancy}")"""

execute_python_code(python_code)


The country with the least life expectancy in the year 2010 is: Argentina
Code executed successfully.


In [40]:
response = """```python
import pandas as pd

# Load your data into a DataFrame
# df = pd.read_csv('your_data_file.csv')

# Filter the DataFrame to include only the year 2010
df_2010 = df[df['Year'] == 2010]

# Find the country with the least life expectancy
country_with_least_life_expectancy = df_2010[df_2010['Life Expectancy'] == df_2010['Life Expectancy'].min()]['Country'].values[0]       

# Print the result
print(f"The country with the least life expectancy in the year 2010 is: {country_with_least_life_expectancy}")
```"""
extract_code_from_response(response)

import pandas as pd

# Load your data into a DataFrame
# df = # Filter the DataFrame to include only the year 2010
df_2010 = df[df['Year'] == 2010]

# Find the country with the least life expectancy
country_with_least_life_expectancy = df_2010[df_2010['Life Expectancy'] == df_2010['Life Expectancy'].min()]['Country'].values[0]       

# Print the result
print(f"The country with the least life expectancy in the year 2010 is: {country_with_least_life_expectancy}")


'import pandas as pd\n\n# Load your data into a DataFrame\n# df = # Filter the DataFrame to include only the year 2010\ndf_2010 = df[df[\'Year\'] == 2010]\n\n# Find the country with the least life expectancy\ncountry_with_least_life_expectancy = df_2010[df_2010[\'Life Expectancy\'] == df_2010[\'Life Expectancy\'].min()][\'Country\'].values[0]       \n\n# Print the result\nprint(f"The country with the least life expectancy in the year 2010 is: {country_with_least_life_expectancy}")'