In [None]:
# Last amended: 30th April, 2024
# Ref: https://github.com/asokraju/LangChainDatasetForge/blob/main/Datagen.ipynb

In [None]:
# Objective: 
#       Use LLM to transform structured data into unstructured text
#       Example: Data: {"distance" : "30km" , "mode" : "Car" }
#       Transformed LLM output:
#                       I travelled 30km in a car.

In [None]:
# DO NOT RUN THIS FILE IN --COLAB--. WE USE OLLAMA

# Generate data
Our final file is structured_data.txt

In [2]:
# 1.0 We need to generate random data
import random

In [3]:
# 1.0.1 Display multiple command outputs from a cell:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### Define constants

Define some constants and what values they can have

In [4]:
# 1.0.2 List addition: `+` concatenates two lists into one new list;
#       duplicates are kept (the value 3 appears twice in the result)
[1,2,3] + [3,4]

[1, 2, 3, 3, 4]

In [5]:
# 2.0 Build the pool of trip durations by adding two lists together:
#      day-based durations (1-30 days) followed by week-based ones (1-8 weeks)

DURATION_POOL = (
    [f"{n_days} days" for n_days in range(1, 31)]
    + [f"{n_weeks} weeks" for n_weeks in range(1, 9)]
)

# Peek at both ends of the pool (each expression is displayed separately)
DURATION_POOL[:5]
DURATION_POOL[-5:]

['1 days', '2 days', '3 days', '4 days', '5 days']

['4 weeks', '5 weeks', '6 weeks', '7 weeks', '8 weeks']

In [6]:
# 2.0.1 Core data pools: each list holds the values one travel-plan field may take

DESTINATIONS_POOL = [
    "Paris", "New York", "Tokyo", "London", "Sydney",
    "Cairo", "Rio", "Cape Town", "Moscow", "Beijing",
]
ACTIVITIES_POOL = [
    "sightseeing", "trekking", "culinary experiences",
    "museum visits", "beach relaxation", "mountain climbing",
]
# Budgets from $1000 up to $20000, in steps of $1000
BUDGET_POOL = ["$" + str(thousands) + "000" for thousands in range(1, 21)]
ACCOMMODATION_POOL = [
    "hotel", "hostel", "B&B", "luxury resort", "rented apartment",
]
TRANSPORTATION_POOL = [
    "bus", "train", "flight", "rented car", "bicycle",
]

In [7]:
# 2.0.2 Additional pools: optional details that make the travel plans more varied

# Party sizes from "1 persons" to "10 persons"
TRAVELERS_POOL = ["{} persons".format(head_count) for head_count in range(1, 11)]
SEASON_POOL = ["spring", "summer", "fall", "winter"]
MEAL_PREF_POOL = [
    "local cuisine", "vegetarian meals", "vegan options",
    "seafood delights", "fast food",
]
TRAVEL_TYPE_POOL = [
    "solo", "couple", "family",
    "group", "business", "backpacking",
]
BOOKING_MODE_POOL = [
    "travel agency", "online platform",
    "direct booking", "last minute deals",
]
GUIDE_PREF_POOL = [
    "guided tours", "self-exploration", "audio guide",
    "local guide", "group tour", "private tour",
]
LANG_PREF_POOL = [
    "English", "French", "Spanish", "local language", "multilingual",
]
CULTURAL_INTEREST_POOL = [
    "historical sites", "modern attractions", "folk performances",
    "art galleries", "music concerts",
]

In [8]:
# 3.0 This function generates one random, structured travel plan
def generate_travel_plan(min_elements=6, max_elements=10, rng=None):
    """Build one random travel plan as a single "Key: value, ..." sentence.

    Every field is first given a random value drawn from its module-level
    ``*_POOL`` list. A random subset of the fields is then kept, so plans
    differ both in which fields they mention and in the order of the fields.

    Parameters
    ----------
    min_elements, max_elements : int, optional
        Inclusive bounds for the number of fields kept in the plan
        (defaults 6 and 10, out of the 14 fields built below).
    rng : random.Random, optional
        Source of randomness. Pass e.g. ``random.Random(42)`` to obtain
        reproducible plans. When omitted, the shared ``random`` module is
        used, exactly as before.

    Returns
    -------
    str
        Comma-separated "Key: value" pairs terminated by a full stop.

    Raises
    ------
    ValueError
        If ``min_elements > max_elements`` or more fields are requested
        than exist (raised by ``randint`` / ``sample``).
    """
    # Fall back to the module-level generator so existing calls are unaffected
    if rng is None:
        rng = random

    def pick_one(pool):
        # Exactly one value from the pool
        return rng.choice(pool)

    def pick_some(pool):
        # Between 1 and 3 distinct values from the pool, joined into one string
        return ", ".join(rng.sample(pool, rng.randint(1, 3)))

    # 3.0.1 Make random choices from the above data pools
    elements = {
        "Duration": pick_one(DURATION_POOL),
        "Destinations": pick_some(DESTINATIONS_POOL),
        "Activities": pick_some(ACTIVITIES_POOL),
        "Budget": pick_one(BUDGET_POOL),
        "Accommodation": pick_one(ACCOMMODATION_POOL),
        "Transportation": pick_some(TRANSPORTATION_POOL),

        "Travelers": pick_one(TRAVELERS_POOL),
        "Season": pick_one(SEASON_POOL),
        "Meal Preference": pick_one(MEAL_PREF_POOL),
        "Travel Type": pick_one(TRAVEL_TYPE_POOL),
        "Booking Mode": pick_one(BOOKING_MODE_POOL),

        "Guide Preference": pick_one(GUIDE_PREF_POOL),
        "Language Preference": pick_one(LANG_PREF_POOL),
        "Cultural Interest": pick_one(CULTURAL_INTEREST_POOL),
    }

    # 3.0.2 Drop some keys for variability: keep a random subset, in random order
    num_elements_to_use = rng.randint(min_elements, max_elements)
    keys_to_use = rng.sample(list(elements), num_elements_to_use)

    # 3.0.3 Construct the travel plan
    travel_plan = ", ".join(f"{key}: {elements[key]}" for key in keys_to_use) + "."

    # 3.0.4
    return travel_plan

In [9]:
# 3.1 Generate 100k structured travel plans

data_points = 100_000  # Same as 100000; the underscore only makes large numbers easier to read

# 3.1.1 The loop variable is never used, hence the throwaway name '_':
structured_data = [generate_travel_plan() for _ in range(data_points)]

# Preview the first three plans
structured_data[:3]

['Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.',
 'Travelers: 7 persons, Destinations: Rio, New York, Cultural Interest: modern attractions, Travel Type: group, Language Preference: English, Duration: 5 weeks, Season: spring.',
 'Language Preference: local language, Destinations: Sydney, Paris, Transportation: bus, rented car, Meal Preference: fast food, Accommodation: rented apartment, Booking Mode: direct booking, Travel Type: backpacking.']

In [10]:
# 3.1.2 For demonstration, print the first 3 entries, each followed by a separator line
for entry in structured_data[:3]:
    print(entry, "---------------------", sep="\n")

Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.
---------------------
Travelers: 7 persons, Destinations: Rio, New York, Cultural Interest: modern attractions, Travel Type: group, Language Preference: English, Duration: 5 weeks, Season: spring.
---------------------
Language Preference: local language, Destinations: Sydney, Paris, Transportation: bus, rented car, Meal Preference: fast food, Accommodation: rented apartment, Booking Mode: direct booking, Travel Type: backpacking.
---------------------


## Transform structured data to unstructured text

In [11]:
# 4.0 Save the structured data into a text file: every plan on its own line,
#     followed by a separator line.
#     A relative path is used so the file lands in the notebook's working
#     directory -- the same "structured_data.txt" that is read back further
#     down -- instead of an absolute path that exists on one machine only.
with open("structured_data.txt", "w", encoding="utf-8") as f:
    # writelines() returns None, so nothing is echoed into the cell output
    f.writelines(entry + "\n---------------------\n" for entry in structured_data)


In [12]:
# 4.0.1 Packages this notebook relies on. They are already installed here, so
#       the install commands sit inside a string literal and are NOT executed:
"""
!pip install langchain
!pip install tqdm
!pip install OpenAI
"""

'\n!pip install langchain\n!pip install tqdm\n!pip install OpenAI\n'

In [36]:
# 4.0.2 To wrangle our data:

import pandas as pd
import numpy as np

In [15]:
# 4.1 Read the structured data file back, line by line, into a list.
#     Travel plans and their separator lines are both elements of this list.

with open("structured_data.txt", mode="r") as file:
    lines = list(file)   # Same result as file.readlines(); line endings are kept
    

In [16]:
# 4.2 Glue the lines back into one text, cut it at every separator
#     ("---------------------") and keep the non-empty, whitespace-trimmed pieces:

entries = [
    trimmed
    for trimmed in (piece.strip() for piece in ''.join(lines).split("---------------------"))
    if trimmed != ""
]


In [17]:
# 4.3 Convert the list of entries into a NumPy array (one string per travel plan):

structured_data_array = np.array(entries)

# 4.4 Display the first five entries:

print(structured_data_array[:5])

['Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.'
 'Travelers: 7 persons, Destinations: Rio, New York, Cultural Interest: modern attractions, Travel Type: group, Language Preference: English, Duration: 5 weeks, Season: spring.'
 'Language Preference: local language, Destinations: Sydney, Paris, Transportation: bus, rented car, Meal Preference: fast food, Accommodation: rented apartment, Booking Mode: direct booking, Travel Type: backpacking.'
 'Travelers: 1 persons, Duration: 5 days, Booking Mode: last minute deals, Language Preference: English, Meal Preference: fast food, Destinations: Sydney, Beijing, Cairo, Season: winter, Guide Preference: private tour, Travel Type: backpacking.'
 'Season: fall, Duration: 6 weeks, Meal Preference: vegan options, Language Preference: English, Travel Type: family, Ac

In [18]:
# 5.0 Pick style of narration:
def pick_style():
    """Return the name of one writing style, drawn at random from a fixed set of ten."""
    candidate_styles = (
        "Narrative",
        "Persuasive",
        "Expository",
        "Journalistic",
        "Satirical",
        "Stream-of-Consciousness",
        "Epistolary",
        "Conversational",
        "Didactic",
        "Slang or Colloquial",
    )
    return random.choice(candidate_styles)

In [19]:
# 5.0.1 Draw a single style (illustration only; the value is not used further below)
chosen_style = pick_style()

# 5.0.2 Pair every structured plan with its own randomly drawn style
plan_list = [dict(user_plan=plan, style=pick_style()) for plan in structured_data_array]

# 5.0.3 Inspect the first pairing
plan_list[0]

{'user_plan': 'Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.',
 'style': 'Conversational'}

## Develop prompt and invoke llm chain

Steps:   
See this [Quick Reference](https://python.langchain.com/docs/modules/model_io/prompts/quick_start/)

>1. Create a demo prompt template with some variables    
>2. Create a proper prompt template    
>3. Create llm chain    
>4. Invoke chain & while invoking supply variable values   

In [None]:
# Which prompt template to use when:
#  PromptTemplate vs ChatPromptTemplate
# Refer: https://python.langchain.com/docs/modules/model_io/prompts/quick_start/#chatprompttemplate
"""

# When there is just one msg string:
PromptTemplate ("one string")

# Multiple messages: Maybe, one from System. another from human, another from ai 
ChatPromptTemplate([ ("one msg"),
                     ("IInd msg"),
                     ("IIIrd msg")
                   ]


"""

### 1.0 Create demo prompt template with variables

In [20]:
# 6.0 Use PromptTemplate when there is just one string, as here:
#     (When there is a list of messages, use ChatPromptTemplate)
from langchain import PromptTemplate


# 6.0.1  Demo template holding the instructions for the LLM.
#        {user_plan} and {style} are placeholders that get filled in
#        when the prompt is formatted.
demo_template = (
    "\n"
    "I want you to come up with a unstructered text for the following plan: {user_plan}.\n"
    "Use the writing style: {style}\n"
    "for example: I'm thinking of a solo trip for about 2 weeks, primarily focusing on "
    "sightseeing and museum visits in Paris. I have a budget of around $5000.\n"
)


### 2.0 Create PromptTemplate object
From demo template

In [21]:
# 6.1  From the demo template, create the actual prompt template:

prompt = PromptTemplate(
    template=demo_template,
    input_variables=["user_plan", "style"],
)

# 6.1.1 Fill the placeholders once to see the text the LLM will receive
prompt.format(user_plan=structured_data_array[0], style=pick_style())

"\nI want you to come up with a unstructered text for the following plan: Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites..\nUse the writing style: Conversational\nfor example: I'm thinking of a solo trip for about 2 weeks, primarily focusing on sightseeing and museum visits in Paris. I have a budget of around $5000.\n"

### 3.0 create llm chain

In [27]:
# from langchain.llms import ollama           # This library does not work. 
                                              #   if you want to use model parameters.
                                              #     Note small case 'o' in ollama
from langchain_community.llms import Ollama   #  This library works when you want to 
                                              #    specify ollama model parameters
                                              #     Note capital 'O' in Ollama
from langchain.chains import LLMChain

In [29]:
# llm = ollama()    # without any parameters

# Ollama-served "llama2" model used for the text generation below.
# NOTE(review): num_predict presumably caps the number of generated tokens; the
#               sample outputs further down stop mid-sentence -- confirm that
#               64 is intended for the final dataset.
llm = Ollama(       # with parameters
             model="llama2", 
             temperature=0.9,
             num_predict=64
            )

In [31]:
# Tie the prompt template and the LLM together into one chain object.
# NOTE(review): running the chain below prints a LangChain deprecation
#               warning -- TODO confirm whether LLMChain should be replaced.
chain=LLMChain(llm=llm,prompt=prompt)

### 4.0 Invoke chain. 
While invoking, supply variable values in a dictionary format

In [32]:
from tqdm import tqdm

# Run the chain on the first 10 plans only (each call goes through the LLM).
# LLMChain.invoke() returns a dict holding the inputs plus the generated text
# under the key "text". Keep only that text so `summary` is a list of plain
# strings, which is what the printing and DataFrame cells below expect.
summary = []
for item in tqdm(plan_list[:10], desc='Processing topics'):
    result = chain.invoke(item)
    # Tolerate chains that already return a bare string
    summary.append(result["text"] if isinstance(result, dict) else result)

  warn_deprecated(
Processing topics: 100%|████████████████████████| 10/10 [00:49<00:00,  4.99s/it]


In [33]:
# Show each structured plan next to the text generated from it.
# zip() stops at the shorter sequence, so only plans that have a summary are printed.
for struct, unstruct in zip(structured_data_array, summary):
    print(struct, unstruct, "-----------------------", sep="\n")

Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.
 Oh wow, you're planning an epic adventure! 😍 I can totally help you plan this luxurious 22-day trip to Paris! 🇫🇷 *adjusts sunglasses*

First things first, have you
-----------------------
Travelers: 7 persons, Destinations: Rio, New York, Cultural Interest: modern attractions, Travel Type: group, Language Preference: English, Duration: 5 weeks, Season: spring.
"Ah, an exciting journey awaits! A group of 7 intrepid travelers, with a keen interest in modern attractions, shall embark upon a thrilling adventure spanning five weeks and traversing the cultural hotspots of Rio and New York. English shall be the lingua
-----------------------
Language Preference: local language, Destinations: Sydney, Paris, Transportation: bus, rented car, Meal Preference: fast f

In [34]:
# Pair each structured plan with the text generated for it
topic_list = [
    dict(user_plan=user_plan, user_input=user_input)
    for user_plan, user_input in zip(structured_data_array, summary)
]

# Turn the list of records into a two-column DataFrame
df = pd.DataFrame(topic_list)

# Persist the dataset as CSV (without the index column) and preview the first rows
df.to_csv('dataset.csv', index=False, encoding='utf-8')
df.head()

Unnamed: 0,user_plan,user_input
0,"Booking Mode: direct booking, Accommodation: l...","Oh wow, you're planning an epic adventure! 😍 ..."
1,"Travelers: 7 persons, Destinations: Rio, New Y...","""Ah, an exciting journey awaits! A group of 7 ..."
2,"Language Preference: local language, Destinati...","Ah, an exciting adventure is coming your way!..."
3,"Travelers: 1 persons, Duration: 5 days, Bookin...","Ah, an adventurous traveler seeking to explore..."
4,"Season: fall, Duration: 6 weeks, Meal Preferen...","Ah, fall is one of my favorite seasons! And y..."


In [35]:
# Inspect the first record of the final dataset (structured plan + generated text)
topic_list[0]

{'user_plan': 'Booking Mode: direct booking, Accommodation: luxury resort, Activities: sightseeing, mountain climbing, Destinations: Paris, Language Preference: local language, Season: spring, Duration: 22 days, Cultural Interest: historical sites.',
 'user_input': " Oh wow, you're planning an epic adventure! 😍 I can totally help you plan this luxurious 22-day trip to Paris! 🇫🇷 *adjusts sunglasses*\n\nFirst things first, have you"}

In [None]:
########## DONE #############