In [29]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import string
import re

In [30]:
df_city = pd.read_csv("City_Dataset.csv")
df_city

Unnamed: 0,Response,Review,Rating,Context,Region,Unnamed: 5,Unnamed: 6
0,Aamby Valley City,aamby valley beautiful place clear blue skies ...,5.000000,"Aamby Valley City, nestled amidst the pictures...",West,,
1,Abhaneri,visit jaipur trip usually itinerary need ask t...,4.000000,"Abhaneri, a quaint village nestled in the Daus...",West,,
2,Achooranam,wayanad tea museum awesome place wayanad miss ...,4.000000,"Achooranam, situated amidst the picturesque la...",South,,
3,Acharapakkam,hill shrines situated tamil nadu history place...,3.500000,"Acharapakkam, situated in the Kanchipuram dist...",South,,
4,Achrol,say overwelhmed expecting world class service ...,4.000000,"Achrol, a quaint town nestled in the Aravalli ...",West,,
...,...,...,...,...,...,...,...
1090,Neil Island,clean n tidy n white sand beach water floating...,4.400000,"Shaheed Dweep, earlier known as Neil Island is...",south,,
1091,Nelliyampathy,visit place thrilling drives enjoy nature real...,4.600000,"From the town of Nenmara in Palakkad district,...",south,,
1092,Nelamangala,worth visit easily accessible crowded spite ni...,4.000000,Nelamangala is a city in India. Its taluk head...,south,,
1093,Nellore,powerfull ammavaru chengalamma thalli pls visi...,4.388889,Nellore is a city located on the banks of Penn...,South,,


In [31]:
df_city.columns

Index(['Response', 'Review', 'Rating', 'Context', 'Region', 'Unnamed: 5',
       'Unnamed: 6'],
      dtype='object')

In [32]:
df_city.drop(columns=['Unnamed: 5', 'Unnamed: 6'], inplace=True)
df_city

Unnamed: 0,Response,Review,Rating,Context,Region
0,Aamby Valley City,aamby valley beautiful place clear blue skies ...,5.000000,"Aamby Valley City, nestled amidst the pictures...",West
1,Abhaneri,visit jaipur trip usually itinerary need ask t...,4.000000,"Abhaneri, a quaint village nestled in the Daus...",West
2,Achooranam,wayanad tea museum awesome place wayanad miss ...,4.000000,"Achooranam, situated amidst the picturesque la...",South
3,Acharapakkam,hill shrines situated tamil nadu history place...,3.500000,"Acharapakkam, situated in the Kanchipuram dist...",South
4,Achrol,say overwelhmed expecting world class service ...,4.000000,"Achrol, a quaint town nestled in the Aravalli ...",West
...,...,...,...,...,...
1090,Neil Island,clean n tidy n white sand beach water floating...,4.400000,"Shaheed Dweep, earlier known as Neil Island is...",south
1091,Nelliyampathy,visit place thrilling drives enjoy nature real...,4.600000,"From the town of Nenmara in Palakkad district,...",south
1092,Nelamangala,worth visit easily accessible crowded spite ni...,4.000000,Nelamangala is a city in India. Its taluk head...,south
1093,Nellore,powerfull ammavaru chengalamma thalli pls visi...,4.388889,Nellore is a city located on the banks of Penn...,South


In [33]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/drishh207/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/drishh207/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/drishh207/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [34]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    text = str(text)
    # Step 1: Lowercasing
    text = text.lower()
    
    # Step 2: Tokenization
    tokens = word_tokenize(text)
    
    # Step 3: Remove Stop Words and Punctuations
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    
    # Step 4: Lemmatization
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    
    # Step 5: Remove Citations
    tokens = [re.sub(r'\[\d+\]', '', word) for word in tokens]  # Remove citations like [1]
    
    # Step 6: Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [35]:
df_city['Context_Refined'] = df_city['Context'].apply(preprocess_text)
df_city['Context_Refined'] = df_city['Context_Refined'].str.replace(" 's", "")
df_city

Unnamed: 0,Response,Review,Rating,Context,Region,Context_Refined
0,Aamby Valley City,aamby valley beautiful place clear blue skies ...,5.000000,"Aamby Valley City, nestled amidst the pictures...",West,aambi valley citi nestl amidst picturesqu sahy...
1,Abhaneri,visit jaipur trip usually itinerary need ask t...,4.000000,"Abhaneri, a quaint village nestled in the Daus...",West,abhaneri quaint villag nestl dausa district ra...
2,Achooranam,wayanad tea museum awesome place wayanad miss ...,4.000000,"Achooranam, situated amidst the picturesque la...",South,achooranam situat amidst picturesqu landscap i...
3,Acharapakkam,hill shrines situated tamil nadu history place...,3.500000,"Acharapakkam, situated in the Kanchipuram dist...",South,acharapakkam situat kanchipuram district tamil...
4,Achrol,say overwelhmed expecting world class service ...,4.000000,"Achrol, a quaint town nestled in the Aravalli ...",West,achrol quaint town nestl arav rang near jaipur...
...,...,...,...,...,...,...
1090,Neil Island,clean n tidy n white sand beach water floating...,4.400000,"Shaheed Dweep, earlier known as Neil Island is...",south,shahe dweep earlier known neil island island a...
1091,Nelliyampathy,visit place thrilling drives enjoy nature real...,4.600000,"From the town of Nenmara in Palakkad district,...",south,town nenmara palakkad district cloud-caress pe...
1092,Nelamangala,worth visit easily accessible crowded spite ni...,4.000000,Nelamangala is a city in India. Its taluk head...,south,nelamangala citi india taluk headquart locat b...
1093,Nellore,powerfull ammavaru chengalamma thalli pls visi...,4.388889,Nellore is a city located on the banks of Penn...,South,nellor citi locat bank penna river nellor dist...


In [36]:
df_city.iloc[396].Context_Refined

'dharmapuri citi north western part tamil nadu india serv administr headquart dharmapuri district first district creat tamil nadu independ india split salem district 2 octob 1965. citi locat 50 km krishnagiri 69 km salem 60 km tirupattur 90 km hosur 117 km thiruvannamalai 126 km bangalor erod 181 km tiruppur 200 km coimbator tiruchirapp 300 km madurai state capit chennai locat latitud n 11 47 ’ 12 33 ’ longitud e 77 02 ’ 78 40 ’ one major lead cultiv produc mango state along krishnagiri often refer mango capit india'

In [37]:
df_city.iloc[396].Context

'Dharmapuri is a city in the north western part of Tamil Nadu, India. It serves as the administrative headquarters of Dharmapuri district which is the first district created in Tamil Nadu after the independence of India by splitting it from then Salem district on 2 October 1965. The city is located 50 km from Krishnagiri, 69 km from Salem, 60 km from Tirupattur, 90 km from Hosur, 117 km from Thiruvannamalai, 126 km from Bangalore and Erode, 181 km from Tiruppur, 200 km from Coimbatore and Tiruchirappalli, 300 km from Madurai and the state capital Chennai. It is located between latitudes N 11 47’ and 12 33’ and longitudes E 77 02’ and 78 40’. It is one of the major leading cultivators and producers of mangoes in the state along with Krishnagiri, and is often referred as Mango Capital of India.'

In [47]:
df_city['Region'] = df_city['Region'].astype(str).str.lower()
df_city['training_data'] = df_city['Region'] + " " + df_city['Review'] + df_city['Context_Refined']

In [48]:
df_city

Unnamed: 0,Response,Review,Rating,Context,Region,Context_Refined,training_data
0,Aamby Valley City,aamby valley beautiful place clear blue skies ...,5.000000,"Aamby Valley City, nestled amidst the pictures...",west,aambi valley citi nestl amidst picturesqu sahy...,west aamby valley beautiful place clear blue s...
1,Abhaneri,visit jaipur trip usually itinerary need ask t...,4.000000,"Abhaneri, a quaint village nestled in the Daus...",west,abhaneri quaint villag nestl dausa district ra...,west visit jaipur trip usually itinerary need ...
2,Achooranam,wayanad tea museum awesome place wayanad miss ...,4.000000,"Achooranam, situated amidst the picturesque la...",south,achooranam situat amidst picturesqu landscap i...,south wayanad tea museum awesome place wayanad...
3,Acharapakkam,hill shrines situated tamil nadu history place...,3.500000,"Acharapakkam, situated in the Kanchipuram dist...",south,acharapakkam situat kanchipuram district tamil...,south hill shrines situated tamil nadu history...
4,Achrol,say overwelhmed expecting world class service ...,4.000000,"Achrol, a quaint town nestled in the Aravalli ...",west,achrol quaint town nestl arav rang near jaipur...,west say overwelhmed expecting world class ser...
...,...,...,...,...,...,...,...
1090,Neil Island,clean n tidy n white sand beach water floating...,4.400000,"Shaheed Dweep, earlier known as Neil Island is...",south,shahe dweep earlier known neil island island a...,south clean n tidy n white sand beach water fl...
1091,Nelliyampathy,visit place thrilling drives enjoy nature real...,4.600000,"From the town of Nenmara in Palakkad district,...",south,town nenmara palakkad district cloud-caress pe...,south visit place thrilling drives enjoy natur...
1092,Nelamangala,worth visit easily accessible crowded spite ni...,4.000000,Nelamangala is a city in India. Its taluk head...,south,nelamangala citi india taluk headquart locat b...,south worth visit easily accessible crowded sp...
1093,Nellore,powerfull ammavaru chengalamma thalli pls visi...,4.388889,Nellore is a city located on the banks of Penn...,south,nellor citi locat bank penna river nellor dist...,south powerfull ammavaru chengalamma thalli pl...


Ranked Retrieval Using langchain

In [23]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu

[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.43ubuntu1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a

In [24]:
!pip install langchain_core

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.43ubuntu1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [25]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

2024-04-15 22:52:09.964511: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-15 22:52:10.040202: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-15 22:52:10.246461: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-15 22:52:10.246519: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-15 22:52:10.247489: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [84]:
import io
from langchain.docstore.document import Document

def dataframe_to_list(df):
    data = []
    for index, row in df.iterrows():
        context = row['training_data']
        metadata = {'Response': row['Response'], 'Rating': row['Rating']}
        doc = Document(page_content=context, metadata=metadata)
        data.append(doc)
    return data

# Assuming 'df' is your DataFrame containing the data
result = dataframe_to_list(df_city)
with open('documents.txt', 'w') as f:
    for doc in result:
        f.write(str(doc) + '\n')

In [85]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the text you want to split, split the text into documents using the text splitter.
docs = text_splitter.split_documents(result)

docs[0]

Document(page_content='west aamby valley beautiful place clear blue skies fresh green grass family visited aamby valley celebrate mother birthday mom splendid time thanks pinky bharadwaj handling booking bombayaambi valley citi nestl amidst picturesqu sahyadri mountain maharashtra india travel seek blend luxuri adventur spiritu rejuven citi boast rang tourist attract includ icon aambi valley lake visitor indulg seren boat ride lakesid picnic amidst breathtak sceneri seek spiritu solac picturesqu st. mari church tranquil shri amba devi templ offer moment quiet contempl rever adventur enthusiast partak myriad thrill activ exhilar water sport jet ski kayak lake adrenaline-pump adventur like zip-lin rock climb rappel surround hill aambi valley citi seamlessli combin luxuri spiritu adventur promis travel unforgett experi amidst natur splendor sahyadri', metadata={'Response': 'Aamby Valley City', 'Rating': 5.0})

In [86]:
from langchain.embeddings import HuggingFaceEmbeddings
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [87]:
db = FAISS.from_documents(docs, embeddings)

In [93]:
db.save_local("faiss_index")
#new_db = FAISS. load_local("faiss_index", embeddings)
#docs = new_db. similarity_search(query)

In [100]:
question = "Places to visit in east india"
# Perform similarity search
new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
search_results = new_db.similarity_search_with_relevance_scores(question, k = 30)

search_results[0:10]

[(Document(page_content='east visited similar places avoided places difficult reach time consuming\nbest place worship chaibasa place hindu muslims bhai come worship\ncity centre park attraction town kids beautiful flowers swings attracts lot children\ngood place enjoy family outings evenings multiple installations garden', metadata={'Response': 'Chaibasa', 'Rating': 3.6}),
  0.5797643945037003),
 (Document(page_content='east central india mainly represented state madhya pradesh undoubtedly incredible regions visited extraordinary country known cities state small datia way gwalior\nfamous jain temples area altar parts temple maintained properly\nproper temple idol worship pandits energy place totally different place peaceful finds lot people sad hands\nreached place saturday right eyes line darshan kept growing growing like hanuman tail place revered devotees near far temples bagalamukhi dhumavati temple dhumavati', metadata={'Response': 'Datia', 'Rating': 3.6}),
  0.5618669881646403),

In [81]:
# Initialize a list to store the updated similarity scores
updated_results = []

# Iterate over each search result
for result, score in search_results:
    # Extract the 'Response' metadata value
    response = result.metadata['Response']
    
    # Find the corresponding row in the original DataFrame based on the 'Response' value
    row = df_city[df_city['Response'] == response]
    
    # Retrieve the rating from the corresponding row
    rating = row['Rating'].values[0]  # Assuming 'User_Rating' is the column name
    
    # Add the rating to the similarity score
    updated_score = score + rating
    
    # Append the updated score to the list
    updated_results.append((result, updated_score))

# Print the updated similarity scores
print(updated_results)

[(Document(page_content='synchronize enjoying path destination station occurs amazing roads biking valleys dams natural produces sums station memorizing experience experience need start journey early morning enjoymunnar town western ghat mountain rang india ’ kerala state hill station former resort british raj elit surround roll hill dot tea plantat establish late 19th centuri eravikulam nation park habitat endang mountain goat nilgiri tahr home lakkam waterfal hike trail 2,695m-tall anamudi peak', metadata={'Response': 'Munnar'}), 4.664632829525141), (Document(page_content='beautiful view hill nice treck suggest tourists read information board temple hill enjoy magnificent himalayan range view\ntemple near kanatal hardly min drive near dhanaulti altitude metres ft trek paved steep stairs concrete roads hairpin bends leadsdhanaulti quiet hill station elev 2286 meter sea level offer panoram view lofti himalaya situat foothil garhwal himalayan rang locat 40 km 25 mi new tehri district he

In [82]:
sorted_results = sorted(updated_results, key=lambda x: x[1], reverse=True)

# Print the sorted results
sorted_results[0:10]

[(Document(page_content='south searching relaxation good choice near really excellent greenishcheruthuruthi also known vallathol nagar small town india near wadakkancheri thrissur bank nila bharathapuzha river kozhimamparambu bhagavathi templ nedumpura kulasekharanellur siva templ nedumpura chirakkulangara templ kaipancheri narasimhamoorthi templ pangavu siva templ st. thoma church jumamosqu attract cheruthuruthi anoth place visit cheruthuruthi palac kochi maharaja recent convert 3-star ayurved heritag resort name river retreat palac ‘ kavalapara mooppil nair ’ locat 8 km cheruthuruthi anoth worth watch pilgrimag place cheruthuruthi shiva templ fit exampl kerala ’ tradit architectur kozhimamparambu pooram one highest crowd pooram thrissur pooram festiv conduct earli summer feb/march everi year seven team neighbor villag templ panjal pudusseri nedumpura cheruthuruthi pallikk thazhapra-vettikkattiri attoor particip kozhimamparambu pooram', metadata={'Response': 'Cheruthuruthi'}),
  5.396

API

In [7]:
from flask import Flask, jsonify, request
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize Flask app
app = Flask(__name__)

# Initialize Langchain components
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

# Define API routes
@app.route('/process_text', methods=['POST'])
def process_text():
    # Extract text from request
    #data = request.json
    #text = data['text']
    text = 'Religious places in north'
    
    # Split text into documents
    #docs = text_splitter.split_documents(text)
    
    # Embed documents
    #embeddings_results = embeddings.embed_documents(docs)
    
    # Process embeddings_results as needed
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    search_results = new_db.similarity_search_with_relevance_scores(text, k = 30)

    final_results = search_results[0:10]
    
    # Return response
    return jsonify(jsonify({'results': final_results}))

# Run Flask app
if __name__ == '__main__':
    app.run(port=9001, host='0.0.0.0', debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:9001
 * Running on http://192.168.51.85:9001
Press CTRL+C to quit
 * Restarting with stat
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/drishh207/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/home/drishh207/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/usr/lib/python3/dist-packages/ipykernel/kernelapp.py", line 632, in initialize
    self.init_sockets()
  File "/usr/lib/python3/dist-packages/ipykernel/kernelapp.py", line 2

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
