## DataThon 2025

In [1]:
import os

# Collect the full path of every entry in the raw Datathon data folder.

folder_path = "/Users/jeanmarckceant/Desktop/Datathon/Datathon2025_RAW_"
#folder_path = "/content/drive/SharedWithMe/Datathon2025_RAW"

# os.listdir yields bare entry names; join each one onto the folder to get a usable path.
file_paths = [
    os.path.join(folder_path, entry_name)
    for entry_name in os.listdir(folder_path)
]

# Show what was found, one path per line.
for path in file_paths:
    print(path)


/Users/jeanmarckceant/Desktop/Datathon/Datathon2025_RAW_/media.csv
/Users/jeanmarckceant/Desktop/Datathon/Datathon2025_RAW_/reviews.csv
/Users/jeanmarckceant/Desktop/Datathon/Datathon2025_RAW_/places.csv


In [2]:
#Required for the RAG

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from tqdm.notebook import tqdm

# Plot styling: select matplotlib's "seaborn-v0_8-whitegrid" style sheet,
# then apply seaborn's "whitegrid" theme.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_theme(style="whitegrid")

In [3]:
try:
  from sentence_transformers import SentenceTransformer
except:
  !pip install -q sentence-transformers
  from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence transformer model
sent_trans_model = SentenceTransformer('all-MiniLM-L6-v2')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Jeanmarck

# Loading the CSV files

In [10]:
# Read the three raw tables from the data folder configured in the first cell
# (folder_path), so moving the data (e.g. to the Colab location) only requires
# changing that one variable instead of three absolute paths.
places = pd.read_csv(os.path.join(folder_path, "places.csv"))
reviews = pd.read_csv(os.path.join(folder_path, "reviews.csv"))
media = pd.read_csv(os.path.join(folder_path, "media.csv"))

# Quick sanity check of the table sizes as (rows, columns).
print(places.shape)
print(reviews.shape)
print(media.shape)

places.head()

(1500, 8)
(15243, 2)
(37622, 2)


Unnamed: 0,place_id,name,neighborhood,latitude,longitude,tags,short_description,emoji
0,place_1,Public Records,Brooklyn,40.68227,-73.9864,"{night_club,cafe,bar,restaurant}",vinyl dance club,üíø
1,place_2,Silence Please,,40.71895,-73.9949,{cafe},vinyl cafe,üíø
2,place_3,schmuck.,,40.72637,-73.98647,{bar},craft cocktails,üç∏
3,place_4,The Django,Tribeca,40.71941,-74.00491,"{bar,night_club,restaurant}",underground jazz,üé∑
4,place_5,Honeycomb Hi-Fi Lounge,Park Slope,40.68077,-73.97775,{bar},listening bar,üéµ


# Preprocessing Step: Build Vibe Texts for Each Place

In this step, we are preparing the full text description for each place.
First, we combine all the reviews for each place into one paragraph.
Then, we pick one photo for each place from the media data.
After that, we merge the places data with the grouped reviews and selected photo.
Finally, we create a new column called 'combined_text_blob' that includes the name, neighborhood, tags, short description, and reviews for each place.
This text will be used to create embeddings for the vibe search later.


In [11]:
# Step 1: Merge reviews
# Collapse all reviews of a place into one space-separated string.
# Rows with a missing review_text are dropped first and the rest are cast to
# str, because ' '.join raises TypeError on NaN or any other non-string value.
reviews_grouped = (
    reviews.dropna(subset=['review_text'])
    .groupby('place_id')['review_text']
    .agg(lambda texts: ' '.join(texts.astype(str)))
    .reset_index()
)

# Step 2: Pick one photo per place
media_first = media.groupby('place_id')['media_url'].first().reset_index()

# Step 3: Merge everything together
# Left joins keep every place, including those without reviews or a photo
# (their review_text / media_url end up as NaN).
places_full = places.merge(reviews_grouped, on='place_id', how='left')
places_full = places_full.merge(media_first, on='place_id', how='left')

# Step 4: Build the "vibe paragraph" (combined text blob)
def create_text_blob(row):
    """Build the single-line text description of one place that gets embedded.

    Args:
        row: One row of ``places_full`` (any mapping indexable by column name
            works) providing the keys ``name``, ``neighborhood``, ``emoji``,
            ``tags``, ``short_description`` and ``review_text``.

    Returns:
        str: ``"Label: value"`` segments joined by ``" | "``, in the order
        Name, Neighborhood, Emoji, Tags, Short description, Reviews.
        A field whose value is missing (NaN/None) is left out entirely, so the
        text never contains placeholder tokens such as ``"Emoji: nan"``.

    Raises:
        KeyError: If ``row`` lacks one of the expected keys.
    """
    # (label shown in the text, column it is read from), in output order.
    labelled_columns = (
        ("Name", "name"),
        ("Neighborhood", "neighborhood"),
        ("Emoji", "emoji"),
        ("Tags", "tags"),
        ("Short description", "short_description"),
        ("Reviews", "review_text"),
    )

    parts = []
    for label, column in labelled_columns:
        value = row[column]
        # Skip missing values uniformly instead of only for some columns.
        if pd.notnull(value):
            parts.append(f"{label}: {value}")
    return ' | '.join(parts)

# Render the vibe paragraph for every place and store it next to the other columns.
places_full['combined_text_blob'] = places_full.apply(create_text_blob, axis=1)

# Plain Python list of the paragraphs, in the same row order as places_full.
vibe_texts = places_full['combined_text_blob'].to_list()


# Generate embeddings for each place

We use the SentenceTransformer model to convert the combined vibe text for each place into a numeric vector.
These embeddings capture the meaning and vibe of each place and will be used later to compare user queries to the places during search.

In [12]:
# Step 5: Create embeddings
# Encode every vibe text with the sentence-transformer model loaded earlier;
# show_progress_bar=True displays a per-batch progress bar while encoding.
embeddings = sent_trans_model.encode(vibe_texts, show_progress_bar=True)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

# Now there are two things
1. `places_full` - the main dataframe
2. `embeddings` - the array of vectors, one for each place

Add the embeddings to the main dataframe so that, when the user searches, the query can be compared to the embedding vectors.

In [13]:
import numpy as np

# Wrap the embedding vectors in a DataFrame with one column per dimension,
# named embed_0, embed_1, ...
embedding_dim = len(embeddings[0])
embedding_columns = ["embed_" + str(i) for i in range(embedding_dim)]

embeddings_df = pd.DataFrame(data=embeddings, columns=embedding_columns)

# Put the embedding columns to the right of the place columns. Resetting the
# index of places_full gives both frames the same default 0..n-1 index, so the
# rows line up by position.
places_full_with_embeddings = pd.concat(
    objs=[places_full.reset_index(drop=True), embeddings_df],
    axis="columns",
)


# Be careful: I'm saving the dataframe here so the previous cells do not need to be re-run

In [15]:

# Save the enriched table so the embedding steps above do not have to be re-run.
# The location is derived from folder_path (set in the first cell) instead of
# repeating the absolute path; index=False keeps the row index out of the file.
places_full_with_embeddings.to_csv(os.path.join(folder_path, "places_with_embeddings.csv"), index=False)
places_full_with_embeddings.head()


Unnamed: 0,place_id,name,neighborhood,latitude,longitude,tags,short_description,emoji,review_text,media_url,...,embed_374,embed_375,embed_376,embed_377,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383
0,place_1,Public Records,Brooklyn,40.68227,-73.9864,"{night_club,cafe,bar,restaurant}",vinyl dance club,üíø,the best place to dance until 4am in nyc. get ...,https://cdn.corner.inc/place-photo/AUjq9jnss_x...,...,0.017848,-0.053128,-0.115378,0.089903,-0.019357,0.070289,0.01521,-0.002804,-0.087874,-0.017765
1,place_2,Silence Please,,40.71895,-73.9949,{cafe},vinyl cafe,üíø,i heard they charge an entrance fee now at the...,https://cdn.corner.inc/place-photo/AWYs27xW6jd...,...,0.020004,0.040557,-0.050439,0.108197,-0.023294,0.014372,0.007188,0.030145,-0.082585,-0.004405
2,place_3,schmuck.,,40.72637,-73.98647,{bar},craft cocktails,üç∏,apparently this is very vibey and THE spot but...,https://cdn.corner.inc/ugc/0875b9e6-d6fe-4db1-...,...,0.070617,-0.053774,-0.037403,0.071637,0.03211,-0.020919,0.016921,0.070947,-0.082259,0.015914
3,place_4,The Django,Tribeca,40.71941,-74.00491,"{bar,night_club,restaurant}",underground jazz,üé∑,The prettiest jazz club I‚Äôve been! Good cockta...,https://cdn.corner.inc/place-photo/AUjq9jkq_2u...,...,0.009451,-0.018489,-0.076381,0.138031,-0.002393,-0.053571,-0.004497,-0.013319,-0.120357,-0.071296
4,place_5,Honeycomb Hi-Fi Lounge,Park Slope,40.68077,-73.97775,{bar},listening bar,üéµ,listening bar One of my favorite bars in NYC. ...,https://cdn.corner.inc/place-photo/cb4ddc19-d8...,...,0.000143,0.015622,-0.082808,0.075175,0.058516,-0.060124,0.087145,0.025067,-0.116184,0.023544


# Building the search engine

1. The user types a query
2. The query is embedded
3. The cosine similarity is calculated between the query and each place
4. The most similar places are returned

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# 1. Read the query from the user and embed it
query = input("Enter a Query: ")
query_embedding = sent_trans_model.encode([query])

# 2. Cosine similarity between the query and every place embedding
similarities = cosine_similarity(query_embedding, embeddings_df.values)[0]  # one score per place

# 3. Row positions of the best-scoring places, best first
top_k = 5
top_k_indices = similarities.argsort()[::-1][:top_k]  # sort descending, then keep the first top_k

# 4. Look up the matching rows and display their key columns
results = places_full_with_embeddings.iloc[top_k_indices]
results[['name', 'neighborhood', 'short_description', 'emoji']]


Unnamed: 0,name,neighborhood,short_description,emoji
442,Raines Law Room Chelsea,Chelsea,speakeasy,ü•É
876,230 Fifth Rooftop Bar,NoMad,rooftop lounge,üåÜ
919,Greenacre Park,Turtle Bay,waterfall garden,üå≥
623,Liz's Book Bar,Carroll Gardens Historic District,books and wine,üìö
840,Hi-Note,Alphabet City,cafe & workspace,‚òï


# Making a reusable search function

This function takes in a user's text query and finds the places that best match the vibe they are looking for.
It first converts the query into an embedding using the same SentenceTransformer model we used for the places.
Then, it calculates the cosine similarity between the query embedding and all the place embeddings.
 It sorts the places by similarity and returns the top results, showing the name, neighborhood, short description, and emoji for each place.
This allows users to search naturally, based on how a place feels, rather than relying only on keywords.


In [17]:
def vibe_search(query, top_k=5):
    """Return the places whose embedded description best matches a text query.

    Args:
        query: Free-text description of the vibe to search for.
        top_k: Maximum number of places to return (default 5). A value of 0 or
            less yields an empty result.

    Returns:
        pandas.DataFrame: The ``name``, ``neighborhood``, ``short_description``
        and ``emoji`` columns of the best-matching rows of
        ``places_full_with_embeddings``, ordered from most to least similar.
    """
    columns = ['name', 'neighborhood', 'short_description', 'emoji']

    if top_k <= 0:
        # Guard: the slice [-0:] selects the whole array, so without this check
        # top_k=0 would return every place instead of none.
        return places_full_with_embeddings.iloc[:0][columns]

    # Embed the query with the same model that was used for the places.
    query_embedding = sent_trans_model.encode([query])
    similarities = cosine_similarity(query_embedding, embeddings_df.values)[0]

    # Sort descending first, then truncate; row positions in embeddings_df match
    # those in places_full_with_embeddings.
    top_k_indices = similarities.argsort()[::-1][:top_k]
    return places_full_with_embeddings.iloc[top_k_indices][columns]


In [20]:
# Example: the five best matches for a free-text vibe query.
vibe_search("places for a rainy afternoon", top_k=5)

Unnamed: 0,name,neighborhood,short_description,emoji
442,Raines Law Room Chelsea,Chelsea,speakeasy,ü•É
876,230 Fifth Rooftop Bar,NoMad,rooftop lounge,üåÜ
919,Greenacre Park,Turtle Bay,waterfall garden,üå≥
623,Liz's Book Bar,Carroll Gardens Historic District,books and wine,üìö
840,Hi-Note,Alphabet City,cafe & workspace,‚òï


# Running a Streamlit app from Python in Google Colab

In [21]:
!pip install streamlit

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
