## Pre-processing
1. Collect Yelp business data for the city's POIs
2. Align SafeGraph and Yelp at the POI level
3. Collect Yelp reviews and images for the city's POIs
4. Generate descriptive text for each image with GPT-4V


In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
import ast
from tqdm import tqdm
import shutil
from shapely.geometry import Point
from numpy import radians, sin, cos, arcsin, sqrt

In [2]:
# Target city. ATTENTION: change the city name here to process another city.
city = 'New Orleans'

# Root folder of the raw Yelp dump and folder of the per-city GeoJSON files.
yelp_path = '/data/rawdata/Yelp/'
geojson_path = '/data/rawdataset/city_geojson/'

# Yelp business table inside the Yelp dump.
yelp_business_file_name = 'yelp_academic_dataset_business.json'
yelp_business_file_path = f'{yelp_path}{yelp_business_file_name}'

## Collect the city's Yelp businesses

business_df (150346, 14) ['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours']
       

In [None]:
# The file holds one JSON object per line, so it has to be parsed line by line
# rather than with a single json.load().
all_data_temp = []
with open(yelp_business_file_path, 'r', encoding='utf-8') as file:
    # Text file objects have no lineno() method; enumerate supplies the
    # 1-based line number used in the error message.
    for line_number, line in enumerate(file, start=1):
        try:
            data_temp = json.loads(line)
            all_data_temp.append(data_temp)
        except json.JSONDecodeError as e:  # no error was raised on the original run
            print(f"Error parsing JSON object on line: {line_number} - {e}")
business_df = pd.DataFrame(all_data_temp)
# Shape on the original run: (150346, 14)
# Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
#        'latitude', 'longitude', 'stars', 'review_count', 'is_open',
#        'attributes', 'categories', 'hours'],
#       dtype='object')

business_df.head(5)

In [None]:
# Boundary polygons of the target city (one row per census block group).
city_geojson = gpd.read_file(geojson_path+f'{city}.geojson')
city_cbgs = list(city_geojson.census_block_group)

# Flag every Yelp business whose coordinates fall inside any city polygon.
# `tqdm` is imported as the class itself (`from tqdm import tqdm`), so it is
# called directly; `total` lets the bar show progress for the zipped iterator.
city_polygons = list(city_geojson.geometry)
within_city = [
    any(polygon.contains(Point(lon, lat)) for polygon in city_polygons)
    for lon, lat in tqdm(zip(business_df['longitude'], business_df['latitude']),
                         total=len(business_df))
]

# Select with a boolean mask instead of rebuilding the frame from row Series:
# this keeps the column dtypes, and there is no helper column to drop afterwards
# (the original tried to drop a 'within_city' column that was never created).
business_df = business_df.loc[
    pd.Series(within_city, index=business_df.index, dtype=bool)
].reset_index(drop=True)

business_df.to_csv(yelp_path+f'/yelp_business_city/{city}_business.csv',index=False)

business_df.head()

## concat SafeGraph and Yelp
poi_datasets_df(safegraph) 

business_df(yelp)  14563
result_df  (5965,)
['placekey', 'location_name','visitor_home_cbgs', 'latitude',
       'longitude','poi_cbg', 'business_id', 'name', 'address', 'stars', 'review_count', 'is_open', 'attributes', 'categories',
       'hours']

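POI counts per city, written as `SafeGraph POIs, Yelp POIs --> matched POIs after filtering` (only the Yelp count is given for Philadelphia):

- Philadelphia: 14563 --> 5340 (not verified)
- Tucson: 10014, 7502 --> 2703
- Tampa: 10302, 6599 --> 2222
- Reno: 6016, 6373 --> 1866
- New Orleans: 7145, 6261 --> 2392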

In [3]:
# Per-city SafeGraph tables, all keyed by `placekey`.
poi_location = pd.read_csv(f'data/{city}/{city}_poi_location.csv')
poi_visitor = pd.read_csv(f'data/{city}/{city}_poi_visitor.csv')

# POI name / CBG, joined with the visitor and location tables, then ordered by
# coordinates (ascending is the sort_values default).
poi_datasets_df = (
    pd.read_csv(f'data/{city}/{city}_poi_features.csv',
                usecols=['placekey', 'location_name', 'poi_cbg'])
    .merge(poi_visitor, how='inner', on='placekey')
    .merge(poi_location, how='inner', on='placekey')
    .sort_values(by=['latitude', 'longitude'])
)

# Yelp businesses of the city written by the previous section, same ordering.
business_df = (
    pd.read_csv(yelp_path+f'/yelp_business_city/{city}_business.csv')
    .sort_values(by=['latitude', 'longitude'])
)
print(poi_datasets_df.columns,business_df.columns)

Index(['placekey', 'location_name', 'poi_cbg', 'visitor_home_cbgs', 'latitude',
       'longitude'],
      dtype='object') Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')


In [4]:
def disN7(lon1, lat1, lon2, lat2):
    """Haversine great-circle distance between two coordinates.

    All four arguments are in degrees. Because only numpy ufuncs are used,
    each argument may be a scalar or an array-like (e.g. a pandas Series);
    the result broadcasts accordingly.

    Returns the distance scaled by a sphere radius of 6371 (i.e. kilometres
    for the Earth's mean radius).
    """
    sphere_radius = 6371
    lon1_rad, lat1_rad, lon2_rad, lat2_rad = (
        radians(value) for value in (lon1, lat1, lon2, lat2)
    )
    delta_lon = lon2_rad - lon1_rad
    delta_lat = lat2_rad - lat1_rad
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    haversine = (
        sin(delta_lat / 2) ** 2
        + cos(lat1_rad) * cos(lat2_rad) * sin(delta_lon / 2) ** 2
    )
    central_angle = 2 * arcsin(sqrt(haversine))
    return central_angle * sphere_radius

In [5]:
# Candidate pairs: a SafeGraph POI and a Yelp business with exactly the same name.
matched_df = pd.merge(poi_datasets_df, business_df, left_on='location_name',right_on='name', how='inner', indicator=True)

# Maximum distance, in the unit returned by disN7, for a same-name pair to be
# accepted as the same place.
therohold = 0.2

grouped = matched_df.groupby('placekey')

# Collect the accepted rows in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration.
kept_groups = []
for (placekey, group) in tqdm(grouped):
    # Work on a copy so that adding a column does not write into a groupby slice.
    group = group.copy()
    # Distance from the SafeGraph POI (first row of the group) to every Yelp
    # candidate. A one-row group is just the length-1 case of the same formula.
    group['disN7'] = disN7(group['longitude_x'].iloc[0], group['latitude_x'].iloc[0],
                           group['longitude_y'], group['latitude_y'])
    min_disN7_row = group[group['disN7'] == group['disN7'].min()]
    # `min_disN7_row` is empty when every distance is NaN (missing coordinates).
    if not min_disN7_row.empty and min_disN7_row['disN7'].iloc[0] < therohold:
        kept_groups.append(min_disN7_row)

if kept_groups:
    result_df = pd.concat(kept_groups)
else:
    # No pair accepted: keep the expected columns so the steps below still run.
    result_df = matched_df.head(0).assign(disN7=pd.Series(dtype=float))

print(result_df.shape)
# A Yelp business may be the nearest candidate of several SafeGraph POIs; keep
# only the closest pairing for each business.
result_df = result_df.sort_values(by='disN7').drop_duplicates(subset='business_id', keep='first').reset_index(drop=True)
print(result_df.shape)
result_df.head()

100%|██████████| 2894/2894 [00:14<00:00, 204.13it/s]

(2598, 22)
(2588, 22)





Unnamed: 0,placekey,location_name,poi_cbg,visitor_home_cbgs,latitude_x,longitude_x,business_id,name,address,city,...,latitude_y,longitude_y,stars,review_count,is_open,attributes,categories,hours,_merge,disN7
0,222-222@8dy-qv3-9xq,Big Easy Fresh Market,220710000000.0,"{""221030408035"": 75.80835959193861, ""220710024...",29.965759,-90.086833,115LqFpJV04pVbl0TMqAog,Big Easy Fresh Market,2669 Canal St,New Orleans,...,29.965759,-90.086833,4.5,22,1,"{'Caters': 'True', 'BusinessAcceptsCreditCards...","Food, Grocery","{'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ...",both,3.6e-05
1,22x-222@8dy-rfj-bzf,Freret Hardware,220710100000.0,"{""220710128003"": 78.9006265727704, ""2207101170...",29.935142,-90.109681,gn3iHULx6eHKLQDLp5_Bcw,Freret Hardware,5109 Freret St,New Orleans,...,29.935143,-90.10968,4.0,17,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Shopping, Kitchen & Bath, Home & Garden, Hardw...","{'Monday': '8:0-16:0', 'Tuesday': '8:0-16:0', ...",both,0.000124
2,22h-222@8dy-qv4-3qz,Bud Rip's Old 9th Ward Bar,220710000000.0,"{""220710039001"": 88.52431470759399, ""220710112...",29.964967,-90.042379,7n1S1ehbFA3oL8SoB_oP3w,Bud Rip's Old 9th Ward Bar,900 Piety,New Orleans,...,29.964965,-90.042379,3.5,40,1,"{'OutdoorSeating': 'False', 'HasTV': 'True', '...","Bars, Nightlife",,both,0.000179
3,222-222@8dy-qsk-575,Pontilly Sno,220710100000.0,"{""220710017223"": 252.58972796691998, ""22071001...",30.002249,-90.043685,ZDDjY5Fq8PVX793gSxDSMQ,Pontilly Sno,3968 Old Gentilly Rd,New Orleans,...,30.00225,-90.043684,4.5,11,1,"{'WiFi': ""u'free'"", 'RestaurantsPriceRange2': ...","Ice Cream & Frozen Yogurt, Shaved Ice, Food","{'Monday': '0:0-0:0', 'Tuesday': '10:0-19:0', ...",both,0.000182
4,229-222@8dy-rf7-649,French Riviera Spa,220710100000.0,"{""220510249002"": 103.42595947014821, ""22071010...",29.962419,-90.117967,_y1Sa3lMU51_C0sR8hb5vQ,French Riviera Spa,8350 Earhart Blvd,New Orleans,...,29.962417,-90.117967,2.0,15,0,"{'GoodForKids': 'False', 'BusinessAcceptsCredi...","Trainers, Fitness & Instruction, Gyms, Active ...","{'Monday': '5:0-22:0', 'Tuesday': '5:0-22:0', ...",both,0.000191


In [6]:
# Drop the Yelp-side coordinates and the matching helper columns, keep the
# SafeGraph coordinates under their plain names, and order rows by placekey.
columns_to_drop = ['latitude_y', 'longitude_y', '_merge', 'disN7', 'city', 'state', 'postal_code']  # 'geometry',
result_df = (
    result_df
    .drop(columns=columns_to_drop)
    .rename(columns={'latitude_x': 'latitude', 'longitude_x': 'longitude'})
    .sort_values(by='placekey')
    .reset_index(drop=True)
)
print(f"{city} city: {poi_datasets_df.shape[0]} POI in SafeGraph,{business_df.shape[0]} POI in Yelp, "
      f"{result_df.shape[0]} POI matched")

New Orleans city: 7145 POI in SafeGraph,6261 POI in Yelp, 2588 POI matched


In [7]:
# Keep only POIs with enough visitor data: visitors coming from at least 5
# distinct home CBGs and at least 500 visitors in total.
# `visitor_home_cbgs` is a dict literal stored as a string; parse it once and
# derive both statistics from the parsed dict (the number of keys is the number
# of distinct home CBGs).
visitor_home_cbgs_parsed = result_df['visitor_home_cbgs'].apply(ast.literal_eval)
result_df = result_df.assign(
    different_cbg_num=visitor_home_cbgs_parsed.apply(len),
    sum_visitors=visitor_home_cbgs_parsed.apply(lambda cbg_counts: sum(cbg_counts.values())),
)
result_df = result_df[(result_df.different_cbg_num >= 5) & (result_df.sum_visitors >= 500)]
print(f"{result_df.shape[0]} after filted")
result_df.head()

2392 after filted


Unnamed: 0,placekey,location_name,poi_cbg,visitor_home_cbgs,latitude,longitude,business_id,name,address,stars,review_count,is_open,attributes,categories,hours,different_cbg_num,sum_visitors
0,222-222@8dy-qs9-p9z,The Radiator Shop,220710000000.0,"{""220710017351"": 292.0178718135912, ""220710033...",30.007954,-90.020985,89xXxx8dFbT7mVmOyH82jA,The Radiator Shop,3722 Downman Rd,3.5,5,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Parts & Supplies, Home Services, Auto Rep...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",25,3262.656319
1,222-222@8dy-qs9-pd9,Daiquiri Chef,220710000000.0,"{""220710017511"": 643.0514205032194, ""220710017...",30.008492,-90.021147,Ggh6vaYzcOGoQ57w1NOFCQ,Daiquiri Chef,3738 Downman Rd,4.0,14,0,"{'BikeParking': 'True', 'OutdoorSeating': 'Fal...","Nightlife, Bars, Cocktail Bars, Drive-Thru Bar...","{'Monday': '14:0-23:0', 'Tuesday': '15:0-23:0'...",65,8229.103764
2,222-222@8dy-qsb-249,Wendy's,220710000000.0,"{""220710017352"": 757.0820159395057, ""220710033...",30.006397,-90.034648,XJQTL0Sjv2eWCdE4_c1J4A,Wendy's,4605 Chef Menteur Hwy.,2.5,14,1,"{'GoodForKids': 'True', 'BusinessParking': ""{'...","Fast Food, Restaurants, Hot Dogs, Burgers","{'Monday': '10:0-2:0', 'Tuesday': '10:0-2:0', ...",194,39393.926626
3,222-222@8dy-qsb-3bk,McDonald's,220710100000.0,"{""220710017203"": 1810.766465233393, ""220710017...",30.004355,-90.036442,-9yzQQ0d_rcOD2CzdTNO_Q,McDonald's,4240 Louisa St,2.0,20,1,"{'HasTV': 'True', 'NoiseLevel': ""u'loud'"", 'Re...","Fast Food, Restaurants, Coffee & Tea, Food, Bu...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",288,88230.90987
4,222-222@8dy-qsc-9j9,Pepperoni Ray's Cafe,220710000000.0,"{""220710138002"": 122.33900789319748, ""22071002...",30.013102,-90.050696,uM1ahq0ZG5utDIV3JwjHyA,Pepperoni Ray's Cafe,5339 Franklin Ave,4.5,42,1,"{'GoodForMeal': ""{'dessert': False, 'latenight...","Cafes, Pizza, Chicken Wings, Restaurants, Salad","{'Monday': '15:0-20:0', 'Tuesday': '15:0-20:0'...",15,1839.890153


# get Review

['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date']
       6990280

In [8]:
yelp_reviews_file_name = 'yelp_academic_dataset_review.json'
yelp_reviews_file_path = yelp_path+yelp_reviews_file_name

# One JSON object per line: parse the review file line by line.
all_data_temp=[]
business_id_temp=[]  # never filled in this cell; kept in case other cells reference it
with open(yelp_reviews_file_path, 'r', encoding='utf-8') as file:
    # Text file objects have no lineno() method; enumerate supplies the
    # 1-based line number used in the error message.
    for line_number, line in enumerate(file, start=1):
        try:
            data_temp = json.loads(line)
            all_data_temp.append(data_temp)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON object on line: {line_number} - {e}")
review_df = pd.DataFrame(all_data_temp)

# print(review_df.shape) #(6990280, 9)
# print(review_df.columns) #['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']
review_df.head(5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3.0,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5.0,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3.0,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5.0,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4.0,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [9]:
# Keep only the reviews of matched businesses, ordered by business.
# Chaining returns a new frame, so nothing is modified in place on the
# boolean-filtered slice (which would trigger SettingWithCopyWarning).
filtered_review_df = (
    review_df[review_df['business_id'].isin(result_df['business_id'])]
    .sort_values(by='business_id')
    .reset_index(drop=True)
)
del review_df  # the full review table is large and no longer needed

print(filtered_review_df.shape)
# filtered_review_df.head()

(356016, 9)


In [10]:
# Review fields stored for each POI.
review_columns = ['stars', 'useful', 'funny', 'cool', 'text', 'date']

grouped = filtered_review_df.groupby('business_id')

# business_id -> list of review dicts. to_dict('records') builds the dicts for a
# whole group at once instead of iterating over its rows.
reviews_by_business = {
    business_id: group[review_columns].to_dict('records')
    for business_id, group in tqdm(grouped)
}

# Diagnostic kept from the original loop: show POIs that share a business_id.
duplicated_business_rows = result_df[result_df['business_id'].duplicated(keep=False)]
if not duplicated_business_rows.empty:
    print(duplicated_business_rows)

# One dictionary lookup per POI instead of scanning result_df once per business.
# POIs without any review get None.
result_df['review'] = pd.Series(
    [reviews_by_business.get(business_id) for business_id in result_df['business_id']],
    index=result_df.index,
    dtype=object,
)


100%|██████████| 2392/2392 [00:37<00:00, 63.70it/s] 


In [11]:
# print(type(result_df.loc[0,'review'])) #list
# POIs without reviews hold None in `review`; count them as 0, the same way
# `images_num` is computed for POIs without images.
result_df['review_num']=result_df['review'].apply(lambda x:len(x) if x is not None else 0)
print(f"review_num min:{result_df.review_num.min()}")
result_df.head()

review_num min:5


Unnamed: 0,placekey,location_name,poi_cbg,visitor_home_cbgs,latitude,longitude,business_id,name,address,stars,review_count,is_open,attributes,categories,hours,different_cbg_num,sum_visitors,review,review_num
0,222-222@8dy-qs9-p9z,The Radiator Shop,220710000000.0,"{""220710017351"": 292.0178718135912, ""220710033...",30.007954,-90.020985,89xXxx8dFbT7mVmOyH82jA,The Radiator Shop,3722 Downman Rd,3.5,5,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Parts & Supplies, Home Services, Auto Rep...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",25,3262.656319,"[{'stars': 5.0, 'useful': 5, 'funny': 1, 'cool...",8
1,222-222@8dy-qs9-pd9,Daiquiri Chef,220710000000.0,"{""220710017511"": 643.0514205032194, ""220710017...",30.008492,-90.021147,Ggh6vaYzcOGoQ57w1NOFCQ,Daiquiri Chef,3738 Downman Rd,4.0,14,0,"{'BikeParking': 'True', 'OutdoorSeating': 'Fal...","Nightlife, Bars, Cocktail Bars, Drive-Thru Bar...","{'Monday': '14:0-23:0', 'Tuesday': '15:0-23:0'...",65,8229.103764,"[{'stars': 5.0, 'useful': 1, 'funny': 0, 'cool...",14
2,222-222@8dy-qsb-249,Wendy's,220710000000.0,"{""220710017352"": 757.0820159395057, ""220710033...",30.006397,-90.034648,XJQTL0Sjv2eWCdE4_c1J4A,Wendy's,4605 Chef Menteur Hwy.,2.5,14,1,"{'GoodForKids': 'True', 'BusinessParking': ""{'...","Fast Food, Restaurants, Hot Dogs, Burgers","{'Monday': '10:0-2:0', 'Tuesday': '10:0-2:0', ...",194,39393.926626,"[{'stars': 4.0, 'useful': 0, 'funny': 0, 'cool...",15
3,222-222@8dy-qsb-3bk,McDonald's,220710100000.0,"{""220710017203"": 1810.766465233393, ""220710017...",30.004355,-90.036442,-9yzQQ0d_rcOD2CzdTNO_Q,McDonald's,4240 Louisa St,2.0,20,1,"{'HasTV': 'True', 'NoiseLevel': ""u'loud'"", 'Re...","Fast Food, Restaurants, Coffee & Tea, Food, Bu...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",288,88230.90987,"[{'stars': 4.0, 'useful': 1, 'funny': 1, 'cool...",20
4,222-222@8dy-qsc-9j9,Pepperoni Ray's Cafe,220710000000.0,"{""220710138002"": 122.33900789319748, ""22071002...",30.013102,-90.050696,uM1ahq0ZG5utDIV3JwjHyA,Pepperoni Ray's Cafe,5339 Franklin Ave,4.5,42,1,"{'GoodForMeal': ""{'dessert': False, 'latenight...","Cafes, Pizza, Chicken Wings, Restaurants, Salad","{'Monday': '15:0-20:0', 'Tuesday': '15:0-20:0'...",15,1839.890153,"[{'stars': 1.0, 'useful': 0, 'funny': 0, 'cool...",42


# Get images
images_df  200100
['photo_id', 'business_id', 'caption', 'label']

In [12]:
yelp_images_file_name = 'photos.json'
yelp_images_file_path = yelp_path+yelp_images_file_name

# One JSON object per line: parse the photo metadata file line by line.
all_data_temp=[]
business_id_temp=[]  # never filled in this cell; kept in case other cells reference it
with open(yelp_images_file_path, 'r', encoding='utf-8') as file:
    # Text file objects have no lineno() method; enumerate supplies the
    # 1-based line number used in the error message.
    for line_number, line in enumerate(file, start=1):
        try:
            data_temp = json.loads(line)
            all_data_temp.append(data_temp)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON object on line: {line_number} - {e}")
images_df = pd.DataFrame(all_data_temp)

print(images_df.shape)
print(images_df.columns)
images_df.head(5)

(200100, 4)
Index(['photo_id', 'business_id', 'caption', 'label'], dtype='object')


Unnamed: 0,photo_id,business_id,caption,label
0,zsvj7vloL4L5jhYyPIuVwg,Nk-SJhPlDBkAZvfsADtccA,Nice rock artwork everywhere and craploads of ...,inside
1,HCUdRJHHm_e0OCTlZetGLg,yVZtL5MmrpiivyCIrVkGgA,,outside
2,vkr8T0scuJmGVvN2HJelEA,_ab50qdWOk0DdB6XOrBitw,oyster shooter,drink
3,pve7D6NUrafHW3EAORubyw,SZU9c8V2GuREDN5KgyHFJw,Shrimp scampi,food
4,H52Er-uBg6rNrHcReWTD2w,Gzur0f0XMkrVxIwYJvOt2g,,food


In [13]:
# Keep only the photos of matched businesses, ordered by business.
# Chaining returns a new frame, so nothing is modified in place on the
# boolean-filtered slice (which would trigger SettingWithCopyWarning).
filtered_images_df = (
    images_df[images_df['business_id'].isin(result_df['business_id'])]
    .sort_values(by='business_id')
    .reset_index(drop=True)
)
del images_df  # the full photo table is no longer needed

print(filtered_images_df.shape)
filtered_images_df.head()

(11576, 4)


Unnamed: 0,photo_id,business_id,caption,label
0,mE4cpHO-PFul6-cuCI6zeA,-0__F9fnKt8uioCKztF5Ww,Special Cocktail!,drink
1,ryMJZ3-87GYPT-eU2m2IIw,-0__F9fnKt8uioCKztF5Ww,,drink
2,pQC1WC7E2e0VQ4LPX4gqFw,-0__F9fnKt8uioCKztF5Ww,"""Drunk Tank""",drink
3,kgPd7SPAx15B2obdxH-Nmw,-0__F9fnKt8uioCKztF5Ww,Awesome spicy cocktails!,drink
4,zGWdngoiZa3jEMi9SycKvQ,-0__F9fnKt8uioCKztF5Ww,,drink


In [14]:
# Photo fields stored for each POI.
image_columns = ['photo_id', 'caption', 'label']

grouped = filtered_images_df.groupby('business_id')

# business_id -> list of photo dicts, built per group with to_dict('records')
# instead of iterating over rows.
images_by_business = {
    business_id: group[image_columns].to_dict('records')
    for business_id, group in tqdm(grouped)
}

# One dictionary lookup per POI; POIs without photos get None.
result_df['images'] = pd.Series(
    [images_by_business.get(business_id) for business_id in result_df['business_id']],
    index=result_df.index,
    dtype=object,
)

result_df['images_num']=result_df['images'].apply(lambda x:len(x) if x is not None else 0)

print(result_df.columns)
print(result_df.shape)
# The count below is of POIs with images_num > 0, i.e. at least one image
# (the message used to say "at least 5 images").
print(f"{(result_df['images_num']>0).sum()} POI have at least 1 image.")
result_df.head()

100%|██████████| 1136/1136 [00:02<00:00, 537.32it/s]

Index(['placekey', 'location_name', 'poi_cbg', 'visitor_home_cbgs', 'latitude',
       'longitude', 'business_id', 'name', 'address', 'stars', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'different_cbg_num',
       'sum_visitors', 'review', 'review_num', 'images', 'images_num'],
      dtype='object')
(2392, 21)
1136 POI have at least 5 images.





Unnamed: 0,placekey,location_name,poi_cbg,visitor_home_cbgs,latitude,longitude,business_id,name,address,stars,...,is_open,attributes,categories,hours,different_cbg_num,sum_visitors,review,review_num,images,images_num
0,222-222@8dy-qs9-p9z,The Radiator Shop,220710000000.0,"{""220710017351"": 292.0178718135912, ""220710033...",30.007954,-90.020985,89xXxx8dFbT7mVmOyH82jA,The Radiator Shop,3722 Downman Rd,3.5,...,1,{'BusinessAcceptsCreditCards': 'True'},"Auto Parts & Supplies, Home Services, Auto Rep...","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",25,3262.656319,"[{'stars': 5.0, 'useful': 5, 'funny': 1, 'cool...",8,,0
1,222-222@8dy-qs9-pd9,Daiquiri Chef,220710000000.0,"{""220710017511"": 643.0514205032194, ""220710017...",30.008492,-90.021147,Ggh6vaYzcOGoQ57w1NOFCQ,Daiquiri Chef,3738 Downman Rd,4.0,...,0,"{'BikeParking': 'True', 'OutdoorSeating': 'Fal...","Nightlife, Bars, Cocktail Bars, Drive-Thru Bar...","{'Monday': '14:0-23:0', 'Tuesday': '15:0-23:0'...",65,8229.103764,"[{'stars': 5.0, 'useful': 1, 'funny': 0, 'cool...",14,"[{'photo_id': 'Q10coUruBbaPlS2uL9DHww', 'capti...",1
2,222-222@8dy-qsb-249,Wendy's,220710000000.0,"{""220710017352"": 757.0820159395057, ""220710033...",30.006397,-90.034648,XJQTL0Sjv2eWCdE4_c1J4A,Wendy's,4605 Chef Menteur Hwy.,2.5,...,1,"{'GoodForKids': 'True', 'BusinessParking': ""{'...","Fast Food, Restaurants, Hot Dogs, Burgers","{'Monday': '10:0-2:0', 'Tuesday': '10:0-2:0', ...",194,39393.926626,"[{'stars': 4.0, 'useful': 0, 'funny': 0, 'cool...",15,,0
3,222-222@8dy-qsb-3bk,McDonald's,220710100000.0,"{""220710017203"": 1810.766465233393, ""220710017...",30.004355,-90.036442,-9yzQQ0d_rcOD2CzdTNO_Q,McDonald's,4240 Louisa St,2.0,...,1,"{'HasTV': 'True', 'NoiseLevel': ""u'loud'"", 'Re...","Fast Food, Restaurants, Coffee & Tea, Food, Bu...","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",288,88230.90987,"[{'stars': 4.0, 'useful': 1, 'funny': 1, 'cool...",20,,0
4,222-222@8dy-qsc-9j9,Pepperoni Ray's Cafe,220710000000.0,"{""220710138002"": 122.33900789319748, ""22071002...",30.013102,-90.050696,uM1ahq0ZG5utDIV3JwjHyA,Pepperoni Ray's Cafe,5339 Franklin Ave,4.5,...,1,"{'GoodForMeal': ""{'dessert': False, 'latenight...","Cafes, Pizza, Chicken Wings, Restaurants, Salad","{'Monday': '15:0-20:0', 'Tuesday': '15:0-20:0'...",15,1839.890153,"[{'stars': 1.0, 'useful': 0, 'funny': 0, 'cool...",42,,0


In [15]:
# Write the matched POI table (with reviews and images) to the city's data folder.
# Later cells read this file back and parse the `images` column with ast.literal_eval.
output_csv_path = f"data/{city}/{city}_poi_with_yelp_review_image.csv"
result_df.to_csv(output_csv_path, index=False)

# Get image_text by GPT4v

In [19]:
# Total number of images over all processed cities, read from each city's saved CSV.
city_name = ['New Orleans','Tampa','Reno','Tucson']
image_num = 0
# NOTE: the loop variable rebinds the notebook-level `city`, and `result_df` is
# overwritten on every iteration. After this cell both refer to the LAST city in
# `city_name`, and `result_df` is the CSV version of the table (its `images`
# column is a string, not a list).
for city in city_name:
    result_df = pd.read_csv(f'data/{city}/{city}_poi_with_yelp_review_image.csv')
    image_num += result_df.images_num.sum()
print(image_num)

26736


In [32]:
import os

# Read the saved table for the current `city` explicitly. The loop below parses
# `images` with ast.literal_eval, which needs the string form stored in the CSV;
# relying on whichever `result_df` happens to be in memory is fragile.
result_df = pd.read_csv(f'data/{city}/{city}_poi_with_yelp_review_image.csv')

# Destination folder for the renamed copies; created once up front.
city_images_dir = yelp_path+f'/{city}_images'
os.makedirs(city_images_dir, exist_ok=True)

for index, row in result_df.iterrows():
    if not row['images_num']>0:
        continue
    placekey_value = row['placekey']
    images_list = ast.literal_eval(row['images'])
    # The position `i` in the image list becomes part of the file name, so a
    # missing photo leaves a gap instead of shifting the later ones.
    for i, image_dict in enumerate(images_list):
        photo_id = image_dict.get('photo_id')
        if not photo_id:
            print(f'Photo ID not found in image dict at index {index}')
            continue
        original_photo_path = yelp_path+f'/photos/{photo_id}.jpg'
        if not os.path.exists(original_photo_path):
            print(f'File not found: {original_photo_path}')
            continue
        new_photo_path = city_images_dir+f'/{placekey_value}_{i}.jpg'
        shutil.copyfile(original_photo_path, new_photo_path)

In [None]:
from openai import OpenAI
import base64
import requests
import time
import pandas as pd
import csv
import os
from openai import AzureOpenAI
import openai
from PIL import Image


# Function to encode the image
def encode_image(image_path):
    """Return the file at ``image_path`` as a base64 string.

    Returns a ``(payload, error)`` pair: ``(base64_text, 0)`` on success and
    ``(1, 1)`` when the file cannot be opened as an image or cannot be read.
    """
    try:
        # Opening with PIL only checks that the file is a recognizable image;
        # the context manager releases the file handle right away.
        with Image.open(image_path):
            pass
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode('utf-8')
        return encoded_image,0

    except Exception as e:
        # If the image cannot be identified, report the file name and skip it.
        print(f"ERROR：{image_path}，{e}")
        return 1,1

def predict(prompt, base64_image, retry_count=100, max_retry_interval=60):
    """Ask the configured vision model to describe one image.

    Uses the module-level ``client`` and ``model_name``.

    Args:
        prompt: Text part of the user message.
        base64_image: Base64-encoded JPEG, sent as a data URL.
        retry_count: Maximum number of attempts.
        max_retry_interval: Upper bound, in seconds, for the wait between
            attempts. The wait doubles after each failure; without a cap it
            would grow far beyond any practical run time.

    Returns:
        ``(message_text, prompt_tokens, completion_tokens)``.

    Raises:
        RuntimeError: if every attempt fails. Raising here replaces the
            implicit ``None`` return, which made the caller fail later with an
            unrelated unpacking error.
    """
    start_time = time.time()
    retry_interval = 1
    last_error = None

    for attempt in range(retry_count):
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages= [
                    {"role": "user", "content": [{"type": "text", "text": prompt},
                                                 {"type": "image_url",
                                                  "image_url": {"url": f"data:image/jpeg;base64,{base64_image}",
                                                                "detail": "auto"}}
                                                 ]
                     }
                ],
                max_tokens= 1000,
                temperature= 0.8
            )

            msg = response.choices[0].message.content

            answer_token = response.usage.completion_tokens
            ask_token = response.usage.prompt_tokens
            end_time = time.time()
            total_run_time = round(end_time - start_time, 3)
            print('one_request_time: {} s'.format(total_run_time))
            return msg,ask_token,answer_token

        except Exception as e:
            last_error = e
            print("erroe：", e)
            if attempt == retry_count - 1:
                break  # no attempt left, so do not wait again
            print('re-ask....')

            # Exponential backoff, capped so the wait stays bounded.
            retry_interval = min(retry_interval * 2, max_retry_interval)
            time.sleep(retry_interval)

    raise RuntimeError(f"predict failed after {retry_count} attempts") from last_error

def _save_text_chunk(folder_path, chunk_index, rows):
    """Write one chunk of descriptions to ``textdata/text_result_<chunk_index>.csv``.

    Each element of ``rows`` is the list of descriptions of one POI and becomes
    one CSV row; a POI without images produces an empty row.
    """
    save_path = folder_path+f'textdata/text_result_{chunk_index}.csv'
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for chunk_row in rows:
            writer.writerow(chunk_row)


def main():
    """Describe every copied image of the current city and save the text in chunks.

    Reads the module-level ``city`` and ``chunk_size``. For each POI of the
    city's saved table, the images ``<placekey>_<i>.jpg`` are encoded and sent
    to ``predict``; an image that cannot be encoded is recorded as ``'error'``.
    The per-POI description lists are written to one CSV file per
    ``chunk_size`` POIs, and token usage and cost are printed at the end.
    """
    start_time_total = time.time()
    result_df = pd.read_csv(f'data/{city}/{city}_poi_with_yelp_review_image.csv')
    yelp_path = '/data/rawdata/Yelp'
    folder_path = yelp_path + f'/{city}_images/'
    prompt = """This image was taken in {name}, which is a {categories} place. Please give me a brief description for it , no more than 100 words, including as much information as possible from the picture."""
    print("total image num:",result_df.images_num.sum())

    text_result = []
    all_answer = 0
    all_ask = 0
    for index, row in result_df.iterrows():
        print(index,row['images_num'])
        placekey = row['placekey']
        row_name = row['name']
        row_categories = row['categories']
        all_res = []

        for i in range(row['images_num']):
            # get the base64 string of the image
            image_path = folder_path + placekey + '_' + str(i) + '.jpg'
            base64_image,error = encode_image(image_path)
            if error:
                all_res.append('error')
            else:
                res,ask_token,answer_token = predict(prompt.format(name=row_name, categories=row_categories), base64_image)
                all_res.append(res)
                all_answer += answer_token
                all_ask += ask_token
                time.sleep(0.3)

        text_result.append(all_res)

        # Flush a full chunk to disk and start collecting the next one.
        if (index + 1) % chunk_size == 0:
            _save_text_chunk(folder_path, index // chunk_size, text_result)
            text_result = []

    # Remaining POIs that did not fill a whole chunk.
    if len(text_result) > 0:
        _save_text_chunk(folder_path, index // chunk_size, text_result)

    end_time_total = time.time()
    total_run_time_total = round(end_time_total - start_time_total, 3)
    print('Total_run_time: {} s'.format(total_run_time_total))
    print('ask token:', all_ask)
    print('answer token:',all_answer)
    print('total cost(￥):',all_ask*4.13/1000000+all_answer*4.13/1000000)


if __name__ == "__main__":
    # ATTENTION: run configuration
    api_name = 'siliconflow' # selects the API provider and model
    city = 'New Orleans' # city to process
    chunk_size = 100 # number of POIs written to each output CSV

    # API keys are taken from the environment when set, so that real keys do not
    # have to be stored in the notebook; the placeholder is the fallback.
    if api_name == 'openai':
        API_KEY = os.environ.get('OPENAI_API_KEY', 'your key')
        proxy = {
            'http': 'http://localhost:7890',
            'https': 'http://localhost:7890'
        }
        # NOTE(review): presumably this module-level setting is not picked up by
        # an explicitly constructed client - confirm how the proxy should be passed.
        openai.proxy = proxy
        # The client appends the endpoint path itself, so the base URL is the
        # API root rather than the full chat-completions URL.
        BASE_URL = "https://api.openai.com/v1"
        model_name = "gpt-4-turbo"
        # predict() uses `client`; this branch previously never created one.
        client = OpenAI(
            api_key=API_KEY,
            base_url =BASE_URL
        )
    elif api_name == 'siliconflow':
        API_KEY = os.environ.get('SILICONFLOW_API_KEY', 'your key')
        BASE_URL= "https://api.siliconflow.cn/v1"
        model_name = "Qwen/Qwen2-VL-72B-Instruct"
        client = OpenAI(
            api_key=API_KEY,
            base_url =BASE_URL
        )

    elif api_name == 'azureopenai':
        API_KEY = os.environ.get('AZURE_OPENAI_API_KEY', "your key")
        ENDPOINT= "https://fanbingbing-reem-gpt4o.openai.azure.com/"
        model_name = 'gpt-4o-mini'
        client = AzureOpenAI(
            api_key=API_KEY,
            api_version="2024-07-01-preview",
            azure_endpoint= ENDPOINT # end point
        )

    else:
        # Fail early instead of reaching predict() without a client.
        raise ValueError(f"Unknown api_name: {api_name!r}")

    main()

### Organize text_result

In [28]:
import csv
import os

city = 'Tucson'
# Folder that holds the per-chunk CSV files `text_result_<n>.csv`. It must be a
# directory path: it is passed to os.listdir below. (The previous value ended in
# a file name containing an empty `{}` placeholder, which is a syntax error in
# an f-string.)
folder_path = f'/data/rawdata/Yelp/{city}_images/textdata/'
file_pattern = 'text_result_'
csv_files = [f for f in os.listdir(folder_path) if f.startswith(file_pattern) and f.endswith('.csv')]
# Order by the numeric chunk index so that text_result_10 comes after text_result_9.
csv_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))

# Concatenate the rows of all chunks in chunk order: one row per POI, one cell
# per image description.
all_text_results = []
for file_name in csv_files:
    file_path = os.path.join(folder_path, file_name)
    print(file_name)
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        text_result = list(reader)

    all_text_results.extend(text_result)
df_temp = pd.DataFrame(all_text_results)
df_temp.head()

text_result_0.csv
text_result_1.csv
text_result_2.csv
text_result_3.csv
text_result_4.csv
text_result_5.csv
text_result_6.csv
text_result_7.csv
text_result_8.csv
text_result_9.csv
text_result_10.csv
text_result_11.csv
text_result_12.csv
text_result_13.csv
text_result_14.csv
text_result_15.csv
text_result_16.csv
text_result_17.csv
text_result_18.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,87
0,The image captures a vibrant dining scene at E...,The image showcases a chicken taco salad from ...,This vibrant dish from El Pollo Loco features ...,The image features a burrito from El Pollo Loc...,The image depicts an El Pollo Loco restaurant ...,The image features a spread from El Pollo Loco...,"The image showcases a meal from El Pollo Loco,...",The image showcases a vibrant and appetizing s...,"The image depicts a meal from El Pollo Loco, a...",,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
