In [1]:
import pandas as pd
import sys
import json
import os

# Make the parent of the working directory importable (presumably where the
# `utils` module imported below lives -- TODO confirm).
# The membership check keeps re-runs of this cell from stacking duplicate
# entries onto sys.path.
current_dir = os.getcwd()
src_path = os.path.join(current_dir, '..')
if src_path not in sys.path:
    sys.path.append(src_path)

from utils import *

### 1. Dataset-GSS Distribution

In [2]:
# Load the Dataset-GSS metadata, keep only images that exist on disk, and
# attach the LLM scene annotations (environment / scene_type / setting).
image_directory = 'dataset1/'

# Image ids are the file names with only the trailing extension removed.
# (str.replace('.jpeg', '') would also delete '.jpeg' occurring mid-name.)
files_in_directory = {
    os.path.splitext(f)[0]
    for f in os.listdir(image_directory)
    if f.endswith('.jpeg')
}

df1 = pd.read_csv('dataset1_info.csv')
filtered_df1 = df1[df1['image_id'].isin(files_in_directory)].copy()
print(len(filtered_df1))

llm_df1 = process_jsonl_to_dataframe('dataset1_gemini-2.5-pro.jsonl')
# Anchored, escaped pattern with an explicit regex flag: the default of
# `regex` differs between pandas 1.x and 2.x, and an unescaped '.' would match
# any character when the pattern is interpreted as a regular expression.
llm_df1['image_id'] = llm_df1['image_file'].str.replace(r'\.jpeg$', '', regex=True)
llm_df1 = llm_df1[['image_id', 'environment', 'scene_type', 'setting']]

final_df1 = pd.merge(filtered_df1, llm_df1, on='image_id', how='inner')

# The inner merge silently drops images that have no annotation row, which
# makes downstream counts smaller than len(filtered_df1). Surface that here.
n_missing_llm1 = int((~filtered_df1['image_id'].isin(llm_df1['image_id'])).sum())
if n_missing_llm1:
    print(f"Warning: {n_missing_llm1} image(s) have no LLM annotation and were dropped by the inner merge.")

final_df1.columns

6152
Reading data from 'dataset1_gemini-2.5-pro.jsonl'...


Index(['image_id', 'source', 'orig_id', 'url', 'label_method', 'city',
       'city_id', 'country', 'continent', 'latitude', 'longitude',
       'datetime_local', 'sequence_index', 'sequence_id', 'split', 'img_path',
       'glare', 'lighting_condition', 'pano_status', 'platform', 'quality',
       'reflection', 'view_direction', 'weather', 'environment', 'scene_type',
       'setting'],
      dtype='object')

In [3]:
# Per-feature value distribution of the capture-condition metadata columns,
# stacked into one long table (feature, value, count).
columns_to_summarize = ['lighting_condition', 'reflection', 'view_direction', 'weather']

all_summaries = []
for col in columns_to_summarize:
    # Columns absent from the frame are skipped rather than raising.
    if col not in filtered_df1.columns:
        continue
    # value_counts() orders by descending frequency and excludes NaN by default.
    summary = (
        filtered_df1[col]
        .value_counts()
        .rename_axis('value')
        .reset_index(name='count')
        .assign(feature=col)
    )
    all_summaries.append(summary)

final_summary_df = pd.concat(all_summaries, ignore_index=True)[['feature', 'value', 'count']]

print(final_summary_df)

               feature       value  count
0   lighting_condition         day   5601
1   lighting_condition   dusk/dawn    319
2   lighting_condition       night    232
3           reflection          no   6152
4       view_direction  front/back   5451
5       view_direction        side    701
6              weather       clear   4638
7              weather      cloudy   1189
8              weather       snowy    175
9              weather       rainy    147
10             weather       foggy      3


In [4]:
# Per-feature value distribution of the LLM-assigned scene labels for
# dataset 1, stacked into one long table (feature, value, count).
columns_to_summarize = ['environment', 'scene_type', 'setting']

all_summaries = []
for col in columns_to_summarize:
    # Columns absent from the merged frame are skipped rather than raising.
    if col not in final_df1.columns:
        continue
    # value_counts() orders by descending frequency and excludes NaN by default.
    summary = (
        final_df1[col]
        .value_counts()
        .rename_axis('value')
        .reset_index(name='count')
        .assign(feature=col)
    )
    all_summaries.append(summary)

final_summary_df = pd.concat(all_summaries, ignore_index=True)[['feature', 'value', 'count']]

print(final_summary_df)

        feature     value  count
0   environment   outdoor   6144
1   environment     mixed      5
2   environment    indoor      2
3    scene_type    street   4636
4    scene_type     mixed   1072
5    scene_type   scenery    414
6    scene_type  building     29
7       setting     urban   4350
8       setting  suburban    998
9       setting     rural    755
10      setting   natural     48


### 2. Dataset-UPC Distribution

In [5]:
# Load the Dataset-UPC metadata, keep only images that exist on disk, and
# attach the LLM scene annotations (environment / scene_type / setting).
image_directory = 'dataset2/'

# Image ids are the file names with only the trailing extension removed.
# (str.replace('.jpg', '') would also delete '.jpg' occurring mid-name.)
files_in_directory = {
    os.path.splitext(f)[0]
    for f in os.listdir(image_directory)
    if f.endswith('.jpg')
}

df2 = pd.read_csv('dataset2_info.csv')
filtered_df2 = df2[df2['image_id'].isin(files_in_directory)].copy()
print(len(filtered_df2))

llm_df2 = process_jsonl_to_dataframe('dataset2_gemini-2.5-pro.jsonl')
# Anchored, escaped pattern with an explicit regex flag: the default of
# `regex` differs between pandas 1.x and 2.x, and an unescaped '.' would match
# any character when the pattern is interpreted as a regular expression.
llm_df2['image_id'] = llm_df2['image_file'].str.replace(r'\.jpg$', '', regex=True)
llm_df2 = llm_df2[['image_id', 'environment', 'scene_type', 'setting']]

final_df2 = pd.merge(filtered_df2, llm_df2, on='image_id', how='inner')

# The inner merge silently drops images that have no annotation row, which
# makes downstream counts smaller than len(filtered_df2). Surface that here.
n_missing_llm2 = int((~filtered_df2['image_id'].isin(llm_df2['image_id'])).sum())
if n_missing_llm2:
    print(f"Warning: {n_missing_llm2} image(s) have no LLM annotation and were dropped by the inner merge.")

final_df2.columns

2928
Reading data from 'dataset2_gemini-2.5-pro.jsonl'...


Index(['image_id', 'name', 'address', 'gmap_id', 'description', 'latitude',
       'longitude', 'category', 'avg_rating', 'num_of_reviews', 'price',
       'hours', 'MISC', 'relative_results', 'url', 'category_list', 'street',
       'city', 'state', 'environment', 'scene_type', 'setting'],
      dtype='object')

In [6]:
# Per-feature value distribution of the LLM-assigned scene labels for
# dataset 2, stacked into one long table (feature, value, count).
columns_to_summarize = ['environment', 'scene_type', 'setting']

all_summaries = []
for col in columns_to_summarize:
    # Columns absent from the merged frame are skipped rather than raising.
    if col not in final_df2.columns:
        continue
    # value_counts() orders by descending frequency and excludes NaN by default.
    summary = (
        final_df2[col]
        .value_counts()
        .rename_axis('value')
        .reset_index(name='count')
        .assign(feature=col)
    )
    all_summaries.append(summary)

final_summary_df = pd.concat(all_summaries, ignore_index=True)[['feature', 'value', 'count']]

print(final_summary_df)

        feature     value  count
0   environment   outdoor   2400
1   environment    indoor    522
2   environment     mixed      5
3    scene_type  building   1719
4    scene_type     mixed    685
5    scene_type   scenery    403
6    scene_type    street    120
7       setting  suburban   1675
8       setting     urban    595
9       setting     rural    373
10      setting   natural    283
11      setting     mixed      1


### 3. Dataset-PCW Distribution

In [7]:
# Load the Dataset-PCW metadata, keep only images that exist on disk, and
# attach the LLM scene annotations (environment / scene_type / setting).
image_directory = 'dataset3/'

# Image ids are the file names with only the trailing extension removed.
# (str.replace('.jpg', '') would also delete '.jpg' occurring mid-name.)
files_in_directory = {
    os.path.splitext(f)[0]
    for f in os.listdir(image_directory)
    if f.endswith('.jpg')
}

df3 = pd.read_csv('dataset3_info.csv')
filtered_df3 = df3[df3['image_id'].isin(files_in_directory)].copy()
print(len(filtered_df3))

llm_df3 = process_jsonl_to_dataframe('dataset3_gemini-2.5-pro.jsonl')
# Anchored, escaped pattern with an explicit regex flag: the default of
# `regex` differs between pandas 1.x and 2.x, and an unescaped '.' would match
# any character when the pattern is interpreted as a regular expression.
llm_df3['image_id'] = llm_df3['image_file'].str.replace(r'\.jpg$', '', regex=True)
llm_df3 = llm_df3[['image_id', 'environment', 'scene_type', 'setting']]

final_df3 = pd.merge(filtered_df3, llm_df3, on='image_id', how='inner')

# The inner merge silently drops images that have no annotation row, which
# makes downstream counts smaller than len(filtered_df3). Surface that here.
n_missing_llm3 = int((~filtered_df3['image_id'].isin(llm_df3['image_id'])).sum())
if n_missing_llm3:
    print(f"Warning: {n_missing_llm3} image(s) have no LLM annotation and were dropped by the inner merge.")

final_df3.columns

270
Reading data from 'dataset3_gemini-2.5-pro.jsonl'...


Index(['image_id', 'address', 'latitude', 'longitude', 'street', 'city',
       'state', 'country', 'environment', 'scene_type', 'setting'],
      dtype='object')

In [8]:
# Per-feature value distribution of the LLM-assigned scene labels for
# dataset 3, stacked into one long table (feature, value, count).
columns_to_summarize = ['environment', 'scene_type', 'setting']

all_summaries = []
for col in columns_to_summarize:
    # Columns absent from the merged frame are skipped rather than raising.
    if col not in final_df3.columns:
        continue
    # value_counts() orders by descending frequency and excludes NaN by default.
    summary = (
        final_df3[col]
        .value_counts()
        .rename_axis('value')
        .reset_index(name='count')
        .assign(feature=col)
    )
    all_summaries.append(summary)

final_summary_df = pd.concat(all_summaries, ignore_index=True)[['feature', 'value', 'count']]

print(final_summary_df)

        feature     value  count
0   environment   outdoor    264
1   environment    indoor      4
2   environment     mixed      2
3    scene_type     mixed    141
4    scene_type   scenery     56
5    scene_type  building     40
6    scene_type    street     33
7       setting     urban    149
8       setting  suburban     81
9       setting   natural     31
10      setting     rural      8
11      setting     mixed      1
