In [1]:
import os
import ast
import datetime
import pandas as pd
import statistics

# Lookup table: Google category name -> 3-digit NAICS code, loaded from the
# annotation CSV. When a category appears more than once, the last row wins.
nacis = pd.read_csv('google_categories_annotation.csv')
naics_dict = {
    google_category: naics_code
    for google_category, naics_code in zip(nacis['category'], nacis['3-digit NAICS Code'])
}

def get_place_metrics(df, gmap_id):
    """Summarize the reviews that belong to a single place.

    Selects the rows of ``df`` whose ``gmap_id`` column equals ``gmap_id`` and
    returns ``[gmap_id, number of rows, mean of 'sentiment_value',
    mean of 'rating']``.
    """
    is_this_place = df['gmap_id'] == gmap_id
    place_reviews = df.loc[is_this_place]
    review_count = len(place_reviews)
    mean_sentiment = place_reviews['sentiment_value'].mean()
    mean_rating = place_reviews['rating'].mean()
    return [gmap_id, review_count, mean_sentiment, mean_rating]

def process_state(df):
    """Aggregate one state's reviews into per-place metrics.

    Each review's ``sentiment`` label is scored as 1 for ``'positive'``, 0 for
    ``'neutral'`` and -1 for anything else; the reviews are then grouped by
    ``gmap_id``.

    Args:
        df: Review-level DataFrame with ``gmap_id``, ``sentiment`` and
            ``rating`` columns. It is read only; no column is added to it.

    Returns:
        DataFrame with one row per distinct ``gmap_id``, sorted by ``gmap_id``
        (rows whose ``gmap_id`` is missing are left out), with the columns
        ``gmap_id``, ``total_accessible_reviews`` (row count),
        ``avg_accessible_sentiment`` and ``avg_accessible_rating`` (means).
        An empty input yields an empty frame with those columns.
    """
    columns = ['gmap_id', 'total_accessible_reviews', 'avg_accessible_sentiment', 'avg_accessible_rating']
    if df.empty:
        return pd.DataFrame(columns=columns)

    sentiment_value = df['sentiment'].apply(
        lambda label: 1 if label == 'positive' else (0 if label == 'neutral' else -1)
    )
    # Work on a narrow derived frame so the caller's DataFrame (often a
    # filtered slice) is never written to.
    scored = df[['gmap_id', 'rating']].assign(sentiment_value=sentiment_value)

    # One groupby pass replaces re-filtering the whole frame once per place,
    # and sorted group keys give a reproducible row order (iterating a set of
    # ids did not).
    by_place = scored.groupby('gmap_id', sort=True)
    metrics_df = pd.DataFrame({
        'total_accessible_reviews': by_place.size(),
        'avg_accessible_sentiment': by_place['sentiment_value'].mean(),
        'avg_accessible_rating': by_place['rating'].mean(),
    })
    metrics_df = metrics_df.rename_axis('gmap_id').reset_index()
    return metrics_df[columns]


def map_category(category):
    """Map a place's Google categories to a single 3-digit NAICS code.

    Category names are looked up in the module-level ``naics_dict`` after
    apostrophes are stripped from them. Names that are not in the table,
    entries that are not strings, and table values that cannot be converted
    to ``int`` are skipped.

    Args:
        category: An iterable of category names, or a string holding the
            Python literal of one (e.g. ``"['Restaurant', 'Bar']"``).

    Returns:
        The most frequent matched NAICS code as an ``int``. When several codes
        tie for most frequent, the code of the first matched category is
        returned. ``None`` when the input is missing, cannot be parsed, or
        none of its categories is in the table.
    """
    parsed = category
    if isinstance(category, str):
        # Categories may arrive serialized as the text of a list literal.
        try:
            parsed = ast.literal_eval(category)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return None

    try:
        names = list(parsed)
    except TypeError:
        # None / NaN (e.g. a place without metadata) or another non-iterable.
        return None

    naics_codes = []
    for name in names:
        if not isinstance(name, str):
            continue
        key = name.replace("'", "")
        if key not in naics_dict:
            continue
        try:
            naics_codes.append(int(naics_dict[key]))
        except (TypeError, ValueError, OverflowError):
            # Unusable table value (e.g. NaN or non-numeric text).
            continue

    if not naics_codes:
        return None

    # Return the mode only when it is unique; the previous check returned the
    # first code even when another code clearly occurred more often.
    modes = statistics.multimode(naics_codes)
    if len(modes) == 1:
        return modes[0]
    return naics_codes[0]

In [2]:
# Build per-place metrics for every state: read the review predictions and the
# place metadata, aggregate reviews per place, attach a NAICS code, and write
# one JSON-lines result file per state.
sentiment_folder = "accessible-review-merged-result"
sentiment_filenames = sorted(os.listdir(sentiment_folder))

meta_folder = "accessible-meta"
meta_filenames = sorted(os.listdir(meta_folder))

save_folder = "accessible-poi-metrics"
# exist_ok replaces the separate existence check and keeps the cell re-runnable.
os.makedirs(save_folder, exist_ok=True)

# NOTE(review): review and metadata files are paired purely by sorted position.
# An extra or missing entry in either folder would silently pair the wrong
# files (and zip stops at the shorter list) -- confirm both folders hold
# exactly matching sets of files.
for sentiment_file, meta_file in zip(sentiment_filenames, meta_filenames):
    sentiment_filepath = os.path.join(sentiment_folder, sentiment_file)
    meta_filepath = os.path.join(meta_folder, meta_file)
    print(f"----- processing: {sentiment_filepath}-{meta_filepath} -----")

    # Read poi-review sentiment file
    df = pd.read_json(sentiment_filepath, lines=True)
    # Stringify list/dict cells before de-duplicating (presumably because
    # drop_duplicates needs hashable values).
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].apply(lambda x: str(x) if isinstance(x, (list, dict)) else x)
    df = df.drop_duplicates()
    df = df.rename(columns={'label': 'sentiment'})
    # .copy() hands process_state an independent frame instead of a slice.
    df = df[df['sentiment'] != 'unrelated'].copy()
    print(len(df))
    metrics_df = process_state(df)

    # Read poi-meta file
    meta_df = pd.read_json(meta_filepath, lines=True)
    meta_df = meta_df.drop_duplicates(subset='gmap_id', keep='first')
    merge_df = pd.merge(metrics_df, meta_df, on='gmap_id', how='left')
    merge_df['naics_code'] = merge_df['category'].apply(map_category)
    # Places without a NAICS code are kept; uncomment to drop them instead.
    #merge_df = merge_df.dropna(subset=['naics_code'])

    # Derive the output name from the bare file name: splitting the joined
    # path on '/' fails where os.path.join uses a different separator.
    save_file = sentiment_file.split('.')[0] + "-results.jsonl"
    save_path = os.path.join(save_folder, save_file)
    merge_df.to_json(save_path, orient='records', lines=True)

----- processing: accessible-review-merged-result/review-Alabama-prediction.jsonl-accessible-meta/filtered_meta-Alabama.jsonl -----
5929
----- processing: accessible-review-merged-result/review-Alaska-prediction.jsonl-accessible-meta/filtered_meta-Alaska.jsonl -----
833
----- processing: accessible-review-merged-result/review-Arizona-prediction.jsonl-accessible-meta/filtered_meta-Arizona.jsonl -----
14444
----- processing: accessible-review-merged-result/review-Arkansas-prediction.jsonl-accessible-meta/filtered_meta-Arkansas.jsonl -----
3830
----- processing: accessible-review-merged-result/review-California-prediction.jsonl-accessible-meta/filtered_meta-California.jsonl -----
45812
----- processing: accessible-review-merged-result/review-Colorado-prediction.jsonl-accessible-meta/filtered_meta-Colorado.jsonl -----
12409
----- processing: accessible-review-merged-result/review-Connecticut-prediction.jsonl-accessible-meta/filtered_meta-Connecticut.jsonl -----
3487
----- processing: acces