In [1]:
import os
import ast
import statistics
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import matplotlib.colors as mcolors

# Figure defaults for the whole notebook: high-resolution rendering/saving,
# a serif font, and one uniform text size for every figure element.
rcParams.update({
    'figure.dpi': 500,
    'savefig.dpi': 500,
    'font.family': 'Serif',
    **dict.fromkeys(
        (
            'font.size',
            'axes.labelsize',
            'axes.titlesize',
            'xtick.labelsize',
            'ytick.labelsize',
            'legend.fontsize',
            'figure.titlesize',
        ),
        18,
    ),
})

In [3]:
# Load the NAICS lookup sheet. Every column is cast to str so codes can be
# matched as text; note that astype(str) turns missing cells into the literal
# string 'nan'.
mapping_df = pd.read_excel('places_summary.xlsx').astype(str)

# Lookup tables keyed by the sheet's `naics_code` column. When a key occurs
# more than once, the last row wins.
naics3_to_category = mapping_df.set_index('naics_code')['Categories'].to_dict()
naics3_to_title = mapping_df.set_index('naics_code')['3-digit NAICS Title'].to_dict()
naics3_to_naics2 = mapping_df.set_index('naics_code')['2-digit NAICS Code'].to_dict()

# 2-digit code -> 2-digit title.
naics2_to_title = mapping_df.set_index('2-digit NAICS Code')['2-digit NAICS Title'].to_dict()
mapping_df

Unnamed: 0,naics_code,2-digit NAICS Code,2-digit NAICS Title,3-digit NAICS Code,3-digit NAICS Title,Categories
0,722,72,Accommodation and Food Services,722,Food Services and Drinking Places,Restaurant
1,713,71,"Arts, Entertainment, and Recreation",713,"Amusement, Gambling, and Recreation Industries",Recreation
2,445,44,Retail Trade,445,Food and Beverage Stores,Retail Trade
3,812,81,Other Services (except Public Administration),812,Personal and Laundry Services,Personal Service
4,452,45,Retail Trade,452,General Merchandise Stores,Retail Trade
...,...,...,...,...,...,...
90,313,31,Manufacturing,313,Textile Mills,
91,425,42,Wholesale Trade,425,Wholesale Electronic Markets and Agents and Br...,Retail Trade
92,521,52,Finance and Insurance,521,Monetary Authorities-Central Bank,Personal Service
93,316,31,Manufacturing,316,Leather and Allied Product Manufacturing,


In [4]:
# NOTE(review): unpickling can execute arbitrary code; only load 'g_poisf.pkl'
# when it comes from a trusted source.
df = pd.read_pickle('g_poisf.pkl')

# Drop POIs without a NAICS code. The explicit .copy() makes the filtered frame
# independent of the loaded one, so the column assignments below cannot trigger
# SettingWithCopyWarning.
df = df.dropna(subset=['naics_code']).copy()
print(len(df))

# NAICS code as text (float -> int -> str drops the trailing '.0'), plus its
# first two characters.
df['3-digit-naics'] = df['naics_code'].astype(int).astype(str)
df['2-digit-naics'] = df['3-digit-naics'].str[:2]

# Translate codes through the lookup dicts; codes missing from a dict map to None.
df['2-naics-title'] = df['2-digit-naics'].map(naics2_to_title.get)
df['3-naics-title'] = df['3-digit-naics'].map(naics3_to_title.get)
df['3-naics-category'] = df['3-digit-naics'].map(naics3_to_category.get)

# First five characters of BGFIPS.
# NOTE(review): assumes BGFIPS is a zero-padded string whose first five
# characters are state + county — TODO confirm.
df['county-FIPS'] = df['BGFIPS'].astype(str).str[:5]

# Per-POI score: review count multiplied by the average sentiment.
df['poi_accessible_score'] = (
    df['total_accessible_reviews'] * df['avg_accessible_sentiment'].astype(float)
)
df.head(5)

279734


Unnamed: 0,gmap_id,total_accessible_reviews,avg_accessible_sentiment,avg_accessible_rating,name,address,description,latitude,longitude,category,...,naics_code,geometry,BGFIPS,3-digit-naics,2-digit-naics,2-naics-title,3-naics-title,3-naics-category,county-FIPS,poi_accessible_score
0,0x8891e8b900b862b5:0x1b6ec8a4f0e7e161,1,1.0,5.0,Point A Park RV Park and Campground,"Point A Park RV Park and Campground, 25882 Sai...",,31.376554,-86.511742,[RV park],...,721.0,POINT (897737.26 967210.788),10399616001,721,72,Accommodation and Food Services,Accommodation,Hotel,1039,1.0
1,0x889a57f09c39d6bd:0xa54c31c6404cee0f,1,-1.0,3.0,Aztecas Restaurant & Cantina,"Aztecas Restaurant & Cantina, 310 Industrial P...",,30.797465,-88.081955,[Family restaurant],...,722.0,POINT (754909.11 889414.803),10970054002,722,72,Accommodation and Food Services,Food Services and Drinking Places,Restaurant,1097,-1.0
2,0x8862133b4a02f6e5:0x2eeeb24d15f287b6,1,1.0,5.0,Publix Super Market at Piedmont Point Shopping...,Publix Super Market at Piedmont Point Shopping...,Supermarket chain with a wide selection of gro...,34.692772,-86.565992,[Grocery store],...,445.0,POINT (855944.014 1334951.877),10890019031,445,44,Retail Trade,Food and Beverage Stores,Retail Trade,1089,1.0
3,0x88626b4e3a74daf1:0xdffbb64475d20dc7,1,-1.0,3.0,City of Huntsville Dogspot,"City of Huntsville Dogspot, 200 Cleveland Ave ...",,34.737694,-86.587868,"[Dog park, Park]",...,713.0,POINT (853468.945 1339755.026),10890031002,713,71,"Arts, Entertainment, and Recreation","Amusement, Gambling, and Recreation Industries",Recreation,1089,-1.0
4,0x8892930728069f7f:0x9a2a7cc8e42c63a8,1,1.0,5.0,Ramsey Park,"Ramsey Park, 520 Virginia Dr, Dothan, AL 36301",,31.219578,-85.407373,[City park],...,713.0,POINT (1003760.244 960889.281),10690405001,713,71,"Arts, Entertainment, and Recreation","Amusement, Gambling, and Recreation Industries",Recreation,1069,1.0


In [5]:
# df1: POIs whose category is known (neither missing nor the literal string 'nan').
df1 = df.loc[
    lambda frame: frame['3-naics-category'].notna() & frame['3-naics-category'].ne('nan')
]
# df2: the subset of df1 with at least 5 accessibility reviews.
df2 = df1.loc[df1['total_accessible_reviews'].ge(5)]
print(len(df2))

10838


In [9]:
categories = [
    'Retail Trade',
    'Recreation',
    'Hotel',
    'Personal Service',
    'Restaurant',
    'Health Care',
]

# Keep only POIs in the listed categories and export their coordinates
# together with the category label.
df3 = df2.loc[df2['3-naics-category'].isin(categories)]
df3.to_csv(
    'poi_20_locations.csv',
    columns=['latitude', 'longitude', '3-naics-category'],
    index=False,
)

In [None]:
def poi_distribution(df1, df2, show_legend=False, figsize=(6, 7)):
    """Plot side-by-side horizontal bars of POI counts per category.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with a '3-naics-category' column. ``df1`` is drawn in blue and
        labelled 'All POIs'; ``df2`` is drawn in red and labelled
        'POIs (>=5 reviews)'.
    show_legend : bool, default False
        Draw the legend for the two bar series. Off by default, so the figure
        is unchanged for existing callers.
    figsize : tuple, default (6, 7)
        Figure size in inches.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=figsize)

    counts_all = df1['3-naics-category'].value_counts()
    counts_filtered = df2['3-naics-category'].value_counts()

    # Union of the categories in both frames, ordered by the df2 count
    # (descending). The alphabetical pre-sort makes tie order deterministic.
    all_categories = sorted(set(counts_all.index) | set(counts_filtered.index))
    sorted_categories = sorted(
        all_categories,
        key=lambda cat: counts_filtered.get(cat, 0),
        reverse=True,
    )

    width = 0.35
    positions = range(len(sorted_categories))
    series = (
        (counts_all, -width / 2, 'blue', 'All POIs'),
        (counts_filtered, width / 2, 'red', 'POIs (>=5 reviews)'),
    )
    for counts, offset, color, label in series:
        values = [counts.get(cat, 0) for cat in sorted_categories]
        bar_centers = [pos + offset for pos in positions]
        ax.barh(bar_centers, values, height=width, color=color, alpha=0.6, label=label)
        # Print each count at the tip of its bar.
        for y, value in zip(bar_centers, values):
            ax.text(value, y, str(value), ha='left', va='center', fontsize=14)

    ax.set_yticks(list(positions))
    ax.set_yticklabels(sorted_categories)
    ax.set_xlabel('POI count')
    ax.set_ylabel('Categories')
    if show_legend:
        ax.legend()
    fig.tight_layout()
    plt.show()

poi_distribution(df1, df2)

In [None]:
def group_data(df, attribute):
    """Aggregate POI-level accessibility data per (attribute, category) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``attribute``, '3-naics-category',
        'poi_accessible_score' and 'total_accessible_reviews'.
    attribute : str
        Name of the column to group by together with '3-naics-category'.

    Returns
    -------
    pandas.DataFrame
        One row per group with at least 5 reviews in total, with columns:
        ``attribute``, '3-naics-category', 'poi_accessible_score' (mean POI
        score, rounded to 5 decimals), 'total_accessible_reviews' (sum),
        'total_accessible_pois' (count of POIs) and 'avg_accessible_sentiment'
        (sum of POI scores divided by the sum of reviews).
    """
    df_grouped = (
        df.groupby([attribute, '3-naics-category'])
        .agg(
            poi_accessible_score=('poi_accessible_score', 'mean'),
            _score_sum=('poi_accessible_score', 'sum'),
            total_accessible_reviews=('total_accessible_reviews', 'sum'),
            total_accessible_pois=('total_accessible_reviews', 'count'),
        )
        .reset_index()
    )

    # Filter first so the division below never sees a zero-review group, and
    # copy so the assignments act on an independent frame.
    df_grouped = df_grouped[df_grouped['total_accessible_reviews'] >= 5].copy()

    df_grouped['poi_accessible_score'] = (
        df_grouped['poi_accessible_score'].astype(float).round(5)
    )
    # Group-level average sentiment is the total score over the total number of
    # reviews. Dividing the *mean* POI score by the review total would shrink
    # the value by the number of POIs in the group.
    df_grouped['avg_accessible_sentiment'] = (
        df_grouped['_score_sum'].astype(float) / df_grouped['total_accessible_reviews']
    )
    return df_grouped.drop(columns='_score_sum')

# Aggregate the POI table twice: once keyed by 'BGFIPS', once by 'county-FIPS'.
cbg_df, county_df = (
    group_data(df, key) for key in ('BGFIPS', 'county-FIPS')
)

In [None]:
categories = ['Retail Trade', 'Recreation', 'Hotel', 'Personal Service',  'Restaurant', 'Health Care']

def sentiment_distribution(df, categories, n_cols=3, y_max=600):
    """Plot one histogram of 'avg_accessible_sentiment' per category.

    Each panel shows the histogram with a KDE overlay, dashed red lines at the
    first and third quartiles and a solid blue line at the mean, each with a
    text label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '3-naics-category' and 'avg_accessible_sentiment'.
    categories : sequence of str
        Categories to plot, one panel each, filled row by row.
    n_cols : int, default 3
        Number of panels per row; the number of rows is derived from
        ``len(categories)``.
    y_max : float, default 600
        Upper limit of every y-axis. Label heights scale with it.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Ceiling division; always at least one row so plt.subplots gets a valid grid.
    n_rows = max(1, -(-len(categories) // n_cols))
    fig, axs = plt.subplots(
        ncols=n_cols, nrows=n_rows,
        figsize=(4 * n_cols, 3.5 * n_rows),
        squeeze=False,
    )
    panels = list(axs.flat)

    # Label heights as fractions of the y-range (520 and 420 when y_max is 600).
    quartile_label_y = y_max * 520 / 600
    mean_label_y = y_max * 420 / 600

    for ax, category in zip(panels, categories):
        subset = df[df['3-naics-category'] == category]
        values = subset['avg_accessible_sentiment']

        ax.set_xlabel('weighted sentiment')
        ax.set_ylabel('POI count')
        ax.set_ylim(0, y_max)
        ax.set_title(f'{category}')

        # With no data there is nothing to draw and the statistics are undefined.
        if values.dropna().empty:
            continue

        sns.histplot(data=subset, x='avg_accessible_sentiment',
                     kde=True, color='blue', ax=ax, alpha=0.2)
        # histplot sets its own axis labels and limits; re-apply ours.
        ax.set_xlabel('weighted sentiment')
        ax.set_ylabel('POI count')
        ax.set_ylim(0, y_max)

        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        mean = values.mean()

        ax.axvline(x=Q1, color='red', linestyle='--')
        ax.axvline(x=Q3, color='red', linestyle='--')
        ax.axvline(x=mean, color='blue', linestyle='-')
        ax.text(Q1 - 0.7, quartile_label_y, f'Q1: {Q1:.2f}', color='red', fontsize=14)
        # The blue line is the mean, not the median, so it is not labelled "Q2".
        ax.text(mean, mean_label_y, f'Mean: {mean:.2f}', color='blue', fontsize=14, ha='center')
        ax.text(Q3 + 0.06, quartile_label_y, f'Q3: {Q3:.2f}', color='red', fontsize=14)

    # Hide panels that have no category assigned.
    for ax in panels[len(categories):]:
        ax.set_visible(False)

    fig.tight_layout()
    plt.show()

In [None]:
# Per-category sentiment histograms for df2 (POIs with at least 5 accessibility reviews).
sentiment_distribution(df=df2, categories=categories)