In [1]:
import pandas as pd
import re
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Load the Yelp business table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle('data//Yelp//business.pkl')

In [3]:
# Keep only the first occurrence of each column label; later repeats are dropped.
is_repeated_label = df.columns.duplicated()
df = df.loc[:, ~is_repeated_label]

In [4]:
# Broad, case-sensitive substring match on "Bar"/"bar"; rows whose categories
# value is missing are treated as non-matching (na=False) and are dropped.
mentions_bar = df['categories'].str.contains('Bar|bar', na=False)
df_bar = df[mentions_bar]

In [5]:
# Summarize the bar candidates: row count, columns, non-null counts and dtypes.
df_bar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17592 entries, 8 to 150327
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   17592 non-null  object
 1   name          17592 non-null  object
 2   address       17592 non-null  object
 3   city          17592 non-null  object
 4   state         17592 non-null  object
 5   postal_code   17592 non-null  object
 6   latitude      17592 non-null  object
 7   longitude     17592 non-null  object
 8   stars         17592 non-null  object
 9   review_count  17592 non-null  object
 10  is_open       17592 non-null  object
 11  attributes    17414 non-null  object
 12  categories    17592 non-null  object
 13  hours         15813 non-null  object
dtypes: object(14)
memory usage: 2.0+ MB


In [6]:
# Substrings that mark a categories string as a false positive of the broad
# "bar" match (e.g. barbers, eyebrow services). Matching is case-insensitive
# and is a plain substring search, not a whole-word match.
keywords = [
    'barbers', 'eyebrow', 'barrister', 'barbecue', 'furniture', 'Supplies',
    'Military', 'barn', 'hyperbaric', 'bartending', 'Dentists', 'Barre',
    'Bartenders', 'Food Trucks', 'Day Spas', 'Nail Salons', 'Specialty Schools',
    'Nail Technicians', 'IV Hydration', 'Animal', 'pet', 'Doctors', 'Hair',
    'adult education', 'Smoothies',
]

# Escape every keyword so it is always treated as literal text inside the
# alternation; none of the current keywords contains a regex metacharacter,
# so the set of matched rows is unchanged.
pattern = re.compile('|'.join(map(re.escape, keywords)), flags=re.IGNORECASE)

# Drop every row whose categories string contains any of the keywords.
df_bar = df_bar[~df_bar['categories'].str.contains(pattern)]

# Inspect categories
# Split the strings in the 'categories' column by comma (',') and flatten the list.
# Each piece is stripped so that surrounding whitespace left over from the split
# does not make the same category count as two different values.
categories_list = df_bar['categories'].str.split(',').explode().str.strip()

# Count the occurrences of each unique string
category_counts = categories_list.value_counts()

# Save the unique strings and their counts to a text file
category_counts.to_csv('category_counts.txt', header=True, sep='\t')

In [7]:
# Remove every entry tagged as barbeque unless it is also tagged as a bar.
keyword_bbq = r'\bBarbeque\b'
keyword_bars = r'\bBars\b'

# Whole-word, case-insensitive matches against the categories string.
has_bbq = df_bar['categories'].str.contains(keyword_bbq, case=False, regex=True)
has_bars = df_bar['categories'].str.contains(keyword_bars, case=False, regex=True)

# De Morgan: not (bbq and not bars) is the same as (not bbq) or bars.
filtered_df = df_bar[~has_bbq | has_bars]

In [8]:
# Summarize the final filtered frame: row count, columns, non-null counts and dtypes.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12546 entries, 8 to 150323
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   12546 non-null  object
 1   name          12546 non-null  object
 2   address       12546 non-null  object
 3   city          12546 non-null  object
 4   state         12546 non-null  object
 5   postal_code   12546 non-null  object
 6   latitude      12546 non-null  object
 7   longitude     12546 non-null  object
 8   stars         12546 non-null  object
 9   review_count  12546 non-null  object
 10  is_open       12546 non-null  object
 11  attributes    12475 non-null  object
 12  categories    12546 non-null  object
 13  hours         11251 non-null  object
dtypes: object(14)
memory usage: 1.4+ MB


In [9]:
# Output path of the Parquet file
parquet_file_path = 'Business_Filtered_Yelp.parquet'

# Convert the DataFrame into a PyArrow table and save it as a Parquet file
table = pa.Table.from_pandas(filtered_df)
pq.write_table(table, where=parquet_file_path)

# Also export the same rows as JSON Lines (one record per line)
filtered_df.to_json(
    'Business_Filtered_Yelp1.json',
    orient='records',
    lines=True,
)