In [104]:
import pandas as pd
import os
import numpy as np
import ast

from tpot import TPOTClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

import datetime

import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

In [84]:
folder_path = '../data/google_maps_data/'
all_dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in [f for f in os.listdir(folder_path) if f.endswith('.csv')]]

In [85]:
df_raw = pd.concat(all_dfs, ignore_index=True)

In [86]:
df_raw.shape

(1137, 43)

In [95]:
df = df_raw.copy()

In [96]:
### Change to boolean ###
df['has_reservations'] = str_list_to_bool(df['reservations'])
df['can_order_online'] = str_list_to_bool(df['order_online_links'])
df['has_menu'] = str_list_to_bool(df['menu'])
df['has_website'] = [pd.isna(i) for i in df.website]
df['is_spending_on_ads'] = [pd.isna(i) for i in df.is_spending_on_ads]
df['has_reservation'] = [True if ast.literal_eval(i) else False for i in df['reservations']]

In [102]:
### Categories ###
# can be categories or main_category
"""
df['categories'] = [ast.literal_eval(i) if not pd.isna(i) else i for i in df['categories']]
tmp = df['categories'].explode()
binary_df = pd.get_dummies(tmp).groupby(level=0).any()
binary_df.columns = ['Category - '+str(i) for i in binary_df.columns]
"""
tmp = df['main_category'].explode()
binary_df = pd.get_dummies(tmp).groupby(level=0).any()
binary_df.columns = ['Category - '+str(i) for i in binary_df.columns]

df = pd.concat([df, binary_df], axis=1)

In [None]:
### Closed on ###
tmp = []
for i in df['closed_on']:
    if i=='Open All Days':
        tmp.append(["lundi","mardi","mercredi","jeudi","vendredi","samedi","dimanche"])
    else:
        tmp.append(ast.literal_eval(i))
df['closed_on'] = tmp

In [68]:
df = df.drop([
    'query',
    'categories',
    'main_category',
    'competitors', 
    'detailed_reviews',
    'main_category',
    'featured_reviews', 
    'popular_times', 
    'images',
    'phone',
    'description',
    'owner',
    'can_claim',
    'featured_image',
    'main_category',
    'is_temporarily_closed',
    'review_keywords',
    'address',
    'link',
    'status',
    'featured_question',
    'reviews_link',
    'plus_code',
    'detailed_address',
    'time_zone',
    'menu',
    'reservations',
    'order_online_links',
    'website',
    'most_popular_times',
], axis=1)

In [160]:
list(df['workday_timing'].unique())

['12:00-00:00',
 '12:00-14:30, 19:00-22:30',
 '08:00-01:00',
 '19:00-00:00',
 '11:00-23:00',
 '09:00-02:00',
 '12:00-15:00, 18:30-23:00',
 '12:00-14:15, 19:00-23:00',
 '10:30-02:00',
 '18:00-02:00',
 '12:00-14:15, 19:00-22:30',
 '10:00-23:00',
 '08:00-00:30',
 '12:00-13:30, 19:15-21:30',
 '09:30-23:00',
 '19:00-02:00',
 '11:00-15:00, 18:00-23:00',
 '10:30-23:00',
 '07:30-20:00',
 '18:30-23:30',
 '11:00-23:30',
 '12:00-22:00',
 '07:00-02:00',
 '10:00-02:00',
 '12:00-15:30, 19:00-23:00',
 '18:00-00:00',
 '12:00-14:00, 19:00-22:00',
 '07:00-21:00',
 '07:00-00:00',
 '12:00-15:00, 18:30-22:00',
 '12:00-14:30, 18:30-23:30',
 nan,
 '20:00-23:30',
 '12:00-15:00, 19:00-23:30',
 '11:30-22:00',
 '19:00-22:30',
 '08:00-02:00',
 '12:00-14:00, 18:00-21:00',
 '09:00-23:00',
 '11:00-00:00',
 '11:00-15:00, 19:00-00:00',
 '18:00-22:30',
 '12:00-14:30, 19:00-23:30',
 '12:00-15:00, 19:00-23:00',
 '12:00-15:00, 19:00-22:00',
 '12:00-14:45, 18:30-00:00',
 '12:00-13:30, 19:30-21:30',
 '12:00-16:00, 19:00-22:

In [161]:
from typing import List, Dict

def categorize_opening_times(times: List[str]) -> List[Dict[str, List[str]]]:
    """
    Categorizes opening times into morning, noon, afternoon, evening, and late.

    Args:
        times (List[str]): A list of opening times in the format 'HH:MM-HH:MM, HH:MM-HH:MM'.

    Returns:
        List[Dict[str, List[str]]]: A list of dictionaries categorizing times for each input.

    Example:
        Input:
        [
            '12:30-14:15, 19:30-22:00',
            '08:00-19:30'
        ]
        Output:
        [
            {'open_morning': [], 'open_noon': ['12:30-14:15'], 'open_afternoon': [], 'open_evening': ['19:30-22:00'], 'open_late': []},
            {'open_morning': ['08:00-12:00'], 'open_noon': ['12:00-14:00'], 'open_afternoon': ['14:00-18:00'], 'open_evening': ['18:00-19:30'], 'open_late': []}
        ]
    """
    # Define time ranges
    time_categories = {
        "open_morning": (6, 12),
        "open_noon": (12, 14),
        "open_afternoon": (14, 18),
        "open_evening": (18, 22),
        "open_late": (22, 6),
    }

    def time_to_hours(time_str: str) -> int:
        """Converts a time string 'HH:MM' to fractional hours."""
        hours, minutes = map(int, time_str.split(':'))
        return hours + minutes / 60

    categorized_times = []
    for time_entry in times:
        # Split multiple ranges
        time_ranges = [t.strip() for t in time_entry.split(',')]
        categories = {key: [] for key in time_categories}

        for time_range in time_ranges:
            start, end = time_range.split('-')
            start_hour = time_to_hours(start)
            end_hour = time_to_hours(end)

            for category, (cat_start, cat_end) in time_categories.items():
                if cat_start < cat_end:  # Normal range (e.g., 06:00–12:00)
                    if start_hour < cat_end and end_hour > cat_start:
                        overlap_start = max(start_hour, cat_start)
                        overlap_end = min(end_hour, cat_end)
                        categories[category].append(f"{overlap_start:.2f}-{overlap_end:.2f}")
                else:  # Overnight range (e.g., 22:00–06:00)
                    if start_hour >= cat_start or end_hour <= cat_end:
                        categories[category].append(f"{start_hour:.2f}-{end_hour:.2f}")
        
        categorized_times.append(categories)

    return categorized_times

In [162]:
times = [
    '12:30-14:15, 19:30-22:00',
    '08:00-19:30',
    '09:30-22:00',
    '11:00-15:00, 18:00-22:30',
    '10:00-22:00',
    '11:00-14:00, 16:30-20:30',
    '11:30-14:30, 19:00-22:45',
    '12:00-14:00, 18:30-23:00',
    '12:30-14:00, 19:30-21:30',
    '12:00-15:00, 19:00-02:00',
    '11:30-15:00, 19:00-22:00',
    '12:00-14:00, 18:30-22:30',
    '12:00-14:30, 18:30-00:00',
    '12:00-15:00, 18:30-21:00',
    '12:00-14:45, 19:00-22:30',
    '11:00-15:00, 18:00-00:00',
    '12:00-14:15, 18:45-22:30',
    '00:00-02:00, 12:00-14:30, 19:00-22:30',
    '19:30-00:30',
]

result = categorize_opening_times(times)
for r in result:
    print(r)

{'open_morning': [], 'open_noon': ['12.50-14.00'], 'open_afternoon': ['14.00-14.25'], 'open_evening': ['19.50-22.00'], 'open_late': []}
{'open_morning': ['8.00-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['14.00-18.00'], 'open_evening': ['18.00-19.50'], 'open_late': []}
{'open_morning': ['9.50-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['14.00-18.00'], 'open_evening': ['18.00-22.00'], 'open_late': []}
{'open_morning': ['11.00-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['14.00-15.00'], 'open_evening': ['18.00-22.00'], 'open_late': []}
{'open_morning': ['10.00-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['14.00-18.00'], 'open_evening': ['18.00-22.00'], 'open_late': []}
{'open_morning': ['11.00-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['16.50-18.00'], 'open_evening': ['18.00-20.50'], 'open_late': []}
{'open_morning': ['11.50-12.00'], 'open_noon': ['12.00-14.00'], 'open_afternoon': ['14.00-14.50'], 'open_evening': ['19.0

In [112]:
#### workday split from afternoon and morning
tmp = []
for i in df['workday_timing']:
    for y in i.strip().split(','):
    print(i)
    tmp.append(
        (
            datetime.datetime.strptime(i.split('-')[0], '%H:%M').time(),
            datetime.datetime.strptime(i.split('-')[1], '%H:%M').time(),
        )
    )

12:00-00:00
12:00-14:30, 19:00-22:30


ValueError: unconverted data remains: , 19:00

In [113]:
a = datetime.datetime.strptime(
    df['workday_timing'].iloc[5].split('-')[0], 
    '%H:%M').time()

In [137]:
a = df['workday_timing'].iloc[16]

In [141]:
a

'19:00-02:00'

In [150]:
a1 = datetime.datetime.strptime(
    a.split('-')[0], 
    '%H:%M').time()

In [151]:
a2 = datetime.datetime.strptime(
    a.split('-')[1], 
    '%H:%M').time()

In [156]:
a1>=datetime.datetime.strptime('19:00', '%H:%M').time()

True

In [157]:
a2<=datetime.datetime.strptime('21:00', '%H:%M').time()

True

In [124]:
int(a.strftime('%H%M'))

1100

In [149]:
df['workday_timing'].iloc[140]

'12:00-14:30, 19:00-22:30'

In [136]:
df['closed_on']

0              Open All Days
1               ["dimanche"]
2               ["mercredi"]
3              Open All Days
4                  ["lundi"]
                ...         
1132    ["lundi","dimanche"]
1133               ["lundi"]
1134               ["lundi"]
1135               ["lundi"]
1136            ["dimanche"]
Name: closed_on, Length: 1137, dtype: object

In [81]:
# categories -> drop restaurant
# workday_timing -> change to timestamp
# is_permanently_closed -> drop if not nan then drop column
# closed_on -> change all days to NaN
#### for the moment we will delete and we will 
df = df.drop([
    'workday_timing', 
    'is_permanently_closed',
    'closed_on',
    'most_popular_times'
], axis=1)

In [82]:
df = df.drop_duplicates(subset='data_id', keep="first")

In [83]:
### remove the ids
df = df.drop([
    'place_id', 
    'name',
    'data_id',
    'cid',
], axis=1)

In [84]:
df.shape

(1013, 12)

In [85]:
df = df.drop([
    'reviews_per_rating',
    'coordinates',
    'about',
    'hours',
], axis=1)

In [86]:
not_plot = [
    'reviews_per_rating',
    'coordinates',
    'about',
    'hours',
]

In [87]:
fig = px.scatter_matrix(df,
    dimensions=[i for i in df.columns if i not in not_plot],
    #color="species", 
    #symbol="species",
    title="Scatter matrix of iris data set",
) # remove underscore
fig.update_traces(diagonal_visible=False)
fig.show()

In [96]:
# change the price to int
df.price_range = [np.nan if pd.isna(i) else len(i) for i in df.price_range]

In [98]:
df_r = 
df_r = df[[i for i in df.columns if i!='price_range']]

In [None]:
#### deal with nan values

In [136]:
import pandas as pd
import numpy as np
import plotly.express as px

def nan_heatmap(df: pd.DataFrame) -> None:
    """
    Generates a heatmap showing the location of NaN values in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame to visualize.
    """
    # Create a boolean DataFrame indicating NaN locations
    nan_df = df.isna()
    # Convert boolean values to integers (NaN=1, non-NaN=0) for heatmap visualization
    nan_int_df = nan_df.astype(int)

    # Plot the heatmap
    fig = px.imshow(nan_int_df, 
                    labels=dict(x="Columns", y="Rows", color="NaN"),
                    color_continuous_scale=["white", "black"],
                    title="Heatmap of NaN Values in DataFrame")
    fig.show()

In [141]:
df.has_menu = df.has_menu.fillna(False)
df.can_order_online = df.can_order_online.fillna(False)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [139]:
df

Unnamed: 0,is_spending_on_ads,reviews,rating,price_range,has_reservations,can_order_online,has_menu,has_website
0,True,1979,4.4,7.0,True,True,True,False
1,True,1965,4.8,7.0,True,True,True,False
2,True,391,4.6,2.0,True,True,True,False
3,True,415,4.4,7.0,True,True,True,False
4,True,328,4.6,,True,True,True,False
...,...,...,...,...,...,...,...,...
1130,True,707,4.2,7.0,True,True,True,False
1133,True,61,4.9,7.0,True,True,False,True
1134,True,892,4.6,7.0,False,False,False,False
1135,True,302,4.8,7.0,False,True,True,False


In [142]:
nan_heatmap(df)

In [None]:
#replace 

In [104]:
#### regressor

In [143]:
#df_ana = df.dropna()

In [148]:
def convert_boolean_to_int(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts all boolean columns in a DataFrame to integer columns.

    Args:
        df (pd.DataFrame): The input DataFrame with boolean columns.

    Returns:
        pd.DataFrame: DataFrame with boolean columns converted to integers.
    """
    # Convert all boolean columns to integers
    bool_columns = df.select_dtypes(include='bool').columns
    df[bool_columns] = df[bool_columns].astype(int)
    return df

In [105]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

In [156]:
df

Unnamed: 0,is_spending_on_ads,reviews,rating,price_range,has_reservations,can_order_online,has_menu,has_website
0,1,1979,4.4,7.0,1,1,1,0
1,1,1965,4.8,7.0,1,1,1,0
2,1,391,4.6,2.0,1,1,1,0
3,1,415,4.4,7.0,1,1,1,0
4,1,328,4.6,,1,1,1,0
...,...,...,...,...,...,...,...,...
1130,1,707,4.2,7.0,1,1,1,0
1133,1,61,4.9,7.0,1,1,0,1
1134,1,892,4.6,7.0,0,0,0,0
1135,1,302,4.8,7.0,0,1,1,0


In [163]:
df_ana = convert_boolean_to_int(df)
df_ana = df_ana.dropna()

In [166]:
#to_train = 'price_range'
to_train = 'rating'
X_train, X_test, y_train, y_test = train_test_split(
    df_ana[[i for i in df_ana.columns if i!=to_train]].values, df_ana[to_train].values,
    train_size=0.75,
    test_size=0.25,
)

In [168]:
tpot = TPOTRegressor(generations=10, population_size=100, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/1100 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.1115780456493827

Generation 2 - Current best internal CV score: -0.11133760528568921

Generation 3 - Current best internal CV score: -0.11132596658514808

Generation 4 - Current best internal CV score: -0.11118228606963329

Generation 5 - Current best internal CV score: -0.11080372573857385

Generation 6 - Current best internal CV score: -0.11080372573857385

Generation 7 - Current best internal CV score: -0.11080372573857385

Generation 8 - Current best internal CV score: -0.11080372573857385

Generation 9 - Current best internal CV score: -0.11080372573857385

Generation 10 - Current best internal CV score: -0.11080372573857385

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=False, max_features=0.4, min_samples_leaf=16, min_samples_split=14, n_estimators=100)
-0.22311261485851705
