In [71]:
import pandas as pd
from tableone import TableOne

# CONTEXT DATAFRAME

In [72]:
# Prefixed column labels for the context table, in the column order of the
# context CSV (they are assigned positionally to the loaded frame).
context_names = [
    "interaction_id",
    "context_location",
    "context_time_of_day",
    "context_weather",
]

In [73]:
# Load the raw context table and preview the first rows.
# A forward-slash relative path is used on purpose: the backslash form
# (data\context.csv) only resolves on Windows, whereas forward slashes are
# accepted on Windows, Linux and macOS alike.
context_df = pd.read_csv("data/context.csv")
print(context_df.head())

   interaction_id       location time_of_day weather
0               1        Chicago     Evening   Sunny
1               2        Houston       Night   Rainy
2               3       New York     Evening   Snowy
3               4  San Francisco   Afternoon   Sunny
4               5        Chicago     Evening   Sunny


In [74]:
# Relabel the columns positionally with the prefixed names; set_axis returns
# the relabelled frame rather than mutating the existing one in place.
context_df = context_df.set_axis(context_names, axis="columns")

In [75]:
# Sanity check: number of missing values per column, then (rows, columns).
context_null_counts = context_df.isna().sum()
print(context_null_counts)
print(context_df.shape)

interaction_id         0
context_location       0
context_time_of_day    0
context_weather        0
dtype: int64
(10000, 4)


# INTERACTIONS DATAFRAME

In [76]:
# Column labels for the interactions table, in the column order of the
# interactions CSV (they are assigned positionally to the loaded frame).
interaction_names = [
    "user_id",
    "restaurant_id",
    "interaction_type",
    "interaction_timestamp",
]

In [77]:
# Load the raw interactions table and preview the first rows.
# A forward-slash relative path is used on purpose: the backslash form
# (data\interactions.csv) only resolves on Windows, whereas forward slashes
# are accepted on Windows, Linux and macOS alike.
interactions_df = pd.read_csv("data/interactions.csv")
print(interactions_df.head())

   user_id  restaurant_id interaction_type            timestamp
0      314            409             like  2024-01-04 10:43:00
1      869            424            click  2024-01-05 03:46:00
2      368             32            click  2024-01-06 09:12:00
3      429            472            visit  2024-01-04 11:11:00
4      830             28             like  2024-01-03 15:21:00


In [78]:
# Relabel the columns positionally with the project-wide names; set_axis
# returns the relabelled frame rather than mutating the existing one in place.
interactions_df = interactions_df.set_axis(interaction_names, axis="columns")

In [79]:
# Sanity check: number of missing values per column, then (rows, columns).
interactions_null_counts = interactions_df.isna().sum()
print(interactions_null_counts)
print(interactions_df.shape)

user_id                  0
restaurant_id            0
interaction_type         0
interaction_timestamp    0
dtype: int64
(10000, 4)


In [80]:
# Give every interaction a sequential 1-based id following the row order.
# NOTE(review): this presumes row i of the interactions table corresponds to
# interaction_id i + 1 in the context table — confirm against the data source.
interaction_id_list = list(range(1, len(interactions_df) + 1))
interactions_df["interaction_id"] = interaction_id_list

print(interactions_df.head())

   user_id  restaurant_id interaction_type interaction_timestamp  \
0      314            409             like   2024-01-04 10:43:00   
1      869            424            click   2024-01-05 03:46:00   
2      368             32            click   2024-01-06 09:12:00   
3      429            472            visit   2024-01-04 11:11:00   
4      830             28             like   2024-01-03 15:21:00   

   interaction_id  
0               1  
1               2  
2               3  
3               4  
4               5  


In [81]:
# Inner join on interaction_id: attach the context columns to each interaction.
# validate="one_to_one" makes pandas raise MergeError when interaction_id is
# duplicated on either side, instead of silently multiplying rows.
final_df = pd.merge(
    context_df,
    interactions_df,
    on="interaction_id",
    how="inner",
    validate="one_to_one",
)
print(final_df)

      interaction_id context_location context_time_of_day context_weather  \
0                  1          Chicago             Evening           Sunny   
1                  2          Houston               Night           Rainy   
2                  3         New York             Evening           Snowy   
3                  4    San Francisco           Afternoon           Sunny   
4                  5          Chicago             Evening           Sunny   
...              ...              ...                 ...             ...   
9995            9996          Houston           Afternoon          Cloudy   
9996            9997    San Francisco             Morning           Sunny   
9997            9998      Los Angeles             Morning           Rainy   
9998            9999          Chicago             Morning           Snowy   
9999           10000          Chicago           Afternoon           Rainy   

      user_id  restaurant_id interaction_type interaction_timestamp  
0    

# RESTAURANTS DATAFRAME

In [82]:
# Prefixed column labels for the restaurants table, in the column order of
# the restaurants CSV (they are assigned positionally to the loaded frame).
restaurant_names = [
    "restaurant_id",
    "restaurant_name",
    "restaurant_cuisine",
    "restaurant_rating",
    "restaurant_price_range",
    "restaurant_location",
]

In [83]:
# Load the raw restaurants table and preview the first rows.
# A forward-slash relative path is used on purpose: the backslash form
# (data\restaurants.csv) only resolves on Windows, whereas forward slashes
# are accepted on Windows, Linux and macOS alike.
restaurants_df = pd.read_csv("data/restaurants.csv")
print(restaurants_df.head())

   restaurant_id                      name   cuisine  rating price_range  \
0              1   Banks, Flynn and Joseph   Mexican     2.7          $$   
1              2  Garza, Melton and Powell   Mexican     4.4          $$   
2              3             Frederick PLC  Japanese     3.5           $   
3              4  Sullivan, Gray and Price      Thai     4.4           $   
4              5           Hurst-Frederick  Japanese     4.8          $$   

      location  
0      Chicago  
1  Los Angeles  
2  Los Angeles  
3      Chicago  
4  Los Angeles  


In [84]:
# Relabel the columns positionally with the prefixed names; set_axis returns
# the relabelled frame rather than mutating the existing one in place.
restaurants_df = restaurants_df.set_axis(restaurant_names, axis="columns")

In [85]:
# Sanity check: number of missing values per column, then (rows, columns).
restaurants_null_counts = restaurants_df.isna().sum()
print(restaurants_null_counts)
print(restaurants_df.shape)

restaurant_id             0
restaurant_name           0
restaurant_cuisine        0
restaurant_rating         0
restaurant_price_range    0
restaurant_location       0
dtype: int64
(500, 6)


In [86]:
# Inner join on restaurant_id: attach restaurant attributes to each interaction.
# validate="many_to_one" makes pandas raise MergeError when restaurant_id is
# duplicated in restaurants_df, which would otherwise silently duplicate
# interaction rows.
# NOTE: this cell rebinds final_df from itself. Re-running it on an already
# merged final_df makes the restaurant columns overlap and come back with
# _x/_y suffixes, so re-run the previous merge cell first.
final_df = pd.merge(
    final_df,
    restaurants_df,
    on="restaurant_id",
    how="inner",
    validate="many_to_one",
)
print(final_df.shape)

(10000, 13)


In [87]:
# Show which columns the merged frame carries at this point.
merged_columns = final_df.columns
print(merged_columns)

Index(['interaction_id', 'context_location', 'context_time_of_day',
       'context_weather', 'user_id', 'restaurant_id', 'interaction_type',
       'interaction_timestamp', 'restaurant_name', 'restaurant_cuisine',
       'restaurant_rating', 'restaurant_price_range', 'restaurant_location'],
      dtype='object')


# USERS DATAFRAME

In [88]:
# Prefixed column labels for the users table, in the column order of the
# users CSV (they are assigned positionally to the loaded frame).
user_names = [
    "user_id",
    "user_age",
    "user_gender",
    "user_preferred_cuisine",
]

In [89]:
# Load the raw users table and preview the first rows.
# A forward-slash relative path is used on purpose: the backslash form
# (data\users.csv) only resolves on Windows, whereas forward slashes are
# accepted on Windows, Linux and macOS alike.
users_df = pd.read_csv("data/users.csv")
print(users_df.head())

   user_id  age  gender preferred_cuisine
0        1   56    Male            Indian
1        2   46   Other           Chinese
2        3   32    Male            French
3        4   25  Female           Mexican
4        5   38   Other              Thai


In [90]:
# Relabel the columns positionally with the prefixed names; set_axis returns
# the relabelled frame rather than mutating the existing one in place.
users_df = users_df.set_axis(user_names, axis="columns")

In [91]:
# Sanity check: number of missing values per column, then (rows, columns).
users_null_counts = users_df.isna().sum()
print(users_null_counts)
print(users_df.shape)

user_id                   0
user_age                  0
user_gender               0
user_preferred_cuisine    0
dtype: int64
(1000, 4)


In [92]:
# Inner join on user_id: attach user attributes to each interaction.
# validate="many_to_one" makes pandas raise MergeError when user_id is
# duplicated in users_df, which would otherwise silently duplicate
# interaction rows.
# NOTE: this cell rebinds final_df from itself. Re-running it on an already
# merged final_df makes the user columns overlap and come back with _x/_y
# suffixes, so re-run the earlier merge cells first.
final_df = pd.merge(
    final_df,
    users_df,
    on="user_id",
    how="inner",
    validate="many_to_one",
)
print(final_df.columns)
print(final_df.shape)

Index(['interaction_id', 'context_location', 'context_time_of_day',
       'context_weather', 'user_id', 'restaurant_id', 'interaction_type',
       'interaction_timestamp', 'restaurant_name', 'restaurant_cuisine',
       'restaurant_rating', 'restaurant_price_range', 'restaurant_location',
       'user_age', 'user_gender', 'user_preferred_cuisine'],
      dtype='object')
(10000, 16)


In [93]:
# Count missing values per column in the fully merged frame.
final_null_counts = final_df.isna().sum()
print(final_null_counts)

interaction_id            0
context_location          0
context_time_of_day       0
context_weather           0
user_id                   0
restaurant_id             0
interaction_type          0
interaction_timestamp     0
restaurant_name           0
restaurant_cuisine        0
restaurant_rating         0
restaurant_price_range    0
restaurant_location       0
user_age                  0
user_gender               0
user_preferred_cuisine    0
dtype: int64


# Data verification

In [94]:
# Inspect the dtype of every column in the merged frame.
column_dtypes = final_df.dtypes
print(column_dtypes)

# Number of rows of the merged frame per restaurant cuisine.
cuisine_counts = final_df["restaurant_cuisine"].value_counts()
print(cuisine_counts)

interaction_id              int64
context_location           object
context_time_of_day        object
context_weather            object
user_id                     int64
restaurant_id               int64
interaction_type           object
interaction_timestamp      object
restaurant_name            object
restaurant_cuisine         object
restaurant_rating         float64
restaurant_price_range     object
restaurant_location        object
user_age                    int64
user_gender                object
user_preferred_cuisine     object
dtype: object
restaurant_cuisine
Mexican     1514
Japanese    1308
Thai        1272
Chinese     1264
French      1221
Italian     1189
Indian      1174
American    1058
Name: count, dtype: int64


In [None]:
# Variables handed to TableOne as categorical.
categorical = [
    "restaurant_cuisine",
    "restaurant_price_range",
    "restaurant_location",
]
# Variables handed to TableOne as continuous.
continuous = ["restaurant_rating"]

In [101]:
# Build the summary table of the restaurant variables from the merged frame.
# NOTE(review): grouping by restaurant_name yields one output column per
# restaurant (see the very wide table printed by the next cell) — confirm
# that this is the intended grouping variable.
table1 = TableOne(
    final_df,
    categorical=categorical,
    continuous=continuous,
    groupby="restaurant_name",
    pval=True,
    htest_name=True,
    overall=True,
    decimals=2,
    missing=False,
)


In [102]:
# Render the summary table as GitHub-flavoured markdown text and print it.
table1_github = table1.tabulate(tablefmt="github")
print(table1_github)

|                               |               | Overall      | Adams Ltd   | Allen Inc   | Alvarez, Hudson and Hawkins   | Anderson Group   | Anderson, Hawkins and Conley   | Anderson-Oconnor   | Andrews and Sons   | Anthony-Alvarado   | Arellano, Lane and Roberts   | Armstrong-Martin   | Avila Inc   | Ayala, Hernandez and Maldonado   | Baird-Brock   | Baker Group   | Baker Inc   | Banks, Flynn and Joseph   | Barnes, Cook and Roberts   | Barnett, Butler and Smith   | Barnett, Lynch and Harris   | Bass, Spears and Brooks   | Bates-Lopez   | Bean, Pham and Porter   | Beck, Lawson and Bailey   | Bennett-Taylor   | Berger and Sons   | Bernard, Lane and Mitchell   | Bishop-Waller   | Blackburn, Elliott and Ward   | Blair-Simpson   | Blankenship, Lee and Cooper   | Bolton, Mills and Bennett   | Bowers, Robbins and Harrell   | Bradshaw-Johnson   | Bradshaw-Velazquez   | Bray LLC    | Briggs-Olsen   | Brooks, Mckee and Martinez   | Brooks, Wilson and Green   | Brooks-Gibson   | Brown PLC   |