In [129]:
import os
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from tableone import TableOne

# CONTEXT DATAFRAME

In [81]:
# Prefixed column labels applied to the context table after loading.
context_names = [
    "interaction_id",
    "context_location",
    "context_time_of_day",
    "context_weather",
]

In [82]:
# Load the raw context table and preview the first rows.
# A forward slash is used as the path separator so the relative path resolves
# on Windows, Linux and macOS; a backslash separator only works on Windows.
context_df = pd.read_csv('data/context.csv')
print(context_df.head())

   interaction_id       location time_of_day weather
0               1        Chicago     Evening   Sunny
1               2        Houston       Night   Rainy
2               3       New York     Evening   Snowy
3               4  San Francisco   Afternoon   Sunny
4               5        Chicago     Evening   Sunny


In [83]:
# Replace the raw CSV headers with the prefixed labels; assignment is positional.
context_df.columns = context_names

In [84]:
# Sanity check: missing values per column, then the (rows, columns) shape.
print(context_df.isna().sum(), context_df.shape, sep="\n")

interaction_id         0
context_location       0
context_time_of_day    0
context_weather        0
dtype: int64
(10000, 4)


# INTERACTIONS DATAFRAME

In [85]:
# Column labels applied to the interactions table after loading.
interaction_names = [
    "user_id",
    "restaurant_id",
    "interaction_type",
    "interaction_timestamp",
]

In [86]:
# Load the raw interactions table and preview the first rows.
# A forward slash is used as the path separator so the relative path resolves
# on Windows, Linux and macOS; a backslash separator only works on Windows.
interactions_df = pd.read_csv('data/interactions.csv')
print(interactions_df.head())

   user_id  restaurant_id interaction_type            timestamp
0      314            409             like  2024-01-04 10:43:00
1      869            424            click  2024-01-05 03:46:00
2      368             32            click  2024-01-06 09:12:00
3      429            472            visit  2024-01-04 11:11:00
4      830             28             like  2024-01-03 15:21:00


In [87]:
# Replace the raw CSV headers with the project labels; assignment is positional.
interactions_df.columns = interaction_names

In [88]:
# Sanity check: missing values per column, then the (rows, columns) shape.
print(interactions_df.isna().sum(), interactions_df.shape, sep="\n")

user_id                  0
restaurant_id            0
interaction_type         0
interaction_timestamp    0
dtype: int64
(10000, 4)


In [89]:
# Give every interaction row a sequential 1-based interaction_id so the table
# can be joined with the context table on that key.
# NOTE(review): this presumes row i of interactions.csv corresponds to
# interaction_id i + 1 in context.csv — confirm against the data source.
interaction_id_list = list(range(1, len(interactions_df) + 1))

interactions_df["interaction_id"] = interaction_id_list
print(interactions_df.head())

   user_id  restaurant_id interaction_type interaction_timestamp  \
0      314            409             like   2024-01-04 10:43:00   
1      869            424            click   2024-01-05 03:46:00   
2      368             32            click   2024-01-06 09:12:00   
3      429            472            visit   2024-01-04 11:11:00   
4      830             28             like   2024-01-03 15:21:00   

   interaction_id  
0               1  
1               2  
2               3  
3               4  
4               5  


In [90]:
# Inner join: pair each context row with the interaction sharing its interaction_id.
final_df = context_df.merge(interactions_df, how='inner', on='interaction_id')
print(final_df)

      interaction_id context_location context_time_of_day context_weather  \
0                  1          Chicago             Evening           Sunny   
1                  2          Houston               Night           Rainy   
2                  3         New York             Evening           Snowy   
3                  4    San Francisco           Afternoon           Sunny   
4                  5          Chicago             Evening           Sunny   
...              ...              ...                 ...             ...   
9995            9996          Houston           Afternoon          Cloudy   
9996            9997    San Francisco             Morning           Sunny   
9997            9998      Los Angeles             Morning           Rainy   
9998            9999          Chicago             Morning           Snowy   
9999           10000          Chicago           Afternoon           Rainy   

      user_id  restaurant_id interaction_type interaction_timestamp  
0    

# RESTAURANTS DATAFRAME

In [91]:
# Prefixed column labels applied to the restaurants table after loading.
restaurant_names = [
    "restaurant_id",
    "restaurant_name",
    "restaurant_cuisine",
    "restaurant_rating",
    "restaurant_price_range",
    "restaurant_location",
]

In [92]:
# Load the raw restaurants table and preview the first rows.
# A forward slash is used as the path separator so the relative path resolves
# on Windows, Linux and macOS; a backslash separator only works on Windows.
restaurants_df = pd.read_csv('data/restaurants.csv')
print(restaurants_df.head())

   restaurant_id                      name   cuisine  rating price_range  \
0              1   Banks, Flynn and Joseph   Mexican     2.7          $$   
1              2  Garza, Melton and Powell   Mexican     4.4          $$   
2              3             Frederick PLC  Japanese     3.5           $   
3              4  Sullivan, Gray and Price      Thai     4.4           $   
4              5           Hurst-Frederick  Japanese     4.8          $$   

      location  
0      Chicago  
1  Los Angeles  
2  Los Angeles  
3      Chicago  
4  Los Angeles  


In [93]:
# Replace the raw CSV headers with the prefixed labels; assignment is positional.
restaurants_df.columns = restaurant_names

In [94]:
# Sanity check: missing values per column, then the (rows, columns) shape.
print(restaurants_df.isna().sum(), restaurants_df.shape, sep="\n")

restaurant_id             0
restaurant_name           0
restaurant_cuisine        0
restaurant_rating         0
restaurant_price_range    0
restaurant_location       0
dtype: int64
(500, 6)


In [95]:
# Inner join: add the restaurant attributes to every interaction via restaurant_id.
final_df = final_df.merge(restaurants_df, how='inner', on='restaurant_id')
print(final_df.shape)

(10000, 13)


In [96]:
# Show the column labels of the merged frame after the restaurant join.
print(final_df.columns)

Index(['interaction_id', 'context_location', 'context_time_of_day',
       'context_weather', 'user_id', 'restaurant_id', 'interaction_type',
       'interaction_timestamp', 'restaurant_name', 'restaurant_cuisine',
       'restaurant_rating', 'restaurant_price_range', 'restaurant_location'],
      dtype='object')


# USERS DATAFRAME

In [97]:
# Prefixed column labels applied to the users table after loading.
user_names = [
    "user_id",
    "user_age",
    "user_gender",
    "user_preferred_cuisine",
]

In [98]:
# Load the raw users table and preview the first rows.
# A forward slash is used as the path separator so the relative path resolves
# on Windows, Linux and macOS; a backslash separator only works on Windows.
users_df = pd.read_csv("data/users.csv")
print(users_df.head())

   user_id  age  gender preferred_cuisine
0        1   56    Male            Indian
1        2   46   Other           Chinese
2        3   32    Male            French
3        4   25  Female           Mexican
4        5   38   Other              Thai


In [99]:
# Replace the raw CSV headers with the prefixed labels; assignment is positional.
users_df.columns = user_names

In [100]:
# Sanity check: missing values per column, then the (rows, columns) shape.
print(users_df.isna().sum(), users_df.shape, sep="\n")

user_id                   0
user_age                  0
user_gender               0
user_preferred_cuisine    0
dtype: int64
(1000, 4)


In [101]:
# Inner join with the users table on user_id, then show the resulting schema and size.
final_df = final_df.merge(users_df, how="inner", on="user_id")
print(final_df.columns, final_df.shape, sep="\n")

Index(['interaction_id', 'context_location', 'context_time_of_day',
       'context_weather', 'user_id', 'restaurant_id', 'interaction_type',
       'interaction_timestamp', 'restaurant_name', 'restaurant_cuisine',
       'restaurant_rating', 'restaurant_price_range', 'restaurant_location',
       'user_age', 'user_gender', 'user_preferred_cuisine'],
      dtype='object')
(10000, 16)


In [102]:
# Count missing values per column in the fully merged frame.
print(final_df.isna().sum())

interaction_id            0
context_location          0
context_time_of_day       0
context_weather           0
user_id                   0
restaurant_id             0
interaction_type          0
interaction_timestamp     0
restaurant_name           0
restaurant_cuisine        0
restaurant_rating         0
restaurant_price_range    0
restaurant_location       0
user_age                  0
user_gender               0
user_preferred_cuisine    0
dtype: int64


# Verificação dos dados

In [103]:
# Column dtypes of the merged frame, followed by the row count per cuisine.
print(final_df.dtypes, final_df["restaurant_cuisine"].value_counts(), sep="\n")

interaction_id              int64
context_location           object
context_time_of_day        object
context_weather            object
user_id                     int64
restaurant_id               int64
interaction_type           object
interaction_timestamp      object
restaurant_name            object
restaurant_cuisine         object
restaurant_rating         float64
restaurant_price_range     object
restaurant_location        object
user_age                    int64
user_gender                object
user_preferred_cuisine     object
dtype: object
restaurant_cuisine
Mexican     1514
Japanese    1308
Thai        1272
Chinese     1264
French      1221
Italian     1189
Indian      1174
American    1058
Name: count, dtype: int64


In [104]:
# Variable groups passed to the summary table: categorical vs. continuous columns.
categorical = [
    'restaurant_cuisine',
    'restaurant_price_range',
    'restaurant_location',
]
continuous = ['restaurant_rating']

In [105]:
# Build a summary table of the restaurant attributes, grouped by restaurant_name.
# NOTE(review): grouping by restaurant_name produces one column per distinct
# name (see the very wide rendered output) — confirm this is the intended
# grouping variable.
table1 = TableOne(final_df, categorical=categorical, continuous=continuous, groupby='restaurant_name', pval=True, htest_name=True, overall=True, decimals=2, missing=False)


In [106]:
# Render the summary table as plain text.
# NOTE(review): "table" is presumably intended as a tabulate format name —
# confirm it is a supported value.
print(table1.tabulate(tablefmt="table"))

                                              Overall       Adams Ltd    Allen Inc    Alvarez, Hudson and Hawkins    Anderson Group    Anderson, Hawkins and Conley    Anderson-Oconnor    Andrews and Sons    Anthony-Alvarado    Arellano, Lane and Roberts    Armstrong-Martin    Avila Inc    Ayala, Hernandez and Maldonado    Baird-Brock    Baker Group    Baker Inc    Banks, Flynn and Joseph    Barnes, Cook and Roberts    Barnett, Butler and Smith    Barnett, Lynch and Harris    Bass, Spears and Brooks    Bates-Lopez    Bean, Pham and Porter    Beck, Lawson and Bailey    Bennett-Taylor    Berger and Sons    Bernard, Lane and Mitchell    Bishop-Waller    Blackburn, Elliott and Ward    Blair-Simpson    Blankenship, Lee and Cooper    Bolton, Mills and Bennett    Bowers, Robbins and Harrell    Bradshaw-Johnson    Bradshaw-Velazquez    Bray LLC     Briggs-Olsen    Brooks, Mckee and Martinez    Brooks, Wilson and Green    Brooks-Gibson    Brown PLC    Brown and Sons    Brown, Johnson and Zimmerm

# NAIVE BAYES

In [107]:
# Standard scaler for the restaurant features.
# NOTE(review): among the visible cells this object is only referenced by a
# commented-out line — confirm it is still needed.
scaler_restaurant = StandardScaler()

In [108]:
# Feature matrix (cuisine, rating, price range, location) and target vector
# (restaurant name), both extracted as NumPy arrays.
X_restaurant = final_df.loc[
    :,
    ["restaurant_cuisine", "restaurant_rating", "restaurant_price_range", "restaurant_location"],
].to_numpy()
y_restaurant = final_df.loc[:, "restaurant_name"].to_numpy()

print(X_restaurant, y_restaurant, sep="\n")

[['Chinese' 3.1 '$$$' 'New York']
 ['Mexican' 3.0 '$$' 'New York']
 ['Mexican' 5.0 '$$$$' 'Los Angeles']
 ...
 ['Mexican' 3.1 '$$' 'Chicago']
 ['Indian' 4.1 '$$$' 'New York']
 ['American' 3.0 '$$' 'Los Angeles']]
['Meadows Inc' 'Lane, Lee and Thomas' 'Nelson, Perry and Murphy' ...
 'Stevenson Group' 'Garcia Ltd' 'Horn-Gonzalez']


In [109]:
# One independent encoder per categorical feature column.
label_encoder_cuisine, label_encoder_price_range, label_encoder_location = (
    LabelEncoder() for _ in range(3)
)

In [110]:
# Overwrite each categorical column (0 = cuisine, 2 = price range, 3 = location)
# with the integer codes produced by its dedicated encoder.
for column_index, encoder in (
    (0, label_encoder_cuisine),
    (2, label_encoder_price_range),
    (3, label_encoder_location),
):
    X_restaurant[:, column_index] = encoder.fit_transform(X_restaurant[:, column_index])

print(X_restaurant)


[[1 3.1 2 3]
 [6 3.0 1 3]
 [6 5.0 3 2]
 ...
 [6 3.1 1 0]
 [3 4.1 2 3]
 [0 3.0 1 2]]


In [111]:
# Feature scaling (scaler_restaurant.fit_transform) is currently disabled.
# 80/20 hold-out split with a fixed seed, followed by the four resulting shapes.
(
    X_train_restaurant,
    X_test_restaurant,
    y_train_restaurant,
    y_test_restaurant,
) = train_test_split(X_restaurant, y_restaurant, random_state=42, test_size=0.2)
print(
    X_train_restaurant.shape,
    X_test_restaurant.shape,
    y_train_restaurant.shape,
    y_test_restaurant.shape,
    sep="\n",
)

(8000, 4)
(2000, 4)
(8000,)
(2000,)


In [112]:
# Persist the label-encoded train/test split so later sections can reload it.
# The tuple order is (X_train, y_train, X_test, y_test); loaders must unpack
# in that same order.
# The output directory is created first so a fresh checkout does not fail
# with FileNotFoundError when opening the file for writing.
os.makedirs('pickles', exist_ok=True)
with open('pickles/restaurant_label_encoder.pkl', 'wb') as f:
    pickle.dump((X_train_restaurant, y_train_restaurant, X_test_restaurant, y_test_restaurant), f)

In [113]:
# Fit Gaussian Naive Bayes on the training split, then predict the test split.
naives_bayes_restaurant = GaussianNB().fit(X_train_restaurant, y_train_restaurant)
previsoes_restaurant = naives_bayes_restaurant.predict(X_test_restaurant)
print(previsoes_restaurant)


['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [114]:
# Show the true test labels for visual comparison with the predictions.
print(y_test_restaurant)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [115]:
# Score the Naive Bayes model on the held-out test split.
print(naives_bayes_restaurant.score(X_test_restaurant, y_test_restaurant))

0.9395


In [116]:
# Per-class precision / recall / F1 for Naive Bayes on the test split.
# zero_division=0 reports 0 for classes whose metric is undefined (no predicted
# or no true samples), as the default does, but without emitting an
# UndefinedMetricWarning for each such class.
print(classification_report(y_test_restaurant, previsoes_restaurant, zero_division=0))

                                   precision    recall  f1-score   support

                        Adams Ltd       1.00      1.00      1.00         5
                        Allen Inc       1.00      1.00      1.00         7
      Alvarez, Hudson and Hawkins       1.00      1.00      1.00         2
                   Anderson Group       1.00      1.00      1.00         4
     Anderson, Hawkins and Conley       1.00      1.00      1.00         2
                 Anderson-Oconnor       1.00      1.00      1.00         8
                 Andrews and Sons       1.00      1.00      1.00         6
                 Anthony-Alvarado       1.00      1.00      1.00         7
       Arellano, Lane and Roberts       1.00      1.00      1.00         4
                 Armstrong-Martin       1.00      1.00      1.00         6
                        Avila Inc       1.00      1.00      1.00         4
   Ayala, Hernandez and Maldonado       1.00      1.00      1.00         4
                      Ba

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# DECISION TREES

In [117]:
# Reload the label-encoded split for the decision tree.
# The pickle stores (X_train, y_train, X_test, y_test), so the first two items
# are unpacked into the *train* names. Unpacking them into X_restaurant /
# y_restaurant would overwrite the full dataset with the training subset and
# leave X_train_restaurant dependent on earlier kernel state.
# NOTE: pickle.load can execute arbitrary code; only load files produced locally.
with open('pickles/restaurant_label_encoder.pkl', 'rb') as f:
    X_train_restaurant, y_train_restaurant, X_test_restaurant, y_test_restaurant = pickle.load(f)
print(X_train_restaurant)

[[5 4.2 3 3]
 [2 2.6 0 0]
 [7 3.0 1 4]
 ...
 [4 3.9 0 3]
 [5 4.4 1 3]
 [6 4.8 0 1]]


In [118]:
# Fit an entropy-based decision tree on the training split and predict the
# test split. random_state is fixed (same seed as the train/test split) so
# tie-breaking between equally good splits is reproducible across runs.
restaurant_tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
restaurant_tree.fit(X_train_restaurant, y_train_restaurant)

previsoes_restaurant_tree = restaurant_tree.predict(X_test_restaurant)
print(previsoes_restaurant_tree)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [119]:
# Show the true test labels for visual comparison with the tree predictions.
print(y_test_restaurant)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [120]:
# Overall accuracy and per-class precision / recall / F1 for the decision tree.
# zero_division=0 reports 0 for classes whose metric is undefined, as the
# default does, but without emitting an UndefinedMetricWarning for each one.
print(accuracy_score(y_test_restaurant, previsoes_restaurant_tree))
print(classification_report(y_test_restaurant, previsoes_restaurant_tree, zero_division=0))


0.954
                                   precision    recall  f1-score   support

                        Adams Ltd       1.00      1.00      1.00         5
                        Allen Inc       1.00      1.00      1.00         7
      Alvarez, Hudson and Hawkins       1.00      1.00      1.00         2
                   Anderson Group       1.00      1.00      1.00         4
     Anderson, Hawkins and Conley       1.00      1.00      1.00         2
                 Anderson-Oconnor       1.00      1.00      1.00         8
                 Andrews and Sons       1.00      1.00      1.00         6
                 Anthony-Alvarado       1.00      1.00      1.00         7
       Arellano, Lane and Roberts       1.00      1.00      1.00         4
                 Armstrong-Martin       1.00      1.00      1.00         6
                        Avila Inc       1.00      1.00      1.00         4
   Ayala, Hernandez and Maldonado       1.00      1.00      1.00         4
                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# RANDOM FOREST

In [122]:
# Reload the label-encoded train/test split for the random forest section.
with open("pickles/restaurant_label_encoder.pkl", "rb") as split_file:
    (
        X_train_restaurant,
        y_train_restaurant,
        X_test_restaurant,
        y_test_restaurant,
    ) = pickle.load(split_file)

In [123]:
# Fit a 100-tree, entropy-based random forest on the training split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and feature subsets, and therefore the scores, are reproducible.
restaurant_forest = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=42)
restaurant_forest.fit(X_train_restaurant, y_train_restaurant)

In [124]:
# Predict restaurant names for the test split with the fitted random forest.
previsoes_restaurant_forest = restaurant_forest.predict(X_test_restaurant)
print(previsoes_restaurant_forest)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [125]:
# Show the true test labels for visual comparison with the forest predictions.
print(y_test_restaurant)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [126]:
# Overall accuracy and per-class precision / recall / F1 for the random forest.
# zero_division=0 reports 0 for classes whose metric is undefined, as the
# default does, but without emitting an UndefinedMetricWarning for each one.
print(accuracy_score(y_test_restaurant, previsoes_restaurant_forest))
print(classification_report(y_test_restaurant, previsoes_restaurant_forest, zero_division=0))

0.9535
                                   precision    recall  f1-score   support

                        Adams Ltd       1.00      1.00      1.00         5
                        Allen Inc       1.00      1.00      1.00         7
      Alvarez, Hudson and Hawkins       1.00      1.00      1.00         2
                   Anderson Group       1.00      1.00      1.00         4
     Anderson, Hawkins and Conley       1.00      1.00      1.00         2
                 Anderson-Oconnor       1.00      1.00      1.00         8
                 Andrews and Sons       1.00      1.00      1.00         6
                 Anthony-Alvarado       1.00      1.00      1.00         7
       Arellano, Lane and Roberts       1.00      1.00      1.00         4
                 Armstrong-Martin       1.00      1.00      1.00         6
                        Avila Inc       1.00      1.00      1.00         4
   Ayala, Hernandez and Maldonado       1.00      1.00      1.00         4
                 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# KNN


In [None]:
# Build the standardized split used by KNN from the label-encoded split that
# this notebook writes, rather than reading
# "pickles/restaurant_standard_scale.pkl", which no cell visible in this
# notebook creates (a fresh Restart & Run All would fail on it).
# NOTE: pickle.load can execute arbitrary code; only load files produced locally.
with open("pickles/restaurant_label_encoder.pkl", "rb") as f:
    X_train_restaurant, y_train_restaurant, X_test_restaurant, y_test_restaurant = pickle.load(f)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows, so test-set statistics do not leak into the features.
restaurant_knn_scaler = StandardScaler()
X_train_restaurant = restaurant_knn_scaler.fit_transform(X_train_restaurant.astype(float))
X_test_restaurant = restaurant_knn_scaler.transform(X_test_restaurant.astype(float))


[[ 0.59173044  0.59093717  1.31878872  0.73774636]
 [-0.72439599 -1.58567709 -1.32072436 -1.39139176]
 [ 1.46914806 -1.04152353 -0.44088667  1.44745907]
 ...
 [ 0.15302163  0.18282199 -1.32072436  0.73774636]
 [ 0.59173044  0.86301395 -0.44088667  0.73774636]
 [ 1.03043925  1.40716751 -1.32072436 -0.68167905]]


In [130]:
# Fit a 5-nearest-neighbours classifier on the training split.
restaurant_knn = KNeighborsClassifier(n_neighbors=5)
restaurant_knn.fit(X_train_restaurant, y_train_restaurant)

In [131]:
# Predict restaurant names for the test split with the fitted KNN model.
previsoes_restaurant_knn = restaurant_knn.predict(X_test_restaurant)
print(previsoes_restaurant_knn)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Porter, Anderson and Gilmore']


In [132]:
# Show the true test labels for visual comparison with the KNN predictions.
print(y_test_restaurant)

['Morgan, Wilson and Moore' 'Hernandez, Spencer and Howard'
 'Christensen-Perkins' ... 'Snyder, Rodriguez and Luna'
 'Blankenship, Lee and Cooper' 'Harris, Jackson and Donaldson']


In [134]:
# Overall accuracy and per-class precision / recall / F1 for KNN.
# zero_division=0 reports 0 for classes whose metric is undefined, as the
# default does, but without emitting an UndefinedMetricWarning for each one.
print(accuracy_score(y_test_restaurant, previsoes_restaurant_knn))
print(classification_report(y_test_restaurant, previsoes_restaurant_knn, zero_division=0))

0.9525
                                   precision    recall  f1-score   support

                        Adams Ltd       1.00      1.00      1.00         5
                        Allen Inc       1.00      1.00      1.00         7
      Alvarez, Hudson and Hawkins       1.00      1.00      1.00         2
                   Anderson Group       1.00      1.00      1.00         4
     Anderson, Hawkins and Conley       1.00      1.00      1.00         2
                 Anderson-Oconnor       1.00      1.00      1.00         8
                 Andrews and Sons       1.00      1.00      1.00         6
                 Anthony-Alvarado       1.00      1.00      1.00         7
       Arellano, Lane and Roberts       1.00      1.00      1.00         4
                 Armstrong-Martin       1.00      1.00      1.00         6
                        Avila Inc       1.00      1.00      1.00         4
   Ayala, Hernandez and Maldonado       1.00      1.00      1.00         4
                 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
