In [416]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from surprise import SVD, Dataset, Reader
from surprise import accuracy
from sentence_transformers import SentenceTransformer

In [417]:
Reviews = pd.read_excel('Evanston Restaurant Reviews.xlsx', sheet_name='Reviews')
Restaurants = pd.read_excel('Evanston Restaurant Reviews.xlsx', sheet_name='Restaurants')

In [418]:
Reviews['Restaurant Name'] = Reviews['Restaurant Name'].replace("Clare's Korner", "Claire's Korner")
Reviews['Restaurant Name'] = Reviews['Restaurant Name'].replace('Cross Rhodes ', 'Cross Rhodes')
Reviews['Restaurant Name'] = Reviews['Restaurant Name'].replace("Kilwin's ", "Kilwin's")
Reviews['Restaurant Name'] = Reviews['Restaurant Name'].replace('Todoroki Sushi ', 'Todoroki Sushi')
Reviews = Reviews.rename(columns={'Height (in)': 'Height (cm)'})

Restaurants['Restaurant Name'] = Restaurants['Restaurant Name'].replace('Shinsen ', 'Shinsen')

In [419]:
# Convert both columns to sets for comparison
set_restaurants = set(Restaurants['Restaurant Name'])
set_reviews = set(Reviews['Restaurant Name'])

# Find names in set_restaurants that are not in set_reviews
differences_restaurants_not_in_reviews = set_restaurants - set_reviews

# Find names in set_reviews that are not in set_restaurants
differences_reviews_not_in_restaurants = set_reviews - set_restaurants

# Print the differences
print("Names in Restaurants not in Reviews:")
print(differences_restaurants_not_in_reviews)

print("\nNames in Reviews not in Restaurants:")
print(differences_reviews_not_in_restaurants)

Names in Restaurants not in Reviews:
set()

Names in Reviews not in Restaurants:
{'Todoroki Sushi', 'La Principal', 'World Market'}


In [420]:
restaurants_to_remove = {'World Market', 'Todoroki Sushi', 'La Principal'}

Reviews = Reviews[~Reviews['Restaurant Name'].isin(restaurants_to_remove)]

In [421]:
Reviews['Marital Status'] = Reviews['Marital Status'].str.strip().str.title()
Reviews['Marital Status'] = Reviews['Marital Status'].replace({'Widow': 'Widowed'})

In [422]:
Evanston_Restaurant_Reviews = pd.merge(Reviews, Restaurants, on='Restaurant Name', how='left')
Evanston_Restaurant_Reviews

Unnamed: 0,Reviewer Name,Restaurant Name,Rating,Review Text,Date of Review,Birth Year,Marital Status,Has Children?,Vegetarian?,Weight (lb),Height (cm),Average Amount Spent,Preferred Mode of Transport,Northwestern Student?,Cuisine,Latitude,Longitude,Average Cost,Open After 8pm?,Brief Description
0,Dan B,Lao Sze Chuan,1,Really disappointed for the dishes…. Not athle...,2022-08-10 00:00:00,1942.0,Single,No,,234.0,161.0,Medium,Car Owner,No,Chinese,42.048462,-87.679476,20,Yes,"Modern Chinese mainstay, known for an extensiv..."
1,A B,Barn Steakhouse,5,Excellent meal in a warm atmosphere! The space...,2022-11-22 00:00:00,1998.0,Single,No,,,,Medium,On Foot,No,American,42.048225,-87.68611,50,Yes,"Chic spot known for New American fare, steaks ..."
2,A B,Brothers K Coffeehouse,4,,2022-08-18 00:00:00,1998.0,Single,No,,,,Medium,On Foot,No,Coffee,42.033913,-87.677913,13,No,Buzzy coffee shop known for Chicago-roasted Me...
3,A B,Clarkes Off Campus,5,Best burger in Evanston,2022-10-30 00:00:00,1998.0,Single,No,,,,Medium,On Foot,No,American,42.046823,-87.682366,20,No,Simple outpost of small local chain known for ...
4,A B,Edzo's Burger Shop,5,Second best burger in Evanston,2022-11-28 00:00:00,1998.0,Single,No,,,,Medium,On Foot,No,Burgers,42.046334,-87.680662,13,No,Bustling burger joint known for creative toppi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,Sergey Levinsky,Shinsen,5,,2023-05-10 00:00:00,1965.0,Single,Yes,,180.0,155.0,Medium,Car Owner,No,Japanese,42.042173,-87.680221,20,Yes,"BYOB spot for a variety of noodle dishes, sush..."
1455,Marc Rappapoprt,Shinsen,5,We spent new years eve here. This place was aw...,2023-01-12 00:00:00,1969.0,Married,Yes,,120.0,192.0,Medium,Car Owner,No,Japanese,42.042173,-87.680221,20,Yes,"BYOB spot for a variety of noodle dishes, sush..."
1456,Michael Espinola,Shinsen,2,,2021-11-21 00:00:00,1949.0,Married,Yes,,116.0,192.0,Medium,Car Owner,No,Japanese,42.042173,-87.680221,20,Yes,"BYOB spot for a variety of noodle dishes, sush..."
1457,Garreth Lin,Shinsen,3,,2023-12-20 00:00:00,1952.0,Married,Yes,,139.0,150.0,Medium,Car Owner,No,Japanese,42.042173,-87.680221,20,Yes,"BYOB spot for a variety of noodle dishes, sush..."


# 1. Create a user feature matrix

In [423]:
demographic = Reviews.copy()

# Remove columns not related to demographics
user_data = demographic.drop(columns=['Restaurant Name', 'Rating', 'Review Text', 'Date of Review', 'Average Amount Spent'])

# Remove duplicate records to ensure unique user data
unique_users = user_data.drop_duplicates(subset='Reviewer Name')

# Reset index without dropping it to keep the 'Reviewer Name' after duplicates are removed
unique_users.reset_index(inplace=True)

# Fill missing values for numeric features with the mean
numerical_columns = ['Birth Year', 'Weight (lb)', 'Height (cm)']
for column in numerical_columns:
    unique_users[column] = unique_users[column].fillna(unique_users[column].mean())

# One-hot encoding of categorical features
categorical_columns = ['Marital Status', 'Has Children?', 'Vegetarian?', 
                       'Preferred Mode of Transport', 'Northwestern Student?']
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(unique_users[categorical_columns]).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Merge the numeric data with the one-hot encoded categorical data
encoded_users = pd.DataFrame(encoded_data, columns=encoded_feature_names)
feature_matrix = pd.concat([unique_users[numerical_columns].reset_index(drop=True), encoded_users], axis=1)

# Add the 'Reviewer Name' to the feature matrix and set it as the index
feature_matrix_with_names = pd.concat([unique_users[['Reviewer Name']], feature_matrix], axis=1)
feature_matrix_with_names.set_index('Reviewer Name', inplace=True)

print(f"Total number of unique user vectors: {feature_matrix_with_names.shape[0]}")
print(f"Dimensionality of each vector: {feature_matrix_with_names.shape[1]}")

Total number of unique user vectors: 1066
Dimensionality of each vector: 19


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_users[column] = unique_users[column].fillna(unique_users[column].mean())


In [424]:
feature_matrix_with_names

Unnamed: 0_level_0,Birth Year,Weight (lb),Height (cm),Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Marital Status_nan,Has Children?_No,Has Children?_Yes,Has Children?_nan,Vegetarian?_No,Vegetarian?_Yes,Vegetarian?_nan,Preferred Mode of Transport_Car Owner,Preferred Mode of Transport_On Foot,Preferred Mode of Transport_Public Transit,Preferred Mode of Transport_nan,Northwestern Student?_No,Northwestern Student?_Yes
Reviewer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Dan B,1942.0,234.000000,161.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
A B,1998.0,205.478927,172.929658,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
Abigail,2001.0,205.478927,172.929658,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
Actual P.,1994.0,243.000000,162.000000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
Adam Gibbons,1999.0,205.478927,172.929658,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sergey Levinsky,1965.0,180.000000,155.000000,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Marc Rappapoprt,1969.0,120.000000,192.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Michael Espinola,1949.0,116.000000,192.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Garreth Lin,1952.0,139.000000,150.000000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


# 2.

In [425]:
similarity_matrix2 = cosine_similarity(feature_matrix_with_names)
similarity_df2 = pd.DataFrame(similarity_matrix2, index=feature_matrix_with_names.index, columns=feature_matrix_with_names.index)

similarity_df2

Reviewer Name,Dan B,A B,Abigail,Actual P.,Adam Gibbons,Agnes Zuber,Al Brunner,Alan Amato,Albert Bechtold,Albert Genova,...,Stephanie Moore,Stephan Carlson,Elizabeth Carlson,Bethany Carlson,Edith Williams,Sergey Levinsky,Marc Rappapoprt,Michael Espinola,Garreth Lin,Shuo Li
Reviewer Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Dan B,1.000000,0.999842,0.999840,0.999997,0.999841,0.998665,0.999968,0.999998,0.999984,0.999595,...,0.999887,0.997928,0.999399,0.999776,0.999765,0.999587,0.998160,0.998060,0.998799,0.999296
A B,0.999842,1.000000,1.000000,0.999810,1.000000,0.999388,0.999836,0.999848,0.999831,0.999866,...,0.999463,0.998913,0.998774,0.999987,0.999954,0.999910,0.999079,0.999008,0.999466,0.999789
Abigail,0.999840,1.000000,1.000000,0.999808,1.000000,0.999393,0.999833,0.999846,0.999830,0.999865,...,0.999459,0.998918,0.998765,0.999987,0.999955,0.999912,0.999084,0.999013,0.999472,0.999793
Actual P.,0.999997,0.999810,0.999808,1.000000,0.999809,0.998600,0.999952,0.999997,0.999988,0.999532,...,0.999911,0.997815,0.999410,0.999734,0.999742,0.999553,0.998053,0.997950,0.998740,0.999243
Adam Gibbons,0.999841,1.000000,1.000000,0.999809,1.000000,0.999390,0.999835,0.999847,0.999831,0.999866,...,0.999461,0.998915,0.998771,0.999987,0.999954,0.999911,0.999080,0.999009,0.999468,0.999791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sergey Levinsky,0.999587,0.999910,0.999912,0.999553,0.999911,0.999734,0.999519,0.999616,0.999634,0.999691,...,0.999082,0.999227,0.998058,0.999886,0.999971,1.000000,0.999365,0.999302,0.999794,0.999952
Marc Rappapoprt,0.998160,0.999079,0.999084,0.998053,0.999080,0.999794,0.998235,0.998181,0.998151,0.999254,...,0.997138,0.999992,0.996021,0.999203,0.999087,0.999365,1.000000,0.999999,0.999738,0.999659
Michael Espinola,0.998060,0.999008,0.999013,0.997950,0.999009,0.999764,0.998143,0.998081,0.998049,0.999204,...,0.997014,0.999997,0.995895,0.999140,0.999012,0.999302,0.999999,1.000000,0.999701,0.999612
Garreth Lin,0.998799,0.999466,0.999472,0.998740,0.999468,0.999994,0.998712,0.998849,0.998886,0.999280,...,0.998012,0.999666,0.996626,0.999475,0.999618,0.999794,0.999738,0.999701,1.000000,0.999922


In [426]:
def recommend_restaurants_demographic(user_name, similarity_df2, reviews_data, top_n=3):
    # Get similarity scores for the user and sort them
    similarities = similarity_df2.loc[user_name].sort_values(ascending=False)
    
    # Find the top N most similar users
    most_similar_users = similarities.index[1:top_n+1]  # Exclude the first one (the user itself)
    
    # Dictionary to hold the most similar users and their favorite restaurants
    recommendations = {}
    
    # Iterate through the most similar users and find their favorite restaurant
    for similar_user in most_similar_users:
        # Get this user's highest-rated restaurant
        user_reviews = reviews_data[reviews_data['Reviewer Name'] == similar_user]
        highest_rated_restaurant = user_reviews.loc[user_reviews['Rating'].idxmax(), 'Restaurant Name']
        recommendations[similar_user] = highest_rated_restaurant
    
    return recommendations

recommendations = recommend_restaurants_demographic('Timothy Mace', similarity_df2, Evanston_Restaurant_Reviews, top_n=3)

for user, restaurant in recommendations.items():
    print(f"{user}'s favorite restaurants is {restaurant}")

Enid Egan's favorite restaurants is Burger King
Anthony Grieco's favorite restaurants is Union Pizzeria
NU Student 12's favorite restaurants is Hokkaido Ramen


# 3.

In [427]:
data3 = Evanston_Restaurant_Reviews.copy()
pivot_table = data3.pivot_table(index='Reviewer Name', columns='Restaurant Name', values='Rating', aggfunc='first')

#pivot_table

In [428]:
filled_table = pivot_table.copy() 

for reviewer in filled_table.index:
    for restaurant in filled_table.columns:
        if pd.isna(filled_table.loc[reviewer, restaurant]):
            # Get the 5 users with the highest similarity
            top_5_similar = similarity_df2.loc[reviewer].nlargest(6).index[1:]

            # Get the ratings of this restaurant from these 5 similar users
            ratings = filled_table.loc[top_5_similar, restaurant]
            
            # Replace NaN score with 3
            ratings = ratings.fillna(3)
            
            # Calculate the average of these ratings
            average_rating = ratings.mean()
            
            # Fill in the original blank ratings with the average
            filled_table.loc[reviewer, restaurant] = average_rating

#filled_table


# 4.

In [429]:
similarity_matrix4 = cosine_similarity(filled_table)
similarity_df4 = pd.DataFrame(similarity_matrix4, index=filled_table.index, columns=filled_table.index)

#similarity_df4

In [430]:
def recommend_restaurant_for_similar_user(target_user, similarity_df, reviews_data):
    similarities = similarity_df.loc[target_user].drop(target_user, errors='ignore')
    most_similar_user = similarities.idxmax()

    user_reviews = reviews_data.loc[most_similar_user]

    highest_rated_restaurant = user_reviews.idxmax()
    highest_rating = user_reviews.max()

    print(f"The most similar user to {target_user} is {most_similar_user}.")
    print(f"{most_similar_user}'s favorite restaurant is {highest_rated_restaurant} with a rating of {highest_rating}.")

recommend_restaurant_for_similar_user('Sarah Belle', similarity_df4, filled_table)


The most similar user to Sarah Belle is Juan Rogers.
Juan Rogers's favorite restaurant is Kuni's Japanese Restaurant with a rating of 5.0.


# 5

In [449]:
data5 = Evanston_Restaurant_Reviews.copy()

# Fill missing values for numeric features with the mean
numerical_columns = ['Birth Year', 'Weight (lb)', 'Height (cm)']
for column in numerical_columns:
    data5[column] = data5[column].fillna(data5[column].mean())

# Normalized average consumption
scaler = StandardScaler()
data5[numerical_columns] = scaler.fit_transform(data5[numerical_columns])

# One-hot encoding of categorical features
categorical_columns = ['Marital Status', 'Has Children?', 'Vegetarian?', 
                       'Preferred Mode of Transport', 'Northwestern Student?', 'Cuisine']
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data5[categorical_columns]).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Merge the numeric data with the one-hot encoded categorical data
encoded_users = pd.DataFrame(encoded_data, columns=encoded_feature_names)
feature_matrix = pd.concat([data5[numerical_columns].reset_index(drop=True), encoded_users], axis=1)

# Add the 'Reviewer Name' to the feature matrix and set it as the index
feature_matrix_with_names = pd.concat([data5[['Reviewer Name']], feature_matrix], axis=1)
feature_matrix_with_names['Rating'] = data5['Rating']
feature_matrix_with_names.set_index('Reviewer Name', inplace=True)

#feature_matrix_with_names


In [454]:
X = feature_matrix_with_names.drop('Rating', axis=1)
y = feature_matrix_with_names['Rating']

# Divide the data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)


print(f"Linear Regression R²: {r2_score(y_test, y_pred)}")
print(f"Linear Regression MSE: {mean_squared_error(y_test, y_pred, squared=False)}")


Linear Regression R²: 0.04272582169303907
Linear Regression MSE: 1.4161487486136173




# 6

### Evaluation model

In [457]:
#np.random.seed(42)
random_index = np.random.randint(0, len(X_test))
selected_features = X_test.iloc[random_index]
selected_actual_rating = y_test.iloc[random_index]

# Use the model to predict ratings
selected_predicted_rating = model.predict([selected_features])

# Output comparison results
print(f"Selected: {selected_features}")
print(f"Actual rating: {selected_actual_rating}")
print(f"Predicted score: {selected_predicted_rating[0]}")


Selected: Birth Year                                   -0.562436
Weight (lb)                                  -0.119718
Height (cm)                                  -0.657007
Marital Status_Married                        1.000000
Marital Status_Single                         0.000000
Marital Status_Widowed                        0.000000
Marital Status_nan                            0.000000
Has Children?_No                              0.000000
Has Children?_Yes                             1.000000
Has Children?_nan                             0.000000
Vegetarian?_No                                1.000000
Vegetarian?_Yes                               0.000000
Vegetarian?_nan                               0.000000
Preferred Mode of Transport_Car Owner         1.000000
Preferred Mode of Transport_On Foot           0.000000
Preferred Mode of Transport_Public Transit    0.000000
Preferred Mode of Transport_nan               0.000000
Northwestern Student?_No                      1.000000




### Check the significance of variables

In [434]:
import statsmodels.api as sm

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

# Summary model
results_summary = model.summary()

# Find p-value
p_values = model.pvalues

# Filter variables with p-value greater than 0.05
non_significant_vars = p_values[p_values > 0.05]

print("Variables with p-value > 0.05:")
#print(non_significant_vars)
print(len(non_significant_vars))

Variables with p-value > 0.05:
27


# 7

In [435]:
X = feature_matrix_with_names.drop('Rating', axis=1)
y = feature_matrix_with_names['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

y_pred_lasso = lasso.predict(X_test)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso R²: {r2_lasso}")
print(f"Lasso MSE: {mse_lasso}")


Lasso R²: 0.061471575884063046
Lasso MSE: 1.9662051606135937


In [458]:
lasso_coefs = pd.DataFrame({'Feature': X.columns, 'Coefficient': lasso.coef_})
feature_select = lasso_coefs[lasso_coefs['Coefficient'] != 0]

print(feature_select.sort_values(by='Coefficient', key=abs, ascending=False))



                                  Feature  Coefficient
24                        Cuisine_Burgers    -1.094282
6                      Marital Status_nan    -0.849577
10                         Vegetarian?_No    -0.518954
41                           Cuisine_Thai    -0.424039
19                       Cuisine_American    -0.345288
27                         Cuisine_Coffee     0.278833
30                        Cuisine_Italian    -0.265019
34                  Cuisine_Mediterranean    -0.212184
7                        Has Children?_No     0.201228
5                  Marital Status_Widowed    -0.176639
3                  Marital Status_Married     0.157478
4                   Marital Status_Single     0.133472
39                    Cuisine_South Asian    -0.114820
26                      Cuisine_Chocolate     0.078841
2                             Height (cm)    -0.067986
23                     Cuisine_Bubble Tea     0.059871
13  Preferred Mode of Transport_Car Owner     0.057567
11        

In [437]:

print('Number of Coefficient is not zero', len(lasso_coefs[lasso_coefs['Coefficient'] != 0]))

print('Number of Coefficient is zero', len(lasso_coefs[lasso_coefs['Coefficient'] == 0]))


Number of Coefficient is not zero 19
Number of Coefficient is zero 24


# 8

In [459]:
coffee_data = feature_matrix_with_names[feature_matrix_with_names['Cuisine_Coffee'] == 1]
#coffee_data

In [439]:
X = coffee_data.drop(columns=['Rating', 'Cuisine_Coffee'])
y = coffee_data['Rating']

# Divide the data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Coffee Linear Regression R²: {r2}")
print(f"Coffee Linear Regression MSE: {mse}")

Coffee Linear Regression R²: 0.16007417827490733
Coffee Linear Regression MSE: 1.4412928301791532


In [440]:
# model coefficients
coefficients = pd.DataFrame(model.coef_, X.columns, columns=['Coefficient'])
print(coefficients.sort_values(by='Coefficient', key=abs, ascending=False))

                                            Coefficient
Has Children?_Yes                             -0.743550
Preferred Mode of Transport_nan                0.493754
Has Children?_No                               0.435354
Vegetarian?_nan                               -0.361062
Marital Status_nan                             0.308196
Has Children?_nan                              0.308196
Northwestern Student?_No                      -0.300820
Northwestern Student?_Yes                      0.300820
Preferred Mode of Transport_On Foot           -0.288127
Vegetarian?_Yes                                0.271343
Marital Status_Single                         -0.218292
Weight (lb)                                    0.171519
Preferred Mode of Transport_Public Transit    -0.156568
Birth Year                                    -0.134120
Height (cm)                                    0.102063
Marital Status_Married                        -0.089904
Vegetarian?_No                                 0

# 10

In [441]:
pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [460]:
data10 = Evanston_Restaurant_Reviews.copy()

review_templates = {
    1: "Very bad experience, not recommended.",
    2: "Below average, was expecting something better.",
    3: "Average experience, nothing special.",
    4: "Good experience, would recommend.",
    5: "Excellent! Will definitely come back!"
}

data10['Review Text'] = data10.apply(
    lambda row: review_templates[row['Rating']] if pd.isnull(row['Review Text']) else row['Review Text'],
    axis=1
)


model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = model.encode(data10['Review Text'].tolist(), show_progress_bar=True)
#text_embeddings

Batches: 100%|██████████| 46/46 [00:01<00:00, 25.00it/s]


In [468]:
X_train, X_test, y_train, y_test = train_test_split(text_embeddings, data10['Rating'], test_size=0.2, , random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Text Linear Regression R²: {r2}")
print(f"Text Linear Regression MSE: {mse}")

Text Linear Regression R²: 0.5513855703791211
Text Linear Regression MSE: 1.000122270126374


In [471]:
username_counts = Evanston_Restaurant_Reviews['Reviewer Name'].value_counts()

username_counts.head(10)

Reviewer Name
Jillan Dames         36
Kris G               32
Erin Morrison        15
Castor Z             15
Steven Rusert        13
Jeanie B             12
Olya S               12
Dennis Folse         12
Solomon M            12
Stephanie Maxwell    11
Name: count, dtype: int64