In [2]:
import pandas as pd
from scipy.stats import ttest_ind

In [3]:


# Load your dataset
df_chicago = pd.read_csv('~/desktop/Project_7/Chicago.csv')
df_san_diego = pd.read_csv('~/desktop/Project_7/San Diego.csv')
df_san_francisco = pd.read_csv('~/desktop/Project_7/San Francisco.csv')
df_new_york = pd.read_csv('~/desktop/Project_7/New York.csv')
df_los_angeles = pd.read_csv('~/desktop/Project_7/Los Angeles.csv')



In [5]:
# Set significance level
alpha = 0.05

# Filter data for Italian and American restaurants
italian_food = df_san_diego [df_san_diego ['food_type'] == 'Italian']['food']
american_food = df_san_diego [df_san_diego ['food_type'] == 'American']['food']

# Check the number of restaurants in each group
num_italian = italian_food.count()
num_american = american_food.count()
print(f'Number of Italian restaurants: {num_italian}')
print(f'Number of American restaurants: {num_american}')

# Calculate the average food ratings
avg_italian_food = italian_food.mean()
avg_american_food = american_food.mean()
print(f'Average food rating for Italian restaurants: {avg_italian_food:.2f}')
print(f'Average food rating for American restaurants: {avg_american_food:.2f}')


t_stat, p_value = ttest_ind(italian_food, american_food, equal_var=True)


# Check significance
if p_value < alpha:
    result = "significant"
else:
    result = "not significant"

# Print results
print(f'T-statistic: {t_stat:.2f}, P-value: {p_value:.4f}')
print(f'There is a {result} difference in food ratings between Italian and American restaurants.')


Number of Italian restaurants: 40
Number of American restaurants: 59
Average food rating for Italian restaurants: 4.57
Average food rating for American restaurants: 4.35
T-statistic: 3.36, P-value: 0.0011
There is a significant difference in food ratings between Italian and American restaurants.


In [6]:
#Statistical Comparison of Ambience Ratings Based on Review Counts

# Create two groups based on review count
high_reviews = df_chicago[df_chicago['number_of_reviews'] > 500]['ambience']
low_reviews = df_chicago[df_chicago['number_of_reviews'] <= 500]['ambience']

# Count the number of restaurants in each group
count_high_reviews = high_reviews.count()
count_low_reviews = low_reviews.count()
print(f'Number of restaurants with high reviews (>500): {count_high_reviews}')
print(f'Number of restaurants with low reviews (<=500): {count_low_reviews}')

# Calculate the average ambience for each group
avg_high_reviews = high_reviews.mean()
avg_low_reviews = low_reviews.mean()

# Print the average ambience scores
print(f'Average ambience for restaurants with high reviews (>500): {avg_high_reviews:.2f}')
print(f'Average ambience for restaurants with low reviews (<=500): {avg_low_reviews:.2f}')

# Perform the t-test
t_stat, p_value = ttest_ind(high_reviews, low_reviews, equal_var=True)

# Check for significance
if p_value < alpha:
    significance = "significant"
else:
    significance = "not significant"
    
# Print the t-test results
print(f'T-statistic: {t_stat:.4f}, P-value: {p_value:.8f}')
print(f'The difference in ambience ratings is {significance}.')



Number of restaurants with high reviews (>500): 153
Number of restaurants with low reviews (<=500): 168
Average ambience for restaurants with high reviews (>500): 4.48
Average ambience for restaurants with low reviews (<=500): 4.34
T-statistic: 4.3463, P-value: 0.00001864
The difference in ambience ratings is significant.


In [7]:
# Comparative Analysis of Restaurant Value Based on Ambience Ratings
high_ambience_value = df_chicago[df_chicago['ambience'] > 4.5]['value']
low_ambience_value = df_chicago[df_chicago['ambience'] <= 4.5]['value']

# Calculate the average and median value for each group
avg_high_ambience_value = high_ambience_value.mean()
avg_low_ambience_value = low_ambience_value.mean()

# Print the counts and averages
print(f'Number of restaurants with high ambience (> 4.5): {high_ambience_value.count()}')
print(f'Number of restaurants with low ambience (<= 4.5): {low_ambience_value.count()}')
print(f'Average value for restaurants with high ambience: {avg_high_ambience_value:.2f}')
print(f'Average value for restaurants with low ambience: {avg_low_ambience_value:.2f}')

# Perform the t-test
t_stat, p_value = ttest_ind(high_ambience_value, low_ambience_value, equal_var=True)

# Print the t-test results
print(f'T-statistic: {t_stat:.2f}, P-value: {p_value:.8f}')


Number of restaurants with high ambience (> 4.5): 128
Number of restaurants with low ambience (<= 4.5): 193
Average value for restaurants with high ambience: 4.38
Average value for restaurants with low ambience: 4.00
T-statistic: 7.36, P-value: 0.00000000


In [36]:
# Extract service columns for San Diego and San Francisco
sd_service = df_san_diego['service']
sf_service = df_san_francisco['service']

# Calculate the average service ratings
avg_sd_service = sd_service.mean()
avg_sf_service = sf_service.mean()

# Perform t-test
t_stat, p_value = ttest_ind(sd_service, sf_service, equal_var=True)

# Print results
print(f"Average service rating for San Diego: {avg_sd_service:.2f}")
print(f"Average service rating for San Francisco: {avg_sf_service:.2f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")


Average service rating for San Diego: 4.45
Average service rating for San Francisco: 4.51
T-statistic: -2.5455
P-value: 0.0110


In [43]:
# Extract rating columns for New York and Los Angeles
ny_rating = df_new_york['rating']
la_rating = df_los_angeles['rating']

# Calculate the average service ratings
avg_ny_rating  = sd_service.mean()
avg_la_rating  = sf_service.mean()

# Perform t-test
t_stat, p_value = ttest_ind(ny_rating , la_rating , equal_var=True)

# Print results
print(f"Average service rating for New York: {avg_ny_rating :.2f}")
print(f"Average service rating for Los Angeles: {avg_la_rating :.2f}")
print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_value:.4f}")


Average service rating for New York: 4.55
Average service rating for Los Angeles: 4.59
T-statistic: -1.9190
P-value: 0.0555


In [50]:
# Extract rating columns for San Francisco and San Diego
sf_rating = df_san_francisco['rating']
sd_rating = df_san_diego['rating']

# Calculate the average ratings
avg_sf_rating  = sf_rating .mean()
avg_sd_rating  = sd_rating .mean()

t_stat, p_value = ttest_ind(sf_rating, sd_rating, equal_var=True)
print(f"Average service rating for San Francisco: {avg_sf_rating:.2f}")
print(f"Average service rating for San Diego: {avg_sd_rating:.2f}")
print("T-statistic:", t_stat, "P-value:", p_value)


Average service rating for San Francisco: 4.53
Average service rating for San Diego: 4.45
T-statistic: 3.657606888653669 P-value: 0.0002664359593113553


In [61]:

# Filter the food ratings based on ambience
high_ambience_food = df_san_diego[df_san_diego['ambience'] >= 4.5]['food']
low_ambience_food = df_san_diego[df_san_diego['ambience'] < 4.5]['food']

# Perform the t-test
t_stat, p_value = ttest_ind(high_ambience_food, low_ambience_food, equal_var=False)

# Calculate averages
high_ambience_avg = high_ambience_food.mean()
low_ambience_avg = low_ambience_food.mean()

# Print results with formatting to two decimal places
print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_value:}")
print(f"Average Food Rating (High Ambience): {high_ambience_avg:.2f}")
print(f"Average Food Rating (Low Ambience): {low_ambience_avg:.2f}")


T-statistic: 9.28
P-value: 1.450348639760867e-16
Average Food Rating (High Ambience): 4.55
Average Food Rating (Low Ambience): 4.20


In [63]:
from scipy import stats
import pandas as pd

# Sample DataFrame (replace this with your actual DataFrame)
# df = pd.read_csv('your_data_file.csv')

# Extract the ratings column
ratings = df_chicago['rating']

# Perform one-sample t-test against the population mean of 4.0
t_statistic, p_value = stats.ttest_1samp(ratings, 4.0)

# Print the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value)

# Interpretation
if p_value < 0.05:
    print("The average rating is significantly different from 4.0.")
else:
    print("The average rating is not significantly different from 4.0.")


T-Statistic: 24.564624713606406
P-Value: 1.0285657523119223e-75
The average rating is significantly different from 4.0.
