In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Read the CSV that holds the user rating together with the selected attribute polarities
row_df = pd.read_csv(filepath_or_buffer='/path/to/dataWithPolarities.csv')

In [3]:
# Columns carried into the analysis: the rating plus the four attribute polarities
selected_columns = [
    'StandardRating',
    'pricePolarity',
    'foodPolarity',
    'locationPolarity',
    'servicePolarity',
]
# Working frame restricted to exactly those columns (row_df itself is left untouched)
df = row_df[selected_columns]

### Correlation with Pearson

In [4]:
import pandas as pd
from scipy.stats import pearsonr
from itertools import combinations

# Function to calculate correlations based on the input (uni, bi, tri, or quad)
def calculate_correlations(df, input_type):
    """Pearson correlation of 'StandardRating' with sums of attribute columns.

    Every column of ``df`` other than 'StandardRating' is treated as an
    attribute. For each combination of k attributes (k = 1 for 'uni', 2 for
    'bi', 3 for 'tri', 4 for 'quad') the chosen columns are added element-wise
    and the Pearson r between that sum and 'StandardRating' is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'StandardRating' column plus the attribute columns.
    input_type : str
        One of 'uni', 'bi', 'tri' or 'quad'.

    Returns
    -------
    dict
        Maps labels of the form 'StandardRating vs A AND B' to the Pearson
        coefficient, in the order ``itertools.combinations`` yields the
        attribute groups. Empty when ``df`` has fewer than k attribute columns.

    Raises
    ------
    ValueError
        If ``input_type`` is not one of the four supported values. Earlier
        versions silently treated any unknown value as 'quad'.
    KeyError
        If ``df`` has no 'StandardRating' column.
    """
    group_sizes = {'uni': 1, 'bi': 2, 'tri': 3, 'quad': 4}
    if input_type not in group_sizes:
        raise ValueError(
            f"input_type must be one of {list(group_sizes)}, got {input_type!r}"
        )

    rating = df['StandardRating']
    # Only the labels are needed, so avoid copying the frame just to iterate it.
    attribute_names = [col for col in df.columns if col != 'StandardRating']

    correlations = {}
    for group in combinations(attribute_names, group_sizes[input_type]):
        # Add the columns left to right, matching df[a] + df[b] + ... exactly.
        combined = df[group[0]]
        for name in group[1:]:
            combined = combined + df[name]
        correlation, _ = pearsonr(rating, combined)
        label = 'StandardRating vs ' + ' AND '.join(f'{name}' for name in group)
        correlations[label] = correlation

    return correlations

# Ask which grouping to evaluate:
#   uni -> each attribute alone, bi -> pairs, tri -> triples, quad -> all four together
input_type = input("Enter 'uni', 'bi', 'tri', or 'quad': ")

if input_type in ('uni', 'bi', 'tri', 'quad'):
    result = calculate_correlations(df, input_type)
    print("Correlations:")
    # One line per attribute group, coefficient rounded to four decimals
    for label, coefficient in result.items():
        print(f"{label}: {coefficient:.4f}")
else:
    print("Invalid input type.")

Enter 'uni', 'bi', 'tri', or 'quad': uni
Correlations:
StandardRating vs pricePolarity: 0.1187
StandardRating vs foodPolarity: 0.2960
StandardRating vs locationPolarity: 0.4618
StandardRating vs servicePolarity: 0.2878


### Correlation with linear regression (slope coefficients)

In [5]:
# Turn each column into an (n_samples, 1) array, the 2-D layout that
# LinearRegression.fit is given in the cells below
stdPol, pricePol, foodPol, locationPol, servicePol = (
    np.array(df[column]).reshape(-1, 1)
    for column in (
        'StandardRating',
        'pricePolarity',
        'foodPolarity',
        'locationPolarity',
        'servicePolarity',
    )
)

In [6]:
# Regress the original rating (y = stdPol) on food polarity alone (X = foodPol)
model = LinearRegression().fit(X=foodPol, y=stdPol)
# coef_ holds the fitted slope; with one feature and a column-vector target it is indexed [0][0] later
foodCorrelation = model.coef_

In [7]:
# Regress the original rating (y = stdPol) on price polarity alone (X = pricePol)
model = LinearRegression().fit(X=pricePol, y=stdPol)
# Keep the fitted slope for the summary printed at the end
priceCorrelation = model.coef_

In [8]:
# Regress the original rating (y = stdPol) on location polarity alone (X = locationPol)
model = LinearRegression().fit(X=locationPol, y=stdPol)
# Keep the fitted slope for the summary printed at the end
locationCorrelation = model.coef_

In [9]:
# Regress the original rating (y = stdPol) on service polarity alone (X = servicePol)
model = LinearRegression().fit(X=servicePol, y=stdPol)
# Keep the fitted slope for the summary printed at the end
serviceCorrelation = model.coef_

In [10]:
# Summary of the single-feature regressions: each value is the lone entry of
# the corresponding model's coef_ array (the fitted slope)
print(
    'Attribute correlation with user rating using linear regression : ',
    f'Food Correlation : {foodCorrelation[0, 0]}',
    f'Price Correlation : {priceCorrelation[0, 0]}',
    f'Location Correlation : {locationCorrelation[0, 0]}',
    f'Service Correlation : {serviceCorrelation[0, 0]}',
    sep='\n',
)

Attribute correlation with user rating using linear regression : 
Food Correlation : 0.5262458842059128
Price Correlation : 0.4446202294061949
Location Correlation : 0.6188126762846683
Service Correlation : 0.5534971204178304
