In [58]:
import pandas as pd
import string

In [59]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

In [60]:
from sklearn.metrics import classification_report

**For 200 reviews**

In [61]:
# Load the aspect-tag annotations and the sentiment annotations for this run.
# Both CSV files are decoded as ISO-8859-1.
tag_df = pd.read_csv(
    "annotation dataset/small_annotation.csv",
    encoding="ISO-8859-1",
)
sentiment_df = pd.read_csv(
    "annotation dataset/sentiment annotation.csv",
    encoding="ISO-8859-1",
)

In [62]:
from sklearn.utils import resample

def upsample(df):
    """Balance a 0/1-labelled frame by resampling its 0-rows with replacement.

    The label is read from the second column (position 1). Missing values
    anywhere in the frame are replaced by 0 before the split, so a missing
    label counts as class 0. Class 1 is treated as the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column holds the 0/1 label. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The 1-rows followed by the resampled 0-rows. Original index labels
        are kept, so resampled rows repeat labels.
    """
    # Work on a filled copy: an in-place fillna would silently rewrite the
    # caller's frame as a side effect of balancing it.
    filled = df.fillna(0)
    labels = filled.iloc[:, 1]

    # Target size for the resampled class: the sum of the label column, which
    # is the number of 1-rows when the labels are strictly 0/1.
    majority = int(labels.values.sum())

    # Separate majority and minority classes
    df_majority = filled[labels == 1]
    df_minority = filled[labels == 0]

    if df_minority.empty:
        # There are no 0-rows to draw from, so only the 1-rows can be returned.
        return df_majority.copy()

    # Upsample minority class
    df_minority_upsampled = resample(
        df_minority,
        replace=True,        # sample with replacement
        n_samples=majority,  # to match majority class
        random_state=123,    # reproducible results
    )

    # Combine majority class with upsampled minority class
    return pd.concat([df_majority, df_minority_upsampled])

In [63]:
# One frame per aspect: keep the tag rows flagged 1 for that aspect and
# renumber them from 0.
food = tag_df[tag_df['food'].eq(1)].reset_index(drop=True)
ambience = tag_df[tag_df['ambience'].eq(1)].reset_index(drop=True)
service = tag_df[tag_df['service'].eq(1)].reset_index(drop=True)
price = tag_df[tag_df['price'].eq(1)].reset_index(drop=True)

In [64]:
# Punctuation characters; get_text_subset does not currently strip them.
exclude = set(string.punctuation)
def get_text_subset(df):
    """Return the normalised review texts of ``df`` as a list, in row order.

    Each value of the ``'text'`` column has every ``'\\n \\n'`` sequence
    removed and is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    list of str
        One normalised text per row of ``df``.
    """
    # Iterate the column itself instead of looking rows up by the labels
    # 0..len(df)-1, so the result does not depend on how the frame is indexed.
    return [text.replace('\n \n', '').lower() for text in df['text']]

In [65]:
# Aspect names; each one is also a 0/1 column of tag_df.
categories = ['food', 'ambience', 'service', 'price']
def extract_sentiment(category):
    """Collect the sentiment rows of every review tagged with ``category``.

    Reads the module-level frames ``tag_df`` and ``sentiment_df``. For each
    row of ``tag_df`` (in order) whose ``category`` column equals 1, every row
    of ``sentiment_df`` with the same ``review_id`` is appended, in
    ``sentiment_df`` order. The number of collected rows is printed.

    Parameters
    ----------
    category : str
        Name of a column of ``tag_df``.

    Returns
    -------
    list of pandas.Series
        The matching rows of ``sentiment_df``.
    """
    # Index the sentiment rows by review_id once, so each tagged review is a
    # dictionary lookup instead of a scan over the whole sentiment frame.
    positions_by_id = {}
    for position, review_id in enumerate(sentiment_df['review_id']):
        # A missing id never compares equal to anything, so it cannot match.
        if pd.notna(review_id):
            positions_by_id.setdefault(review_id, []).append(position)

    tagged_ids = tag_df.loc[tag_df[category] == 1, 'review_id']
    reviews = [
        sentiment_df.iloc[position]
        for review_id in tagged_ids
        for position in positions_by_id.get(review_id, ())
    ]
    print(len(reviews))
    return reviews

In [66]:
# Build one sentiment frame per aspect, in the order listed in `categories`
# (food, ambience, service, price). extract_sentiment prints each row count.
food_df, ambience_df, service_df, price_df = (
    pd.DataFrame(extract_sentiment(category)) for category in categories
)

189
83
137
55


In [67]:
# Renumber the rows of each aspect frame from 0, in place.
# NOTE: without drop=True the previous index is kept as a new column, so
# re-running this cell adds a further column each time.
for frame in (food_df, ambience_df, service_df, price_df):
    frame.reset_index(inplace=True)

In [68]:
# Normalised review texts for each of the four aspect frames.
food_txt, ambience_txt, service_txt, price_txt = map(
    get_text_subset,
    (food_df, ambience_df, service_df, price_df),
)

In [69]:
def get_label(df,label):
    """Return the values of column ``label`` of ``df`` as a list, in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    label : str
        Name of the column to extract.

    Returns
    -------
    list
        One value per row of ``df``.
    """
    # Read the column's values directly instead of looking rows up by the
    # labels 0..len(df)-1, so the result does not depend on the frame's index.
    return list(df[label].values)

In [70]:
# Sentiment labels for each aspect, read from that aspect's '<aspect>_pos' column.
food_label, ambience_label, service_label, price_label = (
    get_label(frame, column)
    for frame, column in (
        (food_df, 'food_pos'),
        (ambience_df, 'ambience_pos'),
        (service_df, 'service_pos'),
        (price_df, 'price_pos'),
    )
)

**Naive Bayes Model**

In [71]:
def naive_bayes(txt, label, test_size=0.2, random_state=None):
    """Train a Bernoulli naive Bayes classifier on word presence and report test metrics.

    Each review is split on whitespace and encoded as a binary vector marking
    which tokens occur in it. The data is split into train and test parts,
    the model is fitted on the train part, and the test accuracy and
    classification report are printed. Nothing is returned.

    Parameters
    ----------
    txt : list of str
        Review texts.
    label : list
        One class label per review, aligned with ``txt``.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int or None, default None
        Passed to ``train_test_split``. With ``None`` the split, and therefore
        the printed metrics, differ between runs; pass an int to reproduce one.
    """
    review_tokens = [review.split() for review in txt]

    # The vocabulary is fitted on all reviews, so every token in either split
    # has a column in the encoding.
    onehot_enc = MultiLabelBinarizer()
    onehot_enc.fit(review_tokens)
    X_train, X_test, y_train, y_test = train_test_split(
        review_tokens, label, test_size=test_size, random_state=random_state)

    # Encode each split once and reuse the matrices for fitting and scoring.
    X_train_enc = onehot_enc.transform(X_train)
    X_test_enc = onehot_enc.transform(X_test)

    # binarize=None: the features are already 0/1, so no thresholding is needed.
    bnbc = BernoulliNB(binarize=None)
    bnbc.fit(X_train_enc, y_train)
    y_test_pred = bnbc.predict(X_test_enc)

    # classification report
    score_test = bnbc.score(X_test_enc, y_test)
    print("testing data accuracy:",score_test)
    print("Classification report for testing data :")
    print(classification_report(y_test, y_test_pred))

In [72]:
# Evaluate the classifier once per aspect, printing the aspect name first.
for aspect, texts, labels in (
    ('food', food_txt, food_label),
    ('ambience', ambience_txt, ambience_label),
    ('service', service_txt, service_label),
    ('price', price_txt, price_label),
):
    print(aspect)
    naive_bayes(texts, labels)

food
testing data accuracy: 0.868421052631579
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         1.0       0.89      0.97      0.93        34

   micro avg       0.87      0.87      0.87        38
   macro avg       0.45      0.49      0.46        38
weighted avg       0.80      0.87      0.83        38

ambience
testing data accuracy: 0.8823529411764706
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         2
         1.0       0.88      1.00      0.94        15

   micro avg       0.88      0.88      0.88        17
   macro avg       0.44      0.50      0.47        17
weighted avg       0.78      0.88      0.83        17

service
testing data accuracy: 0.75
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       1.00      0.30  

  'precision', 'predicted', average, warn_for)


**For 391 reviews**

In [85]:
# Load the aspect-tag annotations and the sentiment annotations for this run.
# Both CSV files are decoded as ISO-8859-1.
tag_df = pd.read_csv(
    "annotation dataset/big_annotation.csv",
    encoding="ISO-8859-1",
)
sentiment_df = pd.read_csv(
    "annotation dataset/sentiment annotation.csv",
    encoding="ISO-8859-1",
)

In [86]:
from sklearn.utils import resample

def upsample(df):
    """Balance a 0/1-labelled frame by resampling its 0-rows with replacement.

    The label is read from the second column (position 1). Missing values
    anywhere in the frame are replaced by 0 before the split, so a missing
    label counts as class 0. Class 1 is treated as the majority class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose second column holds the 0/1 label. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The 1-rows followed by the resampled 0-rows. Original index labels
        are kept, so resampled rows repeat labels.
    """
    # Work on a filled copy: an in-place fillna would silently rewrite the
    # caller's frame as a side effect of balancing it.
    filled = df.fillna(0)
    labels = filled.iloc[:, 1]

    # Target size for the resampled class: the sum of the label column, which
    # is the number of 1-rows when the labels are strictly 0/1.
    majority = int(labels.values.sum())

    # Separate majority and minority classes
    df_majority = filled[labels == 1]
    df_minority = filled[labels == 0]

    if df_minority.empty:
        # There are no 0-rows to draw from, so only the 1-rows can be returned.
        return df_majority.copy()

    # Upsample minority class
    df_minority_upsampled = resample(
        df_minority,
        replace=True,        # sample with replacement
        n_samples=majority,  # to match majority class
        random_state=123,    # reproducible results
    )

    # Combine majority class with upsampled minority class
    return pd.concat([df_majority, df_minority_upsampled])

In [87]:
# One frame per aspect: keep the tag rows flagged 1 for that aspect and
# renumber them from 0.
food = tag_df[tag_df['food'].eq(1)].reset_index(drop=True)
ambience = tag_df[tag_df['ambience'].eq(1)].reset_index(drop=True)
service = tag_df[tag_df['service'].eq(1)].reset_index(drop=True)
price = tag_df[tag_df['price'].eq(1)].reset_index(drop=True)

In [88]:
# Punctuation characters; get_text_subset does not currently strip them.
exclude = set(string.punctuation)
def get_text_subset(df):
    """Return the normalised review texts of ``df`` as a list, in row order.

    Each value of the ``'text'`` column has every ``'\\n \\n'`` sequence
    removed and is lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``'text'``.

    Returns
    -------
    list of str
        One normalised text per row of ``df``.
    """
    # Iterate the column itself instead of looking rows up by the labels
    # 0..len(df)-1, so the result does not depend on how the frame is indexed.
    return [text.replace('\n \n', '').lower() for text in df['text']]

In [89]:
# Aspect names; each one is also a 0/1 column of tag_df.
categories = ['food', 'ambience', 'service', 'price']
def extract_sentiment(category):
    """Collect the sentiment rows of every review tagged with ``category``.

    Reads the module-level frames ``tag_df`` and ``sentiment_df``. For each
    row of ``tag_df`` (in order) whose ``category`` column equals 1, every row
    of ``sentiment_df`` with the same ``review_id`` is appended, in
    ``sentiment_df`` order. The number of collected rows is printed.

    Parameters
    ----------
    category : str
        Name of a column of ``tag_df``.

    Returns
    -------
    list of pandas.Series
        The matching rows of ``sentiment_df``.
    """
    # Index the sentiment rows by review_id once, so each tagged review is a
    # dictionary lookup instead of a scan over the whole sentiment frame.
    positions_by_id = {}
    for position, review_id in enumerate(sentiment_df['review_id']):
        # A missing id never compares equal to anything, so it cannot match.
        if pd.notna(review_id):
            positions_by_id.setdefault(review_id, []).append(position)

    tagged_ids = tag_df.loc[tag_df[category] == 1, 'review_id']
    reviews = [
        sentiment_df.iloc[position]
        for review_id in tagged_ids
        for position in positions_by_id.get(review_id, ())
    ]
    print(len(reviews))
    return reviews

In [90]:
# Build one sentiment frame per aspect, in the order listed in `categories`
# (food, ambience, service, price). extract_sentiment prints each row count.
food_df, ambience_df, service_df, price_df = (
    pd.DataFrame(extract_sentiment(category)) for category in categories
)

372
152
248
116


In [91]:
# Renumber the rows of each aspect frame from 0, in place.
# NOTE: without drop=True the previous index is kept as a new column, so
# re-running this cell adds a further column each time.
for frame in (food_df, ambience_df, service_df, price_df):
    frame.reset_index(inplace=True)

In [92]:
# Normalised review texts for each of the four aspect frames.
food_txt, ambience_txt, service_txt, price_txt = map(
    get_text_subset,
    (food_df, ambience_df, service_df, price_df),
)

In [93]:
def get_label(df,label):
    """Return the values of column ``label`` of ``df`` as a list, in row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column.
    label : str
        Name of the column to extract.

    Returns
    -------
    list
        One value per row of ``df``.
    """
    # Read the column's values directly instead of looking rows up by the
    # labels 0..len(df)-1, so the result does not depend on the frame's index.
    return list(df[label].values)

In [94]:
# Sentiment labels for each aspect, read from that aspect's '<aspect>_pos' column.
food_label, ambience_label, service_label, price_label = (
    get_label(frame, column)
    for frame, column in (
        (food_df, 'food_pos'),
        (ambience_df, 'ambience_pos'),
        (service_df, 'service_pos'),
        (price_df, 'price_pos'),
    )
)

**Naive Bayes Model**

In [95]:
def naive_bayes(txt, label, test_size=0.2, random_state=None):
    """Train a Bernoulli naive Bayes classifier on word presence and report test metrics.

    Each review is split on whitespace and encoded as a binary vector marking
    which tokens occur in it. The data is split into train and test parts,
    the model is fitted on the train part, and the test accuracy and
    classification report are printed. Nothing is returned.

    Parameters
    ----------
    txt : list of str
        Review texts.
    label : list
        One class label per review, aligned with ``txt``.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int or None, default None
        Passed to ``train_test_split``. With ``None`` the split, and therefore
        the printed metrics, differ between runs; pass an int to reproduce one.
    """
    review_tokens = [review.split() for review in txt]

    # The vocabulary is fitted on all reviews, so every token in either split
    # has a column in the encoding.
    onehot_enc = MultiLabelBinarizer()
    onehot_enc.fit(review_tokens)
    X_train, X_test, y_train, y_test = train_test_split(
        review_tokens, label, test_size=test_size, random_state=random_state)

    # Encode each split once and reuse the matrices for fitting and scoring.
    X_train_enc = onehot_enc.transform(X_train)
    X_test_enc = onehot_enc.transform(X_test)

    # binarize=None: the features are already 0/1, so no thresholding is needed.
    bnbc = BernoulliNB(binarize=None)
    bnbc.fit(X_train_enc, y_train)
    y_test_pred = bnbc.predict(X_test_enc)

    # classification report
    score_test = bnbc.score(X_test_enc, y_test)
    print("testing data accuracy:",score_test)
    print("Classification report for testing data :")
    print(classification_report(y_test, y_test_pred))

In [96]:
# Evaluate the classifier once per aspect, printing the aspect name first.
for aspect, texts, labels in (
    ('food', food_txt, food_label),
    ('ambience', ambience_txt, ambience_label),
    ('service', service_txt, service_label),
    ('price', price_txt, price_label),
):
    print(aspect)
    naive_bayes(texts, labels)

food
testing data accuracy: 0.8133333333333334
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       0.40      0.15      0.22        13
         1.0       0.84      0.95      0.89        62

   micro avg       0.81      0.81      0.81        75
   macro avg       0.62      0.55      0.56        75
weighted avg       0.77      0.81      0.78        75

ambience
testing data accuracy: 0.8387096774193549
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         4
         1.0       0.87      0.96      0.91        27

   micro avg       0.84      0.84      0.84        31
   macro avg       0.43      0.48      0.46        31
weighted avg       0.75      0.84      0.79        31

service
testing data accuracy: 0.64
Classification report for testing data :
              precision    recall  f1-score   support

         0.0       0.17      0.07 