In [18]:
df=pd.read_csv('millers_ale_house.csv')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729 entries, 0 to 728
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      729 non-null    object
 1   rating    729 non-null    int64 
 2   location  729 non-null    object
 3   date      729 non-null    object
 4   body      729 non-null    object
dtypes: int64(1), object(4)
memory usage: 28.6+ KB


In [20]:
df['rating'] = df['rating'].astype(int)

df['sentiment'] = df['rating'].apply(lambda x: 'Negative' if x <= 3 else 'Positive')

In [21]:
df

Unnamed: 0,name,rating,location,date,body,sentiment
0,Deb G.,5,"Jersey City, NJ",28-Jan-24,"Food was good as always. Our server, Victoria,...",Positive
1,Randi P.,4,"Hackensack, NJ",26-Jan-24,Met an old friend for a quick bite and a drink...,Positive
2,Randi P.,3,"Hackensack, NJ",26-Jan-24,Met an old friend for a quick bite and a drink...,Negative
3,Randi P.,5,"Hackensack, NJ",26-Jan-24,Met an old friend for a quick bite and a drink...,Positive
4,Jamel O.,3,"Jackson Heights, NY",17-Feb-24,I usually love Miller's. I have been to a coup...,Negative
...,...,...,...,...,...,...
724,Gabe S.,4,"Oradell, NJ",15-Feb-14,I just joined yelp due to a TERRIBLE dining ex...,Positive
725,R K.,3,"Allendale, NJ",3-Sep-12,Ate here for lunch today. The best way I can d...,Negative
726,Jamie B.,5,"Haledon, NJ",20-Apr-13,food was great..our server Heather was excelle...,Positive
727,Lilly S.,1,"Wyckoff, NJ",30-Apr-13,Recently went here for the second time and I w...,Negative


cleaning with lemmatizer included.

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    clean_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in string.punctuation]
    stop_words = set(stopwords.words('english'))
    clean_tokens = [token for token in clean_tokens if token not in stop_words]
    clean_text = ' '.join(clean_tokens)
    return clean_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


<function clean_text at 0x7969a89a2cb0>


In [29]:
cleaned_text = df['body'].apply(clean_text)

print(cleaned_text)

0      food wa good always server victoria wa spot or...
1      met old friend quick bite drink wing good come...
2      met old friend quick bite drink wing good come...
3      met old friend quick bite drink wing good come...
4      usually love miller 's couple always sat bar w...
                             ...                        
724    joined yelp due terrible dining experience fig...
725    ate lunch today best way describe miller ale h...
726    food wa great .. server heather wa excellent w...
727    recently went second time never go back husban...
728    pretty good depending order ... low end anythi...
Name: body, Length: 729, dtype: object


Training

In [30]:
X = cleaned_text

y = (df['sentiment'] == 'Positive').astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Tf-idf vectorization and model building

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report, recall_score, precision_score, f1_score, roc_curve, auc, precision_recall_curve


names = ["Logistic Regression", "SVM", "Decision Tree", "Random Forest", "AdaBoost", "Neural Net",
         "Naive Bayes"]

classifiers = [LogisticRegression(),
               SVC(probability=True),
               DecisionTreeClassifier(max_depth=5),
               RandomForestClassifier(max_depth=5, n_estimators=10),
               AdaBoostClassifier(),
               MLPClassifier(alpha=1, max_iter=1000),
               MultinomialNB()
               ]


for name, clf in zip(names, classifiers):
  clf_pipe = Pipeline([
                    ('tfidf', TfidfVectorizer()),
                    (name, clf),
                    ])

  clf_pipe.fit(X_train,y_train)

  pred = clf_pipe.predict(X_test)
  pred_prob = clf_pipe.predict_proba(X_test)[:, 1]

  fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
  precision, recall, thresholds_pr = precision_recall_curve(y_test, pred)

  print('\n\n', name, '\n\n')
  print(classification_report(y_test, pred))
  print('ROC AUC: ', auc(fpr, tpr))
  print('Precision/Recall AUC: ', auc(precision, recall))
  print('\n\n')



 Logistic Regression 


              precision    recall  f1-score   support

           0       0.77      0.78      0.77        98
           1       0.82      0.81      0.81       121

    accuracy                           0.79       219
   macro avg       0.79      0.79      0.79       219
weighted avg       0.79      0.79      0.79       219

ROC AUC:  0.8845083487940631
Precision/Recall AUC:  0.3132920110192837





 SVM 


              precision    recall  f1-score   support

           0       0.78      0.76      0.77        98
           1       0.81      0.83      0.82       121

    accuracy                           0.79       219
   macro avg       0.79      0.79      0.79       219
weighted avg       0.79      0.79      0.79       219

ROC AUC:  0.8708466857817506
Precision/Recall AUC:  0.3118827369018185





 Decision Tree 


              precision    recall  f1-score   support

           0       0.62      0.85      0.72        98
           1       0.82      0.58

Removed tf-idf and added count vectorizer instead.

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

for name, clf in zip(names, classifiers):
    clf_pipe = Pipeline([
        ('count_vectorizer', CountVectorizer()),
        (name, clf)
    ])

    clf_pipe.fit(X_train, y_train)

    pred = clf_pipe.predict(X_test)
    pred_prob = clf_pipe.predict_proba(X_test)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, pred_prob)
    precision, recall, thresholds_pr = precision_recall_curve(y_test, pred)

    print('\n\n', name, '\n\n')
    print(classification_report(y_test, pred))
    print('ROC AUC: ', auc(fpr, tpr))
    print('Precision/Recall AUC: ', auc(precision, recall))
    print('\n\n')



 Logistic Regression 


              precision    recall  f1-score   support

           0       0.74      0.71      0.73        98
           1       0.78      0.80      0.79       121

    accuracy                           0.76       219
   macro avg       0.76      0.76      0.76       219
weighted avg       0.76      0.76      0.76       219

ROC AUC:  0.8283437341878901
Precision/Recall AUC:  0.29110955130382277





 SVM 


              precision    recall  f1-score   support

           0       0.70      0.70      0.70        98
           1       0.76      0.76      0.76       121

    accuracy                           0.74       219
   macro avg       0.73      0.73      0.73       219
weighted avg       0.74      0.74      0.74       219

ROC AUC:  0.8110558272895936
Precision/Recall AUC:  0.274029208649383





 Decision Tree 


              precision    recall  f1-score   support

           0       0.63      0.74      0.68        98
           1       0.76      0.64

Evaluation: In terms of overall performance the best performing model was the TF-IDF Logistic Regression model with a ROCAUC of .88 and PRAUC of .31 followed closely behind by the TF-IDF Naive Bayes model with an ROCAUC of .88 and PRAUC of .30. For just the count-vectorization the best performing model is easily the Naive-Bayes model with an ROCAUC of .87 and PRAUC .30. I think ROCAUC is the best metric to use here because ROCAUC measures the ability to determine positive and negative cases making it the best way to determine positive and negative sentiment based on reviews.