# `pipeline_full.ipynb`

In this notebook I perform sentiment analysis on the 'Google Reviews of Medical Facilities' dataset from [Kaggle](https://www.kaggle.com/datasets/cgrowe96/google-reviews-of-us-medical-facilities) using TF-IDF features and Logistic Regression (`sklearn`).

## Preparation

### Imports

In [15]:
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import joblib
import numpy as np

### Load and Clean Data

In [None]:
# 1) load
# A forward-slash relative path is used on purpose: it resolves on Windows,
# Linux and macOS, whereas a backslash is only a path separator on Windows.
df = pd.read_csv("data/raw_data.csv")

# 2) inspect
print(df.shape)
print(df.columns.tolist()[:50])   # show first 50 columns

# 3) keep only the relevant columns (review body + sentiment label)
cols_needed = ['Review Text', 'label']

df = df[cols_needed]

# 4) drop rows with missing text or label
df = df.dropna(subset=cols_needed)

# 5) keep only rows where label is 'positive' or 'negative' (in case others exist);
#    .copy() detaches the result so later column assignments do not target a view
df = df[df['label'].isin(['positive', 'negative'])].copy()

print("After cleaning rows:", df.shape)
print(df['label'].value_counts())

(233024, 21)
['Unnamed: 0', 'Author', 'Review Text', 'Review Rating', 'Date', 'Owner Answer', 'Owner Answer Date', 'Author Profile', 'Author Image', 'Review URL', 'label', 'zip', 'spill1', 'spill2', 'spill3', 'spill4', 'spill5', 'spill7', 'spill7.1', 'spill8', 'state']
After cleaning rows: (202047, 2)
label
positive    116273
negative     85774
Name: count, dtype: int64


### Process Review Text using regex

In [17]:
def clean_text(text):
    """Normalise one raw review into lowercase, punctuation-free text.

    Steps, in order: lowercase, strip HTML tags, URLs and e-mail addresses,
    map typographic apostrophes to the ASCII apostrophe, replace every
    character outside ``a-z``, ``0-9``, whitespace and ``'`` with a space,
    then collapse runs of whitespace.

    Parameters
    ----------
    text : object
        The raw review. Anything that is not a ``str`` is treated as empty.

    Returns
    -------
    str
        The cleaned text; ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # remove URLs
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    # remove emails
    text = re.sub(r'\S+@\S+', ' ', text)
    # Curly/modifier apostrophes, acute accent and backtick are folded into "'"
    # first; otherwise the next step would turn "don\u2019t" into "don t" even
    # though apostrophes are meant to survive.
    text = re.sub(r"[\u2018\u2019\u02bc\u00b4`]", "'", text)
    # keep apostrophes (helpful for contractions) but remove other punctuation
    text = re.sub(r"[^a-z0-9\s']", ' ', text)
    # collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every surviving review in a single element-wise pass;
# the cast to str guarantees clean_text always receives a string.
raw_reviews = df['Review Text'].astype(str)
df['clean_text'] = raw_reviews.map(clean_text)

# Eyeball a handful of random cleaned reviews as a sanity check
print(df['clean_text'].sample(5).tolist())

['i am so happy that i ve found a great primary care home everyone is so helpful and caring thank you', 'i do not understand the bad reviews i just went a week or so ago for a mammogram the staff in the front office as well as those who were doing the exam were extremely friendly and made the whole process which i have never liked easy and painless will definitely go back there again', "nurses are nice and all that but i just don't understand how a person in their care in whats supposed to be a clean hospital can get infection after infection after infection in the damn hospital how is someone in their care getting worse and having more problems in their care than they were having ever before seriously who goes to a hospital and gets sicker", "we had our baby at this place the front desk volunteer is rude as can be and after 5 u half to enter the hospital thru the emergency room which is insane bc they make u walk thru where a bunch of sick people are to go to the hospital where people

### Split into Train and Test sets

In [18]:
# Features are the cleaned review strings; targets are the sentiment labels
# encoded as integers (negative -> 0, positive -> 1).
label_to_int = {'negative': 0, 'positive': 1}
X, y = df['clean_text'], df['label'].map(label_to_int)

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

print("Train size:", len(X_train), "Test size:", len(X_test))
print("Train label distribution:\n", y_train.value_counts(normalize=True))

Train size: 161637 Test size: 40410
Train label distribution:
 label
1    0.575475
0    0.424525
Name: proportion, dtype: float64


## Modelling

In [19]:
# Two-stage model: TF-IDF features feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(
            stop_words=None,       # stopwords stay in: negations are important
            ngram_range=(1, 2),    # unigrams and bigrams
            min_df=5,              # skip terms found in fewer than 5 documents
            max_df=0.95,           # skip terms found in more than 95% of documents
            max_features=50000,    # upper bound on vocabulary size
            sublinear_tf=True,
        ),
    ),
    (
        'clf',
        LogisticRegression(
            penalty='l2',
            C=1.0,
            solver='saga',
            max_iter=1000,
            class_weight='balanced',  # reweights classes; useful when labels are imbalanced
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

# Fit on the training split. The call stays as the last expression so the
# fitted estimator's repr remains the cell output.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,1000


## Prediction and Analysis

### Performance Metrics

In [20]:
# Hard class predictions for the held-out reviews (reused by later cells)
y_pred = pipeline.predict(X_test)

# Headline scores: accuracy, F1 for the positive class, and the two averaged F1s
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
f1_macro, f1_weighted = (
    f1_score(y_test, y_pred, average=mode) for mode in ('macro', 'weighted')
)

# Confusion matrix (optional)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc:.4f}")
print(f"F1 (binary, positive=1): {f1:.4f}")
print(f"F1 macro: {f1_macro:.4f}, F1 weighted: {f1_weighted:.4f}\n")

# Per-class precision / recall / F1, with readable names for classes 0 and 1
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))

print("Confusion matrix:\n", cm)

Accuracy: 0.9683
F1 (binary, positive=1): 0.9721
F1 macro: 0.9676, F1 weighted: 0.9683

Classification Report:
              precision    recall  f1-score   support

    negative       0.95      0.98      0.96     17155
    positive       0.98      0.96      0.97     23255

    accuracy                           0.97     40410
   macro avg       0.97      0.97      0.97     40410
weighted avg       0.97      0.97      0.97     40410

Confusion matrix:
 [[16736   419]
 [  864 22391]]


Not bad at all for a simple model. May experiment with `DistilBERT`, `BERT` or `RoBERTa` at a later point.

### Saving the model with `joblib` (a `pickle`-based format)

In [21]:
# Persist the whole fitted pipeline (vectorizer + classifier) as one artifact
# in the notebook's working directory.
joblib.dump(value=pipeline, filename="sentiment_tfidf_logreg.pkl")
print("Saved model to sentiment_tfidf_logreg.pkl")

Saved model to sentiment_tfidf_logreg.pkl


### Testing with my own text

In [22]:
# A hand-written review that opens positively and then negates itself
sample = ["I was going to say how much I love this hospital but unfortunately I cannot."]

# Reload the artifact written by the previous cell.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded = joblib.load("sentiment_tfidf_logreg.pkl")

# Position in this list matches the integer class id (0 -> negative, 1 -> positive)
labels = ["negative", "positive"]

print(f"Pred label: {labels[loaded.predict(sample)[0]]}")
print("Probabilities:", loaded.predict_proba(sample))  # probability for [neg, pos]

Pred label: positive
Probabilities: [[0.13263064 0.86736936]]


### Identifying the most influential words

In [23]:
# Pull the two fitted steps back out of the pipeline
vec, clf = (pipeline.named_steps[step] for step in ('tfidf', 'clf'))

# Vocabulary terms and the matching coefficient row of the classifier
feature_names = vec.get_feature_names_out()
coefs = clf.coef_[0]
top_n = 20

# Sort feature positions once by coefficient, smallest first
ranked = np.argsort(coefs)
top_neg_idx = ranked[:top_n]            # most negative coefficients
top_pos_idx = ranked[-top_n:][::-1]     # most positive coefficients, largest first

print("Top positive features:")
print(feature_names[top_pos_idx])
print("\nTop negative features:")
print(feature_names[top_neg_idx])

Top positive features:
['great' 'amazing' 'excellent' 'thank' 'thank you' 'best' 'wonderful'
 'awesome' 'friendly' 'the best' 'love' 'everyone' 'professional' 'good'
 'caring' 'always' 'kind' 'quickly' 'quick' 'fantastic']

Top negative features:
['rude' 'worst' 'horrible' 'not' 'unprofessional' 'terrible' 'no' 'poor'
 'told' 'don' 'never' 'hours' 'worse' 'joke' 'dirty' 'awful' 'ridiculous'
 'the worst' 'money' 'not recommend']


### Misclassification Analysis

In [24]:
# 1. Build results DataFrame
# Score the test set once and reuse the result: each predict_proba call
# re-runs the TF-IDF transform and the classifier over every test review.
# Column 0 is read as the negative class and column 1 as the positive class,
# matching the 0/1 label encoding used for y.
int_to_label = {0: 'negative', 1: 'positive'}
test_proba = pipeline.predict_proba(X_test)

test_results = pd.DataFrame({
    'text': X_test,
    'true_label': y_test.map(int_to_label),
    'pred_label': pd.Series(y_pred, index=X_test.index).map(int_to_label),
    'prob_negative': test_proba[:, 0],
    'prob_positive': test_proba[:, 1]
})

# 2. Filter misclassified
misclassified = test_results[test_results['true_label'] != test_results['pred_label']]
print(f"Misclassified: {len(misclassified)} / {len(test_results)} ({len(misclassified)/len(test_results):.2%})")

# 3. Show a sample of errors
# Cap the sample size so the cell still runs when there are fewer than 10 errors.
n_show = min(10, len(misclassified))
print("\nSample misclassified reviews:")
display(misclassified.sample(n_show, random_state=42)[['text', 'true_label', 'pred_label', 'prob_positive']])

# 4. Handles used by top_error_words (FP = false positives, FN = false negatives):
#    the fitted vectorizer, its vocabulary terms, and the classifier coefficients.
vec = pipeline.named_steps['tfidf']
clf = pipeline.named_steps['clf']
feature_names = np.array(vec.get_feature_names_out())
coefs = clf.coef_[0]

Misclassified: 1283 / 40410 (3.17%)

Sample misclassified reviews:


Unnamed: 0,text,true_label,pred_label,prob_positive
57157,i have always taken my children to wesley for ...,negative,positive,0.948336
164397,i do have to admit i have to give lots of them...,positive,negative,0.167097
113687,the nurses were absolutely amazing best nurses...,positive,negative,0.246919
57054,there are good doctors at this place,positive,negative,0.391463
1796,service was great but hearing the staff bash o...,negative,positive,0.740925
24842,i had a good experience at st joseph medical c...,negative,positive,0.857833
196026,i would like to thank you for getting my husba...,negative,positive,0.889868
106118,we have never left a review of this type for a...,positive,negative,0.48512
225521,yes there are quite a few that are nice and ve...,negative,positive,0.690405
92779,went to psych ward wasn t feeling well and tol...,positive,negative,0.171358


In [25]:
def top_error_words(texts, direction, top_n=10):
    """Rank the vocabulary terms found in ``texts`` by model coefficient.

    Every document is tokenised with the fitted vectorizer's analyzer
    (module-level ``vec``); terms that are part of the model vocabulary
    (module-level ``feature_names``) are paired with their coefficient
    (module-level ``coefs``) and their total occurrence count in ``texts``.

    Note that the ranking uses the coefficient alone; the count is reported
    but does not influence the order.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse, e.g. the text of the misclassified reviews.
    direction : str
        ``'positive'`` ranks the largest coefficients first; any other value
        ranks the most negative coefficients first.
    top_n : int, default 10
        Maximum number of terms to return.

    Returns
    -------
    list of tuple
        Up to ``top_n`` ``(term, coefficient, count)`` tuples.
    """
    from collections import Counter

    analyzer = vec.build_analyzer()
    counts = Counter()
    for doc in texts:
        counts.update(analyzer(doc))

    # Term -> column index, built once. This replaces a membership test plus an
    # np.where scan over the whole vocabulary array for every distinct term.
    index_of = {term: pos for pos, term in enumerate(feature_names)}

    scored = [
        (word, coefs[index_of[word]], cnt)
        for word, cnt in counts.items()
        if word in index_of
    ]

    # Flipping the sign lets one descending sort serve both directions.
    sign = 1 if direction == 'positive' else -1
    return sorted(scored, key=lambda item: sign * item[1], reverse=True)[:top_n]

# Split the errors by kind:
#   fp - predicted positive, actually negative
#   fn - predicted negative, actually positive
predicted_positive = misclassified['pred_label'] == 'positive'
actually_positive = misclassified['true_label'] == 'positive'
fp = misclassified.loc[predicted_positive & (misclassified['true_label'] == 'negative')]
fn = misclassified.loc[(misclassified['pred_label'] == 'negative') & actually_positive]

# Same report for both error kinds: heading, then one line per ranked term
for heading, errors, direction in (
    ("\nWords pushing toward POSITIVE in false positives:", fp, 'positive'),
    ("\nWords pushing toward NEGATIVE in false negatives:", fn, 'negative'),
):
    print(heading)
    for w, c, n in top_error_words(errors['text'], direction):
        print(f"{w:15} coef={c:.2f} count={n}")


Words pushing toward POSITIVE in false positives:
great           coef=12.55 count=92
amazing         coef=10.74 count=32
excellent       coef=10.68 count=23
thank           coef=10.08 count=39
thank you       coef=9.57 count=32
best            coef=9.42 count=44
wonderful       coef=8.75 count=25
awesome         coef=7.54 count=9
friendly        coef=7.22 count=19
the best        coef=6.62 count=30

Words pushing toward NEGATIVE in false negatives:
rude            coef=-11.90 count=71
worst           coef=-10.76 count=31
horrible        coef=-10.31 count=37
not             coef=-10.08 count=676
unprofessional  coef=-7.98 count=14
terrible        coef=-7.96 count=23
no              coef=-7.60 count=286
poor            coef=-6.73 count=14
told            coef=-6.38 count=215
don             coef=-6.19 count=159
