In [None]:
import pandas as pd
# Location of the gzip-compressed JSON-lines review dump (the "All_Beauty"
# category, going by the path).
URL = (
    'https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_2023/'
    'raw/review_categories/All_Beauty.jsonl.gz'
)
# Only the first 5000 records are read, which keeps the download/parse short.
df = pd.read_json(URL, compression='gzip', lines=True, nrows=5000)
print(df.columns)

In [None]:
# Peek at the two columns the rest of the notebook works with.
print(df.loc[:, ['rating', 'text']].head())

In [None]:
from sklearn.model_selection import train_test_split
# Binary target: ratings of 3 and below become 0, anything higher becomes 1.
df['sentiment'] = df['rating'].apply(lambda x: 0 if x <= 3 else 1)

# Class proportions (fractions that sum to 1).
labels_count = df['sentiment'].value_counts()
labels_count = labels_count / labels_count.sum()
# Labels are imbalanced (original note: 7728:2272, i.e. roughly 77% vs 23%).
print(labels_count)

# Split the dataset.
# - random_state pins the shuffle so the split, and every metric computed from
#   it further down, is the same on each Restart & Run All.
# - stratify keeps the class proportions equal in both splits, which matters
#   because the labels are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

In [None]:
# Stop words that are stripped from every review before modelling.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}
def remove_words(text, to_remove):
    """Lower-case ``text`` and drop every token contained in ``to_remove``.

    The text is split on whitespace after lower-casing, so punctuation stays
    attached to its word and membership is tested against the lower-cased
    token.

    Args:
        text: The string to clean.
        to_remove: Container of (lower-case) tokens to discard; a ``set``
            gives constant-time lookups.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none remain).
    """
    kept = []
    for token in text.lower().split():
        if token in to_remove:
            continue
        kept.append(token)
    return ' '.join(kept)
def fn_stop(x):
    """Lower-case one review text and strip the words in ``stoplist``."""
    return remove_words(x, stoplist)


# Clean each split into a new Series; X_train / X_test themselves are not modified.
t_train = X_train.apply(fn_stop)
t_test = X_test.apply(fn_stop)

In [None]:
# Token counts come from the training texts only, so the test split does not
# influence which words are kept.
# NOTE: the misspelled name `frequnecy` is kept because a later cell reads it.
# NOTE: this cell rebinds t_train / t_test; running it a second time without
# re-running the stop-word cell leaves `frequnecy` empty, because every rare
# token has already been removed from t_train.
frequnecy = t_train.str.split().explode().value_counts()
# Keep only the rare tokens (seen at most 10 times) - these are the ones to drop.
frequnecy = frequnecy[frequnecy <= 10]

# Build the lookup set once instead of once per review.
low_freq_words = set(frequnecy.index)


def fn_low(x):
    """Remove the rare training-set tokens from one (already cleaned) text."""
    return remove_words(x, low_freq_words)


t_train, t_test = t_train.apply(fn_low), t_test.apply(fn_low)

In [None]:
from gensim import corpora
# Tokenise the cleaned training texts once and reuse the result below.
train_tokens = t_train.str.split()
# Token <-> integer-id mapping built from the training documents only.
dictionary = corpora.Dictionary(train_tokens)
# Bag-of-words representation of every training document.
corpus = [dictionary.doc2bow(tokens) for tokens in train_tokens]

In [None]:
from gensim import models
# Fit a 100-topic LSI model on the training bag-of-words corpus.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=100)


def lsi_features(texts, columns=None):
    """Project whitespace-tokenised texts into the fitted LSI topic space.

    Args:
        texts: Iterable of cleaned strings; each is split on whitespace and
            converted to a bag-of-words with ``dictionary``.
        columns: Column labels (topic ids) the result must have, in order.
            When omitted, the topic ids observed in ``texts`` are used in
            ascending order.

    Returns:
        A DataFrame with one row per text and one column per topic id; topics
        missing for a document (or for the whole batch) are filled with 0.
    """
    rows = [dict(lsi[dictionary.doc2bow(text.split())]) for text in texts]
    frame = pd.DataFrame(rows)
    if columns is None:
        columns = sorted(frame.columns)
    # reindex makes the column set and order explicit, so two frames built from
    # different batches line up column-for-column.
    return frame.reindex(columns=columns).fillna(0)


# NOTE: X_train / X_test are rebound here from raw text to numeric features.
X_train = lsi_features(t_train)
# Align the test matrix to the training columns so the classifier sees the
# same feature layout at fit and predict time.
X_test = lsi_features(t_test, columns=X_train.columns)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
# Per-sample weights in 'balanced' mode, so the imbalanced labels are
# re-weighted during fitting.
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
# Logistic regression with default settings, fitted on the LSI features.
lr = LogisticRegression()
lr.fit(X_train, y_train, sample_weight=sample_weight)

In [None]:
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
# Hard-label predictions on the held-out split and the threshold-based metrics.
y_pred = lr.predict(X_test)
print(accuracy_score(y_test, y_pred))  # ~0.8
print(precision_score(y_test, y_pred))  # ~0.9
print(recall_score(y_test, y_pred))  # ~0.67
print(f1_score(y_test, y_pred))  # ~0.77

# Probability of the positive class (second column), computed once and reused
# for both the ROC curve and the AUC instead of calling predict_proba twice.
y_score = lr.predict_proba(X_test)[:, 1]
# The thresholds are not needed afterwards; they are bound to a private name
# rather than to the bare `_`.
fpr, tpr, _thresholds = roc_curve(y_test, y_score)
auc = roc_auc_score(y_test, y_score)

In [None]:
from matplotlib import pyplot as plt
# Draw on an explicit Axes instead of the implicit pyplot state, and give the
# figure a title so it can be read on its own.
fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.plot([0, 1], [0, 1], '--', color='red')  # y = x reference diagonal
ax.text(0.6, 0.4, f'AUC: {auc:.3f}')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve')
plt.show()

In [None]:
# Two hand-written reviews used as a quick sanity check of the fitted pipeline.
samples = [
    'These are pretty and seem well made. I find them comfortable to wear and they are cute.',
    'this product was terrible, i will never recommend it to anyone',
]

# Same preprocessing as the training data: lower-case + stop-word removal,
# then drop the tokens that were rare in the training set. The rare-word set
# is built once rather than once per sentence.
rare_words = set(frequnecy.index)
cleaned = [
    remove_words(remove_words(sentence, stoplist), rare_words)
    for sentence in samples
]

# Project into LSI space. The frame is aligned to the training columns and
# missing topics are filled with 0, so a sentence with no in-vocabulary tokens
# yields an all-zero row instead of NaNs that lr.predict would reject.
# The result stays bound to `text`, as before.
text = (
    pd.DataFrame([
        dict(lsi[dictionary.doc2bow(sentence.split())])
        for sentence in cleaned
    ])
    .reindex(columns=X_train.columns)
    .fillna(0)
)
print(lr.predict(text))  # [1 0]