In [22]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("amazon_polarity")

# Take a small subset, e.g., 50,000 rows for train, 5,000 for test
train_subset = dataset['train'].shuffle(seed=42).select(range(50000))
test_subset = dataset['test'].shuffle(seed=42).select(range(5000))

# Merge title and content into a single text column
def merge_text(example):
    example['text'] = example['title'] + " " + example['content']
    return example

train_subset = train_subset.map(merge_text)
test_subset = test_subset.map(merge_text)


In [23]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    list(train_subset['text']),
    list(train_subset['label']),
    test_size=0.1,  # 10% for validation
    random_state=42
)

test_texts = list(test_subset['text'])
test_labels = list(test_subset['label'])

In [24]:
print(train_texts[0])
train_labels[0]  # 1


The best software to have on hand I started out with the pc Basic fax, upgraded to the 9.0 version when I upgraded my computer and then upgraded again to the 10.0 version of Winfax. No matter which version, it's easy to use easy to install and very handy and helpful to have. I don't fax often but when I do I know I can rely on Winfax 10.0 to perform exactly as stated. And you don't need a manual or book to learn how to use it. The manual and instructions included in the program is all you really need.I would like to add this statement from a previous or what another reviewer wrote: "Amazon states that 10.0 will work with Windows XP. Version 10.02 is required according to Symantec's own website."My search shows not only does Amazon state 10.0 will work with windows xp, Symentec's own web site states the same thing. No where did I find that Symentec's website stated Version 10.02 is required for Windows XP. People do your own search in addition do to reading these reviews.


1

In [25]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # def clean_text(example):
    # text = example["text"]
    # text = text.lower()
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return {"clean_text":text}

In [26]:
train_texts = [clean_text(text) for text in train_texts]
val_texts = [clean_text(text) for text in val_texts]
test_texts = [clean_text(text) for text in test_texts]

In [27]:
print(train_texts[0])

{'clean_text': 'the best software to have on hand i started out with the pc basic fax upgraded to the version when i upgraded my computer and then upgraded again to the version of winfax no matter which version its easy to use easy to install and very handy and helpful to have i dont fax often but when i do i know i can rely on winfax to perform exactly as stated and you dont need a manual or book to learn how to use it the manual and instructions included in the program is all you really needi would like to add this statement from a previous or what another reviewer wrote amazon states that will work with windows xp version is required according to symantecs own websitemy search shows not only does amazon state will work with windows xp symentecs own web site states the same thing no where did i find that symentecs website stated version is required for windows xp people do your own search in addition do to reading these reviews'}


In [28]:
print(train_subset[0])
print(test_subset[0])
print(train_subset.column_names)
print(train_labels[0])

{'label': 0, 'title': 'Anyone who likes this better than the Pekinpah is a moron.', 'content': "All the pretty people in this film. Even the Rudy character played by Michael Madsen. This is adapted from a Jim Thompson novel for cryin' out loud! These are supposed to be marginal characters, not fashion models. Though McQueen and McGraw were attractive (but check out McQueen's crummy prison haircut) they were believable in the role. Baldwin and Bassinger seem like movie stars trying to act like hard cases. Action wise, the robbery scene in the Pekinpah version was about 100 times more exciting and suspenseful than anything in this re-make.", 'text': "Anyone who likes this better than the Pekinpah is a moron. All the pretty people in this film. Even the Rudy character played by Michael Madsen. This is adapted from a Jim Thompson novel for cryin' out loud! These are supposed to be marginal characters, not fashion models. Though McQueen and McGraw were attractive (but check out McQueen's cr

In [29]:
# tokeinzation tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer(
    max_features=10000,  # limit vocabulary size
    ngram_range=(1, 2),  # unigrams + bigrams
)

In [30]:

# Extract the cleaned text from the dictionaries
train_texts_clean = [d['clean_text'] for d in train_texts]
val_texts_clean = [d['clean_text'] for d in val_texts]
test_texts_clean = [d['clean_text'] for d in test_texts]

In [31]:
# Fit on train and transform train + val + test
X_train = vectorizer.fit_transform(train_texts_clean)
X_val = vectorizer.transform(val_texts_clean)
X_test = vectorizer.transform(test_texts_clean)

In [32]:
# Labels
y_train = train_labels
y_val = val_labels
y_test = test_labels

In [33]:
print("Train shape:", X_train.shape)

Train shape: (45000, 10000)


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,roc_auc_score

# Initialize the model
lr_model = LogisticRegression(
    max_iter=1000,
    solver='saga',  # for large datasets
    random_state=42)

In [35]:
lr_model.fit(X_train, y_train)

In [36]:
val_preds = lr_model.predict(X_val)
# Evaluate
val_acc = accuracy_score(y_val, val_preds)
val_prec = precision_score(y_val, val_preds)
val_rec = recall_score(y_val, val_preds)
val_f1 = f1_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, lr_model.predict_proba(X_val)[:,1])


print("Validation Accuracy:", val_acc)
print("Validation Precision:", val_prec)
print("Validation Recall:", val_rec)
print("Validation F1 Score:", val_f1)
print("Validation ROC AUC:", roc_auc)


Validation Accuracy: 0.898
Validation Precision: 0.9024193548387097
Validation Recall: 0.8930566640063847
Validation F1 Score: 0.8977135980746089
Validation ROC AUC: 0.9619351407464106
