# base import

In [15]:
import json
import pandas as pd
import numpy as np
import os


# Load train.json

In [16]:
# Read the raw training records from the JSON file
with open("train.json", "r", encoding="utf-8") as train_file:
    train_data = json.loads(train_file.read())

# Wrap the parsed records in a DataFrame and preview the first rows
df = pd.DataFrame(train_data)
preview = df.head()
print(preview)


                                             reviews  sentiments
0  I bought this belt for my daughter in-law for ...           1
1  The size was perfect and so was the color.  It...           1
2  Fits and feels good, esp. for doing a swim rac...           1
3  These socks are absolutely the best. I take pi...           1
4  Thank you so much for the speedy delivery they...           1


# ML import

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
# Separate the review text (model input) from the sentiment labels (target)
labels = df["sentiments"].values
texts = df["reviews"].values

# Hold out 20% of the rows for validation. Stratifying on the labels keeps the
# class proportions aligned between the two partitions, and the fixed
# random_state makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": labels}
X_train, X_val, y_train, y_val = train_test_split(texts, labels, **split_options)

# Report how many rows ended up in each partition
for size_label, partition in (("train size:", X_train), ("validation size:", X_val)):
    print(size_label, len(partition))


train size: 5920
validation size: 1481


# TF-IDF + LogisticRegression

In [19]:
# Build TF-IDF features from unigrams and bigrams, capped at 5000 features.
# The vectorizer is fitted on the training reviews only; the validation reviews
# are projected with transform() so they reuse the fitted vocabulary.
tfidf_options = {"ngram_range": (1, 2), "max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)


In [20]:
# Train a logistic-regression classifier on the TF-IDF training features
# (fit() returns the estimator itself, so clf is the fitted model).
# NOTE(review): the recorded validation report shows much lower recall for
# class 0 than for class 1 — consider whether class weighting is wanted.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out validation split and print per-class metrics
y_pred = clf.predict(X_val_tfidf)
report = classification_report(y_val, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.93      0.38      0.54       217
           1       0.90      1.00      0.95      1264

    accuracy                           0.90      1481
   macro avg       0.92      0.69      0.74      1481
weighted avg       0.91      0.90      0.89      1481



# test

In [21]:
# Read the test records from disk and pull out the review text
with open("test.json", "r", encoding="utf-8") as test_file:
    test_data = json.loads(test_file.read())

df_test = pd.DataFrame(test_data)
X_test = df_test["reviews"].values

# Project the test reviews with the already-fitted vectorizer (transform only)
X_test_tfidf = vectorizer.transform(X_test)

# Predict a sentiment label for every test review
test_pred = clf.predict(X_test_tfidf)

# Assemble and write the submission file; ids are the row positions 0..n-1.
# NOTE(review): confirm test.json does not carry its own id field that the
# submission is expected to use instead.
submission_columns = {"id": range(len(test_pred)), "sentiments": test_pred}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv has been created.")


✅ submission.csv has been created.
