In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kritanjalijain/amazon-reviews")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\Micha\.cache\kagglehub\datasets\kritanjalijain\amazon-reviews\versions\2


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("data/train.csv", nrows=50000, header=None)
df.rename(columns={0: "Polarity", 1: "Title", 2: "Review"}, inplace=True)
print(df.shape)

(50000, 3)


In [3]:
df = df[["Polarity", "Review"]].reset_index(drop=True)
df.head(10)

Unnamed: 0,Polarity,Review
0,2,This sound track was beautiful! It paints the ...
1,2,I'm reading a lot of reviews saying that this ...
2,2,This soundtrack is my favorite music of all ti...
3,2,I truly like this soundtrack and I enjoy video...
4,2,"If you've played the game, you know how divine..."
5,2,I am quite sure any of you actually taking the...
6,1,"This is a self-published book, and if you want..."
7,2,I loved Whisper of the wicked saints. The stor...
8,2,I just finished reading Whisper of the Wicked ...
9,2,This was a easy to read book that made me want...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Polarity  50000 non-null  int64 
 1   Review    50000 non-null  object
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [5]:
df.isnull().sum()

Polarity    0
Review      0
dtype: int64

In [6]:
df["Polarity"].value_counts(normalize=True)

Polarity
2    0.51012
1    0.48988
Name: proportion, dtype: float64

In [7]:
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Micha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    # Convert text to lowercase and remove URLs, text in square brackets, punctuation, and words with numbers
    text = text.lower()
    pattern = r"https?://\S+|www\.\S+|\[.*?\]|[^a-zA-Z\s]+|\w*\d\w*"
    text = re.sub(pattern, "", text)

    # Split and filter out stop words
    filtered_words = [word for word in text.split() if word not in stop_words]

    return " ".join(filtered_words).strip()

In [9]:
df["Review"] = df["Review"].apply(preprocess_text)

In [None]:
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Micha\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [22]:
stemmer = SnowballStemmer("english")


def stem_text(text):
    # Tokenize the input text
    words = nltk.word_tokenize(text)

    # Apply stemming to each word
    stemmed_words = []
    for word in words:
        stemmed_words.append(stemmer.stem(word))

    # Return the stemmed words as a single string
    return " ".join(stemmed_words)

In [None]:
# Apply stem_text to all data
df["Review"] = df["Review"].apply(stem_text)

In [10]:
df.head(10)

Unnamed: 0,Polarity,Review
0,2,sound track beautiful paints senery mind well ...
1,2,im reading lot reviews saying best game soundt...
2,2,soundtrack favorite music time hands intense s...
3,2,truly like soundtrack enjoy video game music p...
4,2,youve played game know divine music every sing...
5,2,quite sure actually taking time read played ga...
6,1,selfpublished book want know whyread paragraph...
7,2,loved whisper wicked saints story amazing plea...
8,2,finished reading whisper wicked saints fell lo...
9,2,easy read book made want keep reading easy put...


In [24]:
# Instantiate the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(df["Review"])
y_train = df["Polarity"]

# Initialize the classifier
clf = LinearSVC()

# Train the classifier
clf.fit(X_train_tfidf, y_train)

In [25]:
test_df = pd.read_csv("data/test.csv", header=None, nrows=50000)
test_df.columns = ["Polarity", "Title", "Review"]
test_df = test_df[["Polarity", "Review"]].reset_index(drop=True)

In [26]:
test_df["Review"] = test_df["Review"].apply(preprocess_text)
test_df["Review"] = test_df["Review"].apply(stem_text)

In [None]:
X_test_tfidf = tfidf_vectorizer.transform(test_df["Review"])

In [None]:
y_test = test_df["Polarity"]
y_pred = clf.predict(X_test_tfidf)

In [29]:
# Printing out results
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.82708

Classification Report:
               precision    recall  f1-score   support

           1       0.85      0.79      0.82     24626
           2       0.81      0.86      0.83     25374

    accuracy                           0.83     50000
   macro avg       0.83      0.83      0.83     50000
weighted avg       0.83      0.83      0.83     50000



In [None]:
def predict_sentiment(text):
    # Preprocess and stem the input text
    preprocessed_text = preprocess_text(text)
    stemmed_text = stem_text(preprocessed_text)

    # Transform the text into features using the TF-IDF vectorizer
    features = tfidf_vectorizer.transform([stemmed_text])

    # Predict sentiment using the classifier
    prediction = clf.predict(features)[0]

    # Return the sentiment label
    return "Negative" if prediction == 1 else "Positive"

In [None]:
sentiment = predict_sentiment(
    "I absolutely love this app! It's intuitive and works flawlessly."
)
print(sentiment)

Positive


In [34]:
sentiment = predict_sentiment(
    "The worst purchase I've made. The product broke within a week and the customer service was unhelpful."
)
print(sentiment)

Negative


In [None]:
sentiment = predict_sentiment(
    "The product didn't work as expected. Very disappointed with the performance."
)
print(sentiment)

Negative
