In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from transformers import pipeline
import math

In [11]:
df = pd.read_json('/app/data/Software.jsonl', lines=True, nrows=100000)
df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,2019-07-03 19:37:12.076,0,False
1,5,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2015-02-16 20:58:56.000,0,True
2,5,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2013-03-04 12:14:27.000,0,True
3,4,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,2019-06-20 20:10:28.662,0,True
4,4,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,2014-12-11 00:19:56.000,0,True


In [12]:
df.shape

(100000, 10)

**Number of Observations**

In [3]:
#with open('/app/data/Software.jsonl', 'r', encoding='utf-8') as f:
 #   line_count = sum(1 for _ in f)

#print(f"Total lines (entries) in dataset: {line_count}")X


Total lines (entries) in dataset: 4880181


**Using a pretrained sentiment model (distilbert-base-uncased-finetuned-sst-2-english) BERT to auto-label our data**

<pre>
When you use this line:
     sentiment_pipeline = pipeline("sentiment-analysis")
     
You're getting:

-Tokenization
-Lowercasing
-Truncation/padding
-Feature extraction (embeddings)
-Model inference

All handled automatically inside the pipeline. So no need for:

-Removing punctuation
-Stemming/lemmatization
-TF-IDF or CountVectorizer§
-Stopword removal
    </pre>

In [13]:
#pip install transformers torch pandas


In [14]:
#Drop rows with missing title or text
df = df.dropna(subset=["title", "text"])

In [17]:
#verification
print("shape after dropping missing data : ", df.shape) #no missing data
df.head()

shape after dropping missing data :  (100000, 10)


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,1,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,2019-07-03 19:37:12.076,0,False
1,5,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2015-02-16 20:58:56.000,0,True
2,5,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2013-03-04 12:14:27.000,0,True
3,4,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,2019-06-20 20:10:28.662,0,True
4,4,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,2014-12-11 00:19:56.000,0,True


In [None]:
#Stratified sampling: take 2% from each rating group
#sample_fraction = 0.02  #Change this depending on your RAM
#Our dataset has 4,880,181 rows meaning:
#0.02 × 4,880,181 = ~97,603 rows will be in df_sample
#df_sample = df.groupby("rating", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=42))


In [None]:
#Reset index
#df_sample = df_sample.reset_index(drop=True)

In [18]:
#Combine title + text into one column to give the model more context
df["full_text"] = df["title"].astype(str) + ". " + df["text"].astype(str)

In [23]:
#verification
print("Data after combining title and text in full_text: ")
df.head()

Data after combining title and text in full_text: 


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,full_text
0,1,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,2019-07-03 19:37:12.076,0,False,malware. mcaffee IS malware
1,5,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2015-02-16 20:58:56.000,0,True,Lots of Fun. I love playing tapped out because...
2,5,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2013-03-04 12:14:27.000,0,True,Light Up The Dark. I love this flashlight app!...
3,4,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,2019-06-20 20:10:28.662,0,True,Fun game. One of my favorite games
4,4,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,2014-12-11 00:19:56.000,0,True,I am not that good at it but my kids are. Cute...


In [21]:
#Load pretrained sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


In [24]:
#Run sentiment analysis in batches
batch_size = 500
results = []

for i in range(0, len(df), batch_size):
    print(f"Processing batch {i} to {i+batch_size}")
    batch_texts = df["full_text"][i:i+batch_size].tolist()
    batch_results = sentiment_pipeline(batch_texts, truncation=True)
    results.extend(batch_results)

Processing batch 0 to 500
Processing batch 500 to 1000
Processing batch 1000 to 1500
Processing batch 1500 to 2000
Processing batch 2000 to 2500
Processing batch 2500 to 3000
Processing batch 3000 to 3500
Processing batch 3500 to 4000
Processing batch 4000 to 4500
Processing batch 4500 to 5000
Processing batch 5000 to 5500
Processing batch 5500 to 6000
Processing batch 6000 to 6500
Processing batch 6500 to 7000
Processing batch 7000 to 7500
Processing batch 7500 to 8000
Processing batch 8000 to 8500
Processing batch 8500 to 9000
Processing batch 9000 to 9500
Processing batch 9500 to 10000
Processing batch 10000 to 10500
Processing batch 10500 to 11000
Processing batch 11000 to 11500
Processing batch 11500 to 12000
Processing batch 12000 to 12500
Processing batch 12500 to 13000
Processing batch 13000 to 13500
Processing batch 13500 to 14000
Processing batch 14000 to 14500
Processing batch 14500 to 15000
Processing batch 15000 to 15500
Processing batch 15500 to 16000
Processing batch 160

In [27]:
#Store sentiment and confidence scores ===
df["sentiment"] = [r["label"].lower() for r in results]
df["confidence"] = [r["score"] for r in results]

In [None]:
#Optional Step: Filtering Out Low-Confidence Predictions
#If we want cleaner labels, we can drop results with low model confidence (e.g., score < 0.8)
#df["confidence"] = [res["score"] for res in results]
#df_filtered = df[df["confidence"] >= 0.8]''

In [29]:
#verification
print("Data after adding sentiment label: ")
df[:100]

Data after adding sentiment label: 


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,full_text,sentiment,confidence
0,1,malware,mcaffee IS malware,[],B07BFS3G7P,B0BQSK9QCF,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,2019-07-03 19:37:12.076,0,False,malware. mcaffee IS malware,negative,0.999740
1,5,Lots of Fun,I love playing tapped out because it is fun to...,[],B00CTQ6SIG,B00CTQ6SIG,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2015-02-16 20:58:56.000,0,True,Lots of Fun. I love playing tapped out because...,positive,0.999827
2,5,Light Up The Dark,I love this flashlight app! It really illumin...,[],B0066WJLU6,B0066WJLU6,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,2013-03-04 12:14:27.000,0,True,Light Up The Dark. I love this flashlight app!...,positive,0.999873
3,4,Fun game,One of my favorite games,[],B00KCYMAWK,B00KCYMAWK,AH6CATODIVPVUOJEWHRSRCSKAOHA,2019-06-20 20:10:28.662,0,True,Fun game. One of my favorite games,positive,0.999882
4,4,I am not that good at it but my kids are,Cute game. I am not that good at it but my kid...,[],B00P1RK566,B00P1RK566,AEINY4XOINMMJCK5GZ3M6MMHBN6A,2014-12-11 00:19:56.000,0,True,I am not that good at it but my kids are. Cute...,positive,0.999729
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,Good,Good for kids,[],B071J8FCMQ,B071J8FCMQ,AHFZUNQFXSVVT6Z6BYKE5CLBX3KQ,2020-10-03 19:28:07.816,6,True,Good. Good for kids,positive,0.999848
96,3,Good,Good for kids,[],B0742686QC,B0742686QC,AHFZUNQFXSVVT6Z6BYKE5CLBX3KQ,2020-10-03 19:25:57.388,5,True,Good. Good for kids,positive,0.999848
97,3,Good delivery,Fun for kids,[],B018QOE09S,B018QOE09S,AHFZUNQFXSVVT6Z6BYKE5CLBX3KQ,2020-10-03 19:18:38.190,0,True,Good delivery. Fun for kids,positive,0.999877
98,3,Good delivery,Good for kids,[],B075QZCSJ6,B075QZCSJ6,AHFZUNQFXSVVT6Z6BYKE5CLBX3KQ,2020-10-03 19:13:02.680,1,True,Good delivery. Good for kids,positive,0.999859


In [30]:
#Save the labeled sample
df.to_csv("sample_labeled_sentiment.csv", index=False)
print("Sentiment labeling complete. Saved to sample_labeled_sentiment.csv")

Sentiment labeling complete. Saved to sample_labeled_sentiment.csv


In [33]:
sentiment_counts = df["sentiment"].value_counts()
sentiment_percentages = (sentiment_counts / len(df)) * 100
sentiment_summary = pd.DataFrame({
    "count": sentiment_counts,
    "percentage": sentiment_percentages.round(2)
})
print(sentiment_summary)

           count  percentage
sentiment                   
positive   65440       65.44
negative   34560       34.56
