In [19]:
# Step 1: Data Exploration and Preprocessing
# 1.1 Load the JSON data and convert it to a pandas DataFrame
import pandas as pd
import json

# Load the JSON data
file_path = '/Users/kikumarm/CodeRepo/ML_Project/dataset.json/Electronics_5.json'

# Read the file line by line and parse each JSON object
data = []
with open(file_path, 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Take only the first 10,000 records
data = data[:10000]

# Convert to pandas DataFrame
df = pd.DataFrame(data)

print(df.head())


       reviewerID        asin              reviewerName   helpful  \
0   AO94DHGC771SJ  0528881469                   amazdnu    [0, 0]   
1   AMO214LNFCEI4  0528881469           Amazon Customer  [12, 15]   
2  A3N7T0DY83Y4IG  0528881469             C. A. Freeman  [43, 45]   
3  A1H8PY3QHMQQA0  0528881469  Dave M. Shaw "mack dave"   [9, 10]   
4  A24EV6RXELQZ63  0528881469               Wayne Smith    [0, 0]   

                                          reviewText  overall  \
0  We got this GPS for my husband who is an (OTR)...      5.0   
1  I'm a professional OTR truck driver, and I bou...      1.0   
2  Well, what can I say.  I've had this unit in m...      3.0   
3  Not going to write a long review, even thought...      2.0   
4  I've had mine for a year and here's what we go...      1.0   

                                  summary  unixReviewTime   reviewTime  
0                         Gotta have GPS!      1370131200   06 2, 2013  
1                       Very Disappointed      1

In [20]:
# 1.2 Remove duplicates and handle missing values
# Remove duplicates
# Split the 'helpful' column into two separate columns
df[['helpful_votes', 'unhelpful_votes']] = pd.DataFrame(df['helpful'].tolist(), index=df.index)

# Drop the original 'helpful' column if it's no longer needed
df.drop(columns=['helpful'], inplace=True)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Display the DataFrame info to confirm duplicates are removed
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   reviewerID       10000 non-null  object 
 1   asin             10000 non-null  object 
 2   reviewerName     9962 non-null   object 
 3   reviewText       10000 non-null  object 
 4   overall          10000 non-null  float64
 5   summary          10000 non-null  object 
 6   unixReviewTime   10000 non-null  int64  
 7   reviewTime       10000 non-null  object 
 8   helpful_votes    10000 non-null  int64  
 9   unhelpful_votes  10000 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 781.4+ KB
None


In [21]:
# # Handle missing values (example: fill NaNs with empty strings)
df.fillna('', inplace=True)

# # Display the DataFrame info to check for missing values
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   reviewerID       10000 non-null  object 
 1   asin             10000 non-null  object 
 2   reviewerName     10000 non-null  object 
 3   reviewText       10000 non-null  object 
 4   overall          10000 non-null  float64
 5   summary          10000 non-null  object 
 6   unixReviewTime   10000 non-null  int64  
 7   reviewTime       10000 non-null  object 
 8   helpful_votes    10000 non-null  int64  
 9   unhelpful_votes  10000 non-null  int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 781.4+ KB
None


In [22]:
# 1.3 Clean and normalize text data
import re

def clean_text(text):
    # Remove special characters and lowercase the text
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    return text

# Apply the clean_text function to the 'reviewText' and 'summary' columns
df['reviewText'] = df['reviewText'].apply(clean_text)
df['summary'] = df['summary'].apply(clean_text)

# Combine 'summary' and 'reviewText' fields for more context
df['combined_text'] = df['summary'] + ' ' + df['reviewText']

# Display the first few rows of the DataFrame
print(df[['combined_text']].head())


                                       combined_text
0  gotta have gps we got this gps for my husband ...
1  very disappointed im a professional otr truck ...
2  1st impression well what can i say  ive had th...
3  great grafics poor gps not going to write a lo...
4  major issues only excuses for support ive had ...


In [23]:
# 1.4 Create multi-label categories based on review content
# Define categories based on common themes in Amazon reviews

categories = [
    'Product Quality', 
    'Customer Service', 
    'Price', 
    'Functionality', 
    'Ease of Use', 
    'Delivery Experience', 
    'Durability', 
    'Packaging', 
    'Value for Money', 
    'Others'
]

def categorize_review(text):
    labels = []
    if any(keyword in text for keyword in ['quality', 'excellent', 'superior', 'poor', 'bad quality']):
        labels.append('Product Quality')
    if any(keyword in text for keyword in ['service', 'support', 'customer care', 'helpful', 'response']):
        labels.append('Customer Service')
    if any(keyword in text for keyword in ['price', 'cost', 'expensive', 'cheap', 'affordable']):
        labels.append('Price')
    if any(keyword in text for keyword in ['function', 'feature', 'performance', 'capability', 'operation']):
        labels.append('Functionality')
    if any(keyword in text for keyword in ['easy', 'simple', 'user-friendly', 'intuitive', 'convenient']):
        labels.append('Ease of Use')
    if any(keyword in text for keyword in ['delivery', 'shipping', 'arrival', 'on time', 'late']):
        labels.append('Delivery Experience')
    if any(keyword in text for keyword in ['durable', 'sturdy', 'long-lasting', 'reliable', 'robust']):
        labels.append('Durability')
    if any(keyword in text for keyword in ['packaging', 'box', 'wrap', 'sealed', 'damaged packaging']):
        labels.append('Packaging')
    if any(keyword in text for keyword in ['value', 'worth', 'bang for the buck', 'investment', 'reasonable']):
        labels.append('Value for Money')
    if not labels:
        labels.append('Others')
    return labels

# Apply the categorize_review function to the 'combined_text' column
df['categories'] = df['combined_text'].apply(categorize_review)

# Display the first few rows of the DataFrame
print(df[['combined_text', 'categories']].head())

                                       combined_text  \
0  gotta have gps we got this gps for my husband ...   
1  very disappointed im a professional otr truck ...   
2  1st impression well what can i say  ive had th...   
3  great grafics poor gps not going to write a lo...   
4  major issues only excuses for support ive had ...   

                                      categories  
0                          [Delivery Experience]  
1                [Price, Ease of Use, Packaging]  
2                         [Price, Functionality]  
3  [Product Quality, Functionality, Ease of Use]  
4              [Customer Service, Functionality]  


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   reviewerID       10000 non-null  object 
 1   asin             10000 non-null  object 
 2   reviewerName     10000 non-null  object 
 3   reviewText       10000 non-null  object 
 4   overall          10000 non-null  float64
 5   summary          10000 non-null  object 
 6   unixReviewTime   10000 non-null  int64  
 7   reviewTime       10000 non-null  object 
 8   helpful_votes    10000 non-null  int64  
 9   unhelpful_votes  10000 non-null  int64  
 10  combined_text    10000 non-null  object 
 11  categories       10000 non-null  object 
dtypes: float64(1), int64(3), object(8)
memory usage: 937.6+ KB


In [None]:
# 1.5 Tokenize and encode text using BERT tokenizer
from transformers import BertTokenizer

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert the 'combined_text' column to a list
texts = df['combined_text'].tolist()

# Batch tokenize and encode the text
encoded = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=512,
    truncation=True,
    padding=True,  # Pads to the longest sequence in the batch
    return_tensors='np'  # Return NumPy arrays for faster processing
)

# Add input IDs back to the DataFrame
df['input_ids'] = list(encoded['input_ids'])

In [26]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,unixReviewTime,reviewTime,helpful_votes,unhelpful_votes,combined_text,categories,input_ids
0,AO94DHGC771SJ,528881469,amazdnu,we got this gps for my husband who is an otr o...,5.0,gotta have gps,1370131200,"06 2, 2013",0,0,gotta have gps we got this gps for my husband ...,[Delivery Experience],"[101, 10657, 2031, 14658, 2057, 2288, 2023, 14..."
1,AMO214LNFCEI4,528881469,Amazon Customer,im a professional otr truck driver and i bough...,1.0,very disappointed,1290643200,"11 25, 2010",12,15,very disappointed im a professional otr truck ...,"[Price, Ease of Use, Packaging]","[101, 2200, 9364, 10047, 1037, 2658, 27178, 20..."
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,well what can i say ive had this unit in my t...,3.0,1st impression,1283990400,"09 9, 2010",43,45,1st impression well what can i say ive had th...,"[Price, Functionality]","[101, 3083, 8605, 2092, 2054, 2064, 1045, 2360..."
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""",not going to write a long review even thought ...,2.0,great grafics poor gps,1290556800,"11 24, 2010",9,10,great grafics poor gps not going to write a lo...,"[Product Quality, Functionality, Ease of Use]","[101, 2307, 22160, 6558, 3532, 14658, 2025, 21..."
4,A24EV6RXELQZ63,528881469,Wayne Smith,ive had mine for a year and heres what we got ...,1.0,major issues only excuses for support,1317254400,"09 29, 2011",0,0,major issues only excuses for support ive had ...,"[Customer Service, Functionality]","[101, 2350, 3314, 2069, 21917, 2005, 2490, 492..."


In [30]:
# 1.6 Create multi-hot encoded label vectors for the categories
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=categories)

# Fit and transform the categories
df['label_vectors'] = list(mlb.fit_transform(df['categories']))

# Display the first few rows of the DataFrame
print(df[['categories', 'input_ids', 'label_vectors']].head())

                                      categories  \
0                          [Delivery Experience]   
1                [Price, Ease of Use, Packaging]   
2                         [Price, Functionality]   
3  [Product Quality, Functionality, Ease of Use]   
4              [Customer Service, Functionality]   

                                           input_ids  \
0  [101, 10657, 2031, 14658, 2057, 2288, 2023, 14...   
1  [101, 2200, 9364, 10047, 1037, 2658, 27178, 20...   
2  [101, 3083, 8605, 2092, 2054, 2064, 1045, 2360...   
3  [101, 2307, 22160, 6558, 3532, 14658, 2025, 21...   
4  [101, 2350, 3314, 2069, 21917, 2005, 2490, 492...   

                    label_vectors  
0  [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]  
1  [0, 0, 1, 0, 1, 0, 0, 1, 0, 0]  
2  [0, 0, 1, 1, 0, 0, 0, 0, 0, 0]  
3  [1, 0, 0, 1, 1, 0, 0, 0, 0, 0]  
4  [0, 1, 0, 1, 0, 0, 0, 0, 0, 0]  


In [32]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,reviewText,overall,summary,unixReviewTime,reviewTime,helpful_votes,unhelpful_votes,combined_text,categories,input_ids,label_vectors
0,AO94DHGC771SJ,528881469,amazdnu,we got this gps for my husband who is an otr o...,5.0,gotta have gps,1370131200,"06 2, 2013",0,0,gotta have gps we got this gps for my husband ...,[Delivery Experience],"[101, 10657, 2031, 14658, 2057, 2288, 2023, 14...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,AMO214LNFCEI4,528881469,Amazon Customer,im a professional otr truck driver and i bough...,1.0,very disappointed,1290643200,"11 25, 2010",12,15,very disappointed im a professional otr truck ...,"[Price, Ease of Use, Packaging]","[101, 2200, 9364, 10047, 1037, 2658, 27178, 20...","[0, 0, 1, 0, 1, 0, 0, 1, 0, 0]"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,well what can i say ive had this unit in my t...,3.0,1st impression,1283990400,"09 9, 2010",43,45,1st impression well what can i say ive had th...,"[Price, Functionality]","[101, 3083, 8605, 2092, 2054, 2064, 1045, 2360...","[0, 0, 1, 1, 0, 0, 0, 0, 0, 0]"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""",not going to write a long review even thought ...,2.0,great grafics poor gps,1290556800,"11 24, 2010",9,10,great grafics poor gps not going to write a lo...,"[Product Quality, Functionality, Ease of Use]","[101, 2307, 22160, 6558, 3532, 14658, 2025, 21...","[1, 0, 0, 1, 1, 0, 0, 0, 0, 0]"
4,A24EV6RXELQZ63,528881469,Wayne Smith,ive had mine for a year and heres what we got ...,1.0,major issues only excuses for support,1317254400,"09 29, 2011",0,0,major issues only excuses for support ive had ...,"[Customer Service, Functionality]","[101, 2350, 3314, 2069, 21917, 2005, 2490, 492...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0]"


In [40]:
# STEP 1:
# To handle class imbalance in your dataset, you can use techniques like oversampling the minority classes or applying class weights during model training. Here are two common approaches:
# 1. Oversampling the Minority Classes
# You can use the imblearn library's RandomOverSampler to oversample the minority classes.
# 2. Applying Class Weights During Model Training
# You can calculate class weights and pass them to the loss function during model training. This approach is useful when using libraries like PyTorch.

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Initialize the RandomOverSampler
ros = RandomOverSampler()

# Resample the dataset using the 'categories' column
X_resampled, y_resampled = ros.fit_resample(df['input_ids'], df['categories'])

# Convert resampled labels (lists of categories) back to multi-hot encoding
mlb = MultiLabelBinarizer(classes=categories)  # Use your predefined list of all possible categories
y_resampled_multi_hot = mlb.fit_transform(y_resampled)

# Convert back to DataFrame
df_resampled = pd.DataFrame({'input_ids': X_resampled, 'label_vectors': list(y_resampled_multi_hot)})

# Display the first few rows of the resampled DataFrame
print(df_resampled.head())

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.