In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
file_path = r"C:\Users\muski\Desktop\NewProject\Part II\Software.json"

In [3]:
# Create empty lists to store data
reviewerID_list = []
asin_list = []
reviewerName_list = []
vote_list = []
style_list = []
reviewText_list = []
overall_list = []
summary_list = []
unixReviewTime_list = []
reviewTime_list = []

In [4]:
# Read JSON data
with open(file_path, 'r') as file:
    for line in file:
        try:
            data = json.loads(line)

            # Append data to respective lists
            reviewerID_list.append(data.get('reviewerID'))
            asin_list.append(data.get('asin'))
            reviewerName_list.append(data.get('reviewerName'))
            vote_list.append(data.get('vote'))
            style_list.append(data.get('style'))
            reviewText_list.append(data.get('reviewText'))
            overall_list.append(data.get('overall'))
            summary_list.append(data.get('summary'))
            unixReviewTime_list.append(data.get('unixReviewTime'))
            reviewTime_list.append(data.get('reviewTime'))

        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")

In [5]:
# Create a DataFrame
df = pd.DataFrame({
    'reviewerID': reviewerID_list,
    'asin': asin_list,
    'reviewerName': reviewerName_list,
    'vote': vote_list,
    'style': style_list,
    'reviewText': reviewText_list,
    'overall': overall_list,
    'summary': summary_list,
    'unixReviewTime': unixReviewTime_list,
    'reviewTime': reviewTime_list
})

In [6]:
df.tail()

Unnamed: 0,reviewerID,asin,reviewerName,vote,style,reviewText,overall,summary,unixReviewTime,reviewTime
459431,AGEWYJ2NF5C2H,B01HF41TKI,Bonita Alferes,,,No instructions.....No Help unless you want to...,2.0,Two Stars,1480377600,"11 29, 2016"
459432,A3VCFV8WEQG9R5,B01HF3G4BS,mekonen,,,it's a joke,1.0,One Star,1519862400,"03 1, 2018"
459433,A3DXGHJF6SOHNC,B01HF3G4BS,bbeckham,,,I have multiple licenses of the Antivirus. I h...,5.0,This is very effective antivirus software.,1512172800,"12 2, 2017"
459434,A1WOS4D7QA06DO,B01HJAMWOK,Charles E. Potter,,,good value,5.0,Five Stars,1536969600,"09 15, 2018"
459435,A20SG9ZGIIFW69,B01HJAMWOK,Joe,,,very nice designs easy to use.,5.0,Five Stars,1475625600,"10 5, 2016"


In [7]:
# Convert 'vote' column to numeric
df['vote'] = pd.to_numeric(df['vote'], errors='coerce')

In [8]:
# Create a binary label for helpfulness (e.g., if 'vote' is above a certain threshold, label it as 1, else label it as 0)
df['helpful'] = (df['vote'] > 5).astype(int)

In [9]:
#check for null values
print(df['reviewText'].isnull().sum())

66


In [10]:
#replace nan with empty strings
df['reviewText'].fillna('', inplace=True)

#### Feature Engineering: Text vectorization using TF-IDF for 'reviewText'

In [11]:

tfidf_vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X_text = tfidf_vectorizer.fit_transform(df['reviewText'])

In [12]:
# Concatenate TF-IDF features with other relevant features
X_features = pd.concat([df[['overall']], pd.DataFrame(X_text.toarray())], axis=1)


In [13]:
print(df['helpful'].shape)

(459436,)


In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_features, df['helpful'], test_size=0.2, random_state=42)

#### The XGBoost (Extreme Gradient Boosting) model was used. XGBClassifier for implementation of gradient boosting

In [15]:
# Initialize and train the XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

In [16]:
# Make predictions on the testing set
y_pred = model.predict(X_test)


In [17]:
# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8896265018283127

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.98      0.94     81337
           1       0.56      0.18      0.27     10551

    accuracy                           0.89     91888
   macro avg       0.73      0.58      0.60     91888
weighted avg       0.86      0.89      0.86     91888


Confusion Matrix:
 [[79895  1442]
 [ 8700  1851]]


#### 1. The overall accuracy of the model is 88.96%, which indicates the proportion of correctly predicted instances among all instances. 
#### 2. A precision of 56% means that among the instances predicted as helpful, 56% were actually helpful
#### 3. Recall of 18% indicates that the model identified only 18% of the actual helpful instances.
#### 4. The harmonic mean F1-score is relatively low, indicating a trade-off between precision and recall.


The model performs well in identifying non-helpful instances (high precision and recall for class 0).
However, the performance is weaker for identifying helpful instances (low precision and recall for class 1).
The class imbalance is evident from the large number of non-helpful instances (class 0) compared to helpful instances (class 1).


1. Given the class imbalance, I will consider techniques like oversampling, undersampling, or adjusting class weights to improve the model's performance on the minority class.
2. Fine-tune hyperparameters, especially those related to handling imbalanced data and model complexity.
3. Consider incorporating additional features or experimenting with different models to capture more nuanced patterns in the data.