In [19]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import time
from datetime import datetime

In [20]:
# Download necessary NLTK data
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [21]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

VALID_ASPECTS = ['price', 'camera', 'battery', 'display', 'design', 'software', 'cpu/gpu', 'memory', 'network']


In [22]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    words = nltk.word_tokenize(str(text).lower())
    tagged_words = nltk.pos_tag(words)
    processed_words = [lemmatizer.lemmatize(word) for word, tag in tagged_words
                       if (tag.startswith('JJ') or tag.startswith('NN')) and word not in stop_words]
    return ' '.join(processed_words)


In [23]:
def create_knn_model(aspects, reviews):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(reviews)
    y = aspects

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    knn = KNeighborsClassifier(n_neighbors=8)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    print(classification_report(y_test, y_pred))

    return knn, vectorizer


In [24]:
def classify_aspects_knn(model, vectorizer, reviews):
    X = vectorizer.transform(reviews)
    return model.predict(X)

In [41]:
def main():
    input_filename = '/content/reviews.csv'  # Using the sampled review file

    # Start overall timing
    overall_start_time = time.time()

    # Load data into DataFrame
    print("Loading data...")
    load_start_time = time.time()
    df = pd.read_csv(input_filename)
    df = df.head(1000)
    required_columns = ['body', 'rating', 'asin', 'name']
    if not all(col in df.columns for col in required_columns):
        raise KeyError(f"Missing columns in the input file. Required columns: {required_columns}")
    df = df.dropna(subset=['body', 'asin', 'name']).reset_index(drop=True)
    load_end_time = time.time()
    print(f"Data loaded and cleaned. Shape: {df.shape}")
    print(f"Time taken to load data: {load_end_time - load_start_time:.2f} seconds")

    # Process all text data at once
    print("Processing text data...")
    preprocess_start_time = time.time()
    df['processed_text'] = df['body'].apply(preprocess_text)
    preprocess_end_time = time.time()
    print(f"Time taken to preprocess text: {preprocess_end_time - preprocess_start_time:.2f} seconds")

    # Create aspect labels for KNN training (This should ideally come from a labeled dataset)
    aspect_labels = []
    for review in df['processed_text']:
        aspects_in_review = [aspect for aspect in VALID_ASPECTS if aspect in review]
        if aspects_in_review:
            aspect_labels.append(aspects_in_review[0])  # Use the first matching aspect as a placeholder
        else:
            aspect_labels.append(VALID_ASPECTS[0])

    # Train KNN model for aspect classification
    print("Training KNN model...")
    knn_model, vectorizer = create_knn_model(aspect_labels, df['processed_text'].tolist())

    # Classify aspects using KNN model
    print("Classifying aspects with KNN model...")
    df['aspect'] = classify_aspects_knn(knn_model, vectorizer, df['processed_text'].tolist())

    # Initialize BERT model for sentiment analysis
    print("Initializing BERT model for sentiment analysis...")
    tokenizer = BertTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
    model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
    nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

    def get_sentiment(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        result = model(**inputs)
        label = result.logits.argmax().item()
        if label in [0, 1]:  # Assuming 0 and 1 correspond to negative labels
            return 'negative'
        elif label == 2:  # Assuming 2 corresponds to neutral label
            return 'neutral'
        else:  # Assuming 3 and 4 correspond to positive labels
            return 'positive'

    def get_sentiment_score(text):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        result = model(**inputs)
        label = result.logits.argmax().item()
        return label  # Return the label directly for averaging

    # Analyze sentiment with BERT model
    print("Analyzing sentiment with BERT model...")
    df['aspect_sentiments'] = df.apply(lambda row: {aspect: get_sentiment(row['body']) for aspect in VALID_ASPECTS if aspect in row['processed_text']}, axis=1)
    df['aspect_sentiment_scores'] = df.apply(lambda row: {aspect: get_sentiment_score(row['body']) for aspect in VALID_ASPECTS if aspect in row['processed_text']}, axis=1)
    df['average_sentiment_score'] = df['aspect_sentiment_scores'].apply(lambda x: sum(x.values()) / len(x) if x else None)

    # Generate output filename with timestamp
    input_base = input_filename.split('/')[-1].split('.')[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f'/content/knn_bert_aspect_sentiment_results_{input_base}_{timestamp}.csv'

    # Save results to CSV
    print("Saving results...")
    save_start_time = time.time()
    df[['body', 'rating', 'asin', 'name', 'aspect', 'aspect_sentiments', 'average_sentiment_score']].to_csv(output_filename, index=False)
    save_end_time = time.time()
    print(f"Time taken to save results: {save_end_time - save_start_time:.2f} seconds")

    # Overall execution time
    overall_end_time = time.time()
    print(f"\nTotal execution time: {overall_end_time - overall_start_time:.2f} seconds")

if __name__ == "__main__":
    main()



Loading data...
Data loaded and cleaned. Shape: (1000, 8)
Time taken to load data: 1.26 seconds
Processing text data...
Time taken to preprocess text: 4.47 seconds
Training KNN model...
              precision    recall  f1-score   support

     battery       0.00      0.00      0.00        34
      camera       0.00      0.00      0.00         8
      design       0.00      0.00      0.00         2
     display       0.00      0.00      0.00         1
      memory       0.00      0.00      0.00         1
       price       0.76      1.00      0.86       151
    software       0.00      0.00      0.00         3

    accuracy                           0.76       200
   macro avg       0.11      0.14      0.12       200
weighted avg       0.57      0.76      0.65       200

Classifying aspects with KNN model...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Initializing BERT model for sentiment analysis...
Analyzing sentiment with BERT model...
Saving results...
Time taken to save results: 0.02 seconds

Total execution time: 528.71 seconds


In [35]:
# Loading and inspecting the final data
final_data = pd.read_csv('/content/knn_bert_aspect_modeling_results_sampled_reviews_20240709_174755.csv')
unique_aspects = final_data['aspect'].unique()
unique_sentiments = final_data['aspect_sentiments'].unique()
print(unique_aspects)
print(unique_sentiments)

['price' 'camera']
['{}' "{'price': 'positive'}"
 "{'camera': 'positive', 'battery': 'positive', 'network': 'positive'}"
 "{'price': 'negative'}" "{'camera': 'positive', 'display': 'positive'}"
 "{'battery': 'neutral'}" "{'camera': 'positive'}"
 "{'price': 'positive', 'battery': 'positive', 'design': 'positive'}"
 "{'battery': 'positive'}" "{'camera': 'negative', 'design': 'negative'}"
 "{'camera': 'positive', 'battery': 'positive', 'display': 'positive', 'software': 'positive', 'network': 'positive'}"
 "{'price': 'negative', 'camera': 'negative'}" "{'network': 'neutral'}"
 "{'memory': 'positive'}" "{'camera': 'negative', 'battery': 'negative'}"
 "{'price': 'neutral', 'camera': 'neutral'}"
 "{'price': 'positive', 'software': 'positive'}"
 "{'camera': 'positive', 'battery': 'positive', 'software': 'positive'}"
 "{'price': 'negative', 'battery': 'negative'}"
 "{'battery': 'positive', 'memory': 'positive'}"
 "{'price': 'positive', 'battery': 'positive', 'display': 'positive'}"
 "{'battery

In [37]:
final_data


Unnamed: 0,body,rating,asin,name,aspect,aspect_sentiments,average_sentiment_score
0,Nothing but flying star's for me here I had a ...,5,B01N9XOXCK,magnum688,price,{},
1,Really awesome phone i love it,5,B07NZVM3RN,ian herrington,price,{},
2,Awesome phone !!! Although it’s a lite version...,5,B07CMBB6PH,Martin,price,{'price': 'positive'},4.0
3,Great camera. Window is amazing. Only compatib...,2,B00TRLXO6U,Amazon Customer,camera,"{'camera': 'positive', 'battery': 'positive', ...",3.0
4,Un gran teléfono y el color es genial,5,B07X5VF1FM,luis francisco espinoza,price,{},
...,...,...,...,...,...,...,...
95,I approve and will recommend this product to o...,5,B07NZXXZB2,Jay Lumbre,price,{},
96,I purchased this phone (in Midnight Black) dir...,5,B07K76LBLZ,D. Force,price,"{'price': 'positive', 'camera': 'positive', 'b...",4.0
97,I just have a question about the sim card. Is ...,2,B01CYYYRNK,Jeannette Bowersox,price,{},
98,Very Good!,5,B00VH2TWBS,Oseas Illú,price,{},


In [38]:
# prompt: Using dataframe final_data: find all the data from final_data which dont have any nan value in average_sentiment_score

final_data.dropna(subset=['average_sentiment_score'])


Unnamed: 0,body,rating,asin,name,aspect,aspect_sentiments,average_sentiment_score
2,Awesome phone !!! Although it’s a lite version...,5,B07CMBB6PH,Martin,price,{'price': 'positive'},4.0
3,Great camera. Window is amazing. Only compatib...,2,B00TRLXO6U,Amazon Customer,camera,"{'camera': 'positive', 'battery': 'positive', ...",3.0
5,This phone was terrible! Every time you would ...,1,B06X9HVVC5,Amazon Customer,price,{'price': 'negative'},0.0
8,I had several S4's with problems and switched ...,5,B00F2SKPIM,rainier63,price,"{'camera': 'positive', 'display': 'positive'}",3.0
9,"The features are alright, but nothing amazing....",1,B06Y16RL4W,Scari,price,{'price': 'negative'},1.0
10,So far the glass and back are just fine. Batte...,2,B07HKPMFZ5,Jac,price,{'battery': 'neutral'},2.0
11,Great phone. No problems whatsoever. Camera co...,4,B00MWI4KKE,Arrtoo,price,{'camera': 'positive'},3.0
13,Incredible phone for the price point. There is...,5,B07XVZXR5Y,乇乂丅尺卂丅卄工匚匚,price,"{'price': 'positive', 'battery': 'positive', '...",3.0
14,The only problem with this phone is I had to s...,5,B07SBJPYLW,Michael King,camera,{'camera': 'positive'},3.0
18,Great phone have had it for a while now and I ...,5,B07N4M412B,Alex maynard,price,{'battery': 'positive'},4.0
