In [5]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from transformers import BertModel, BertTokenizer
import torch

def load_tsv_data(file_path):
    data = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                review = parts[0]
                features = parts[1:]
                for feature in features:
                    feature_parts = feature.split()
                    if len(feature_parts) >= 4:
                        aspect_term_indices = feature_parts[0].split(',')
                        sentiment = feature_parts[2]
                        sentiment_term_indices = feature_parts[3].split(',')
                        
                        if aspect_term_indices[0] == '-1':
                            aspect_term = 'NULL'
                        else:
                            start, end = int(aspect_term_indices[0]), int(aspect_term_indices[1])
                            aspect_term = ' '.join(review.split()[start:end])
                        
                        if sentiment_term_indices[0] == '-1':
                            sentiment_term = 'NULL'
                        else:
                            start, end = int(sentiment_term_indices[0]), int(sentiment_term_indices[1])
                            sentiment_term = ' '.join(review.split()[start:end])
                        
                        data.append({
                            'review': review,
                            'aspect_term': aspect_term,
                            'sentiment_term': sentiment_term,
                            'sentiment': sentiment
                        })
            else:
                print(f"Skipping malformed line: {line.strip()}")
    
    return pd.DataFrame(data)

def train_and_tune_model(X_train, y_train, X_dev, y_dev):
    try:
        # define the parameter grid
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto', 0.1, 1]
        }
        
        # perform grid search with cross-validation
        grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # get the best model
        best_model = grid_search.best_estimator_
        
        # evaluate the best model on the dev
        dev_accuracy = best_model.score(X_dev, y_dev)
        print(f"Best model parameters: {grid_search.best_params_}")
        print(f"Development set accuracy: {dev_accuracy}")
        
        return best_model
    except Exception as e:
        print(f"Error during model training and tuning: {e}")
        return None

def evaluate_model(model, X_test, y_test):
    try:
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        return accuracy, report
    except Exception as e:
        print(f"Error during model evaluation: {e}")
        return None, None
    
def extract_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # combine review, aspect term, and sentiment term into a single text
        combined_text = f"{review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(features)

def baseline_experiment(train_data, dev_data, test_data, model, tokenizer):
    
    print("\nExtracting features...")
    X_train = extract_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

In [7]:
if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning baseline experiment...")
    baseline_classifier, baseline_le, baseline_accuracy, baseline_report = baseline_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nBaseline Experiment Results:")
    print(f"Accuracy: {baseline_accuracy}")
    print("Classification Report:")
    print(baseline_report)

    print("\nBaseline Experiments completed.")


Running baseline experiment...

Extracting features...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Development set accuracy: 0.8582375478927203
Evaluating model...

Baseline Experiment Results:
Accuracy: 0.8482532751091703
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.83      0.74       205
           1       0.56      0.11      0.19        44
           2       0.92      0.90      0.91       667

    accuracy                           0.85       916
   macro avg       0.71      0.62      0.61       916
weighted avg       0.85      0.85      0.84       916


Baseline Experiments completed.


In [6]:
import requests
from tqdm import tqdm
import pickle
import os
import time
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import logging
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack, csr_matrix
import nltk
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords

nltk.data.path.append("D:\\nltk_data")

# cache file for storing ConceptNet data
cache_file = 'conceptnet_cache.pkl'

def load_cache():
    global conceptnet_cache
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            conceptnet_cache = pickle.load(f)
    else:
        conceptnet_cache = {}

def save_cache():
    with open(cache_file, 'wb') as f:
        pickle.dump(conceptnet_cache, f)

def get_related_terms(word, max_retries=3, retry_delay=5):
    if word in conceptnet_cache:
        return conceptnet_cache[word]

    url = f"http://api.conceptnet.io/c/en/{word}?limit=50"  # increase limit for more results
    for attempt in range(max_retries):
        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            related_terms = set()  # store related terms in a set to avoid duplicates
            for edge in data.get('edges', []):
                rel = edge.get('rel', {}).get('label')
                end = edge.get('end', {})
                if isinstance(end, dict) and end.get('language') == 'en':
                    label = end.get('label')
                    # only keep related terms that are different from the original word
                    if label and label != word and rel in ['IsA', 'RelatedTo', 'Synonym', 'HasA', 'PartOf', 'UsedFor', 'CapableOf']:
                        related_terms.add(label)
            conceptnet_cache[word] = list(related_terms)  # convert set to list before storing
            return conceptnet_cache[word]
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                print(f"Error fetching data for '{word}'. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Failed to fetch data for '{word}' after {max_retries} attempts.")
                logging.error(f"Failed to fetch data for '{word}': {str(e)}")
                return []
        except KeyError as e:
            print(f"Unexpected data structure for '{word}'. Skipping...")
            logging.error(f"Unexpected data structure for '{word}': {str(e)}\nData: {data}")
            return []
        except Exception as e:
            print(f"Unexpected error processing '{word}'. Skipping...")
            logging.error(f"Unexpected error processing '{word}': {str(e)}\nData: {data}")
            return []


review

In [114]:
def preprocess_text(text):
    # tokenize and tag words
    tokens = word_tokenize(text.lower())
    tagged = pos_tag(tokens)
    
    # keep only nouns
    nouns = [word for word, pos in tagged if pos.startswith('NN')]

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    nouns = [noun for noun in nouns if noun not in stop_words]
    
    return nouns

def enrich_text(text, max_related=5):
    # get_related_terms can only enter one word, so preprocess_text first is necessary 
    words = preprocess_text(text)
    enriched_words = []
    for word in words:
        enriched_words.append(word)
        related_terms = get_related_terms(word)
        # filter out words that are already in the text
        filtered_terms = [term for term in related_terms if term not in words]
        
        enriched_words.extend(filtered_terms[:max_related])
        # remian the original word
    return ' '.join(enriched_words)

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # enrich the review text with related terms from ConceptNet
        enriched_review = enrich_text(review)
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()  
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Development set accuracy: 0.7969348659003831
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.732532751091703
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.57      0.52       205
           1       1.00      0.02      0.04        44
           2       0.83      0.83      0.83       667

    accuracy                           0.73       916
   macro avg       0.77      0.47      0.46       916
weighted avg       0.76      0.73      0.72       916


Conceptnet Experiments completed.


2 aspect_term or sentiment_term 

In [3]:
def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use aspect term if available, otherwise use sentiment term
        primary_term = aspect_term if aspect_term else sentiment_term
        enriched_terms = enrich_text(primary_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()  
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Development set accuracy: 0.8582375478927203
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8329694323144105
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.80      0.71       205
           1       0.00      0.00      0.00        44
           2       0.90      0.90      0.90       667

    accuracy                           0.83       916
   macro avg       0.52      0.57      0.54       916
weighted avg       0.80      0.83      0.82       916


Conceptnet Experiments completed.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


3 aspect_term

In [4]:
def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use aspect term
        enriched_terms = enrich_text(aspect_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()  
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Development set accuracy: 0.8582375478927203
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8329694323144105
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.80      0.71       205
           1       0.00      0.00      0.00        44
           2       0.90      0.90      0.90       667

    accuracy                           0.83       916
   macro avg       0.52      0.57      0.54       916
weighted avg       0.80      0.83      0.82       916


Conceptnet Experiments completed.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


4 sentiment_term

In [6]:
def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()  
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Development set accuracy: 0.8544061302681992
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8482532751091703
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.82      0.74       205
           1       0.00      0.00      0.00        44
           2       0.92      0.91      0.91       667

    accuracy                           0.85       916
   macro avg       0.53      0.58      0.55       916
weighted avg       0.82      0.85      0.83       916


Conceptnet Experiments completed.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
# Compare results
print("\nComparison of Results:")
print(f"Baseline Accuracy: {baseline_accuracy}")
print(f"ConceptNet Accuracy: {conceptnet_accuracy}")
print(f"Accuracy Improvement: {conceptnet_accuracy - baseline_accuracy}")

print("\nDetailed Comparison:")
print("Baseline Classification Report:")
print(baseline_report)
print("\nConceptNet Classification Report:")
print(conceptnet_report)

print("\nExperiments completed.")


Comparison of Results:
Baseline Accuracy: 0.8482532751091703
ConceptNet Accuracy: 0.8482532751091703
Accuracy Improvement: 0.0

Detailed Comparison:
Baseline Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.83      0.74       205
           1       0.56      0.11      0.19        44
           2       0.92      0.90      0.91       667

    accuracy                           0.85       916
   macro avg       0.71      0.62      0.61       916
weighted avg       0.85      0.85      0.84       916


ConceptNet Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.82      0.74       205
           1       0.00      0.00      0.00        44
           2       0.92      0.91      0.91       667

    accuracy                           0.85       916
   macro avg       0.53      0.58      0.55       916
weighted avg       0.82      0.85      0.83       916


Experiments complete

5 sentiment_term else aspect_term

In [7]:
def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment term if available, otherwise use aspect term
        primary_term = sentiment_term if sentiment_term else aspect_term
        enriched_terms = enrich_text(primary_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()  
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train, y_train, X_dev, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Training and tuning model...
Best model parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Development set accuracy: 0.8544061302681992
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8482532751091703
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.82      0.74       205
           1       0.00      0.00      0.00        44
           2       0.92      0.91      0.91       667

    accuracy                           0.85       916
   macro avg       0.53      0.58      0.55       916
weighted avg       0.82      0.85      0.83       916


Conceptnet Experiments completed.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SelectKBest

In [13]:
def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

from sklearn.feature_selection import SelectKBest, f_classif

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # 优化特征选择
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_selected, y_train, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Training and tuning model...
Best model parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Development set accuracy: 0.8773946360153256
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.851528384279476
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.80      0.73       205
           1       0.60      0.14      0.22        44
           2       0.92      0.91      0.92       667

    accuracy                           0.85       916
   macro avg       0.73      0.62      0.62       916
weighted avg       0.85      0.85      0.84       916


Conceptnet Experiments completed.


SMOTE

In [9]:
from imblearn.over_sampling import SMOTE
    
def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # Optimal feature selection
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    # SMOTE for class balancing
    print("Applying SMOTE for class balancing...")
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_selected, y_train)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_resampled, y_train_resampled, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")



Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Applying SMOTE for class balancing...
Training and tuning model...
Best model parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Development set accuracy: 0.8697318007662835
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8395196506550219
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.78      0.71       205
           1       0.47      0.18      0.26        44
           2       0.91      0.90      0.91       667

    accuracy                           0.84       916
   macro avg       0.68      0.62      0.63       916
weighted avg       0.84      0.84      0.83       916


Conceptnet Experiments completed.


Optimize the model training process

In [3]:
def train_and_tune_model(X_train, y_train, X_dev, y_dev):
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto', 0.1, 1],
        'class_weight': [None, 'balanced'] # add class_weight parameter, set to 'balanced' to account for class imbalance
    }
    
    grid_search = GridSearchCV(SVC(probability=True), param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    dev_accuracy = best_model.score(X_dev, y_dev)
    print(f"Best model parameters: {grid_search.best_params_}")
    print(f"Development set accuracy: {dev_accuracy}")
    
    return best_model

def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # perform feature selection to reduce dimensionality 
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_selected, y_train, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Training and tuning model...
Best model parameters: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Development set accuracy: 0.8352490421455939
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8187772925764192
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.80      0.72       205
           1       0.23      0.32      0.27        44
           2       0.95      0.86      0.90       667

    accuracy                           0.82       916
   macro avg       0.61      0.66      0.63       916
weighted avg       0.85      0.82      0.83       916


Conceptnet Experiments completed.


try different classifier

RandomForestClassifier

In [4]:
from sklearn.ensemble import RandomForestClassifier

def train_and_tune_model(X_train, y_train, X_dev, y_dev):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'class_weight': [None, 'balanced']
    }
    
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    dev_accuracy = best_model.score(X_dev, y_dev)
    print(f"Best model parameters: {grid_search.best_params_}")
    print(f"Development set accuracy: {dev_accuracy}")
    
    return best_model

def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # perform feature selection to reduce dimensionality 
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_selected, y_train, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Training and tuning model...
Best model parameters: {'class_weight': 'balanced', 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Development set accuracy: 0.8275862068965517
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8165938864628821
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.65      0.66       205
           1       0.00      0.00      0.00        44
           2       0.86      0.92      0.89       667

    accuracy                           0.82       916
   macro avg       0.51      0.52      0.52       916
weighted avg       0.77      0.82      0.79       916


Conceptnet Experiments completed.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


xgboost classifier

In [10]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

def train_and_tune_model(X_train, y_train, X_dev, y_dev):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    grid_search = GridSearchCV(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'), param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    dev_accuracy = best_model.score(X_dev, y_dev)
    print(f"Best model parameters: {grid_search.best_params_}")
    print(f"Development set accuracy: {dev_accuracy}")
    
    return best_model

def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # perform feature selection to reduce dimensionality 
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_selected, y_train, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Training and tuning model...


Parameters: { "use_label_encoder" } are not used.



Best model parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Development set accuracy: 0.842911877394636
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8504366812227074
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.83      0.75       205
           1       0.50      0.02      0.04        44
           2       0.91      0.91      0.91       667

    accuracy                           0.85       916
   macro avg       0.70      0.59      0.57       916
weighted avg       0.84      0.85      0.83       916


Conceptnet Experiments completed.


In [11]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

def train_and_tune_model(X_train, y_train, X_dev, y_dev):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [-1, 10, 20],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0]
    }
    
    grid_search = GridSearchCV(LGBMClassifier(random_state=42), param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    dev_accuracy = best_model.score(X_dev, y_dev)
    print(f"Best model parameters: {grid_search.best_params_}")
    print(f"Development set accuracy: {dev_accuracy}")
    
    return best_model

def enrich_text(primary_term, max_related=5):
    terms = primary_term.split()
    
    related_terms = []
    for term in terms:
        related_terms.extend(get_related_terms(term))
    
    filtered_terms = [term for term in related_terms if term not in terms]
    
    return ' '.join(filtered_terms[:max_related])

def extract_conceptnet_features(reviews, aspect_terms, sentiment_terms, model, tokenizer):
    features = []
    for review, aspect_term, sentiment_term in zip(reviews, aspect_terms, sentiment_terms):
        # only use sentiment_term
        enriched_terms = enrich_text(sentiment_term)
        enriched_review = review + ' ' + enriched_terms
        
        combined_text = f"{enriched_review} [SEP] {aspect_term} [SEP] {sentiment_term}"
        
        inputs = tokenizer(combined_text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
        
        save_cache()
    
    save_cache()  
    return np.array(features)

def conceptnet_experiment(train_data, dev_data, test_data, model, tokenizer):
    load_cache()
    print("Preprocessing data...")
    
    print("\nExtracting features with ConceptNet enrichment...")
    X_train = extract_conceptnet_features(train_data['review'], train_data['aspect_term'], train_data['sentiment_term'], model, tokenizer)
    X_dev = extract_conceptnet_features(dev_data['review'], dev_data['aspect_term'], dev_data['sentiment_term'], model, tokenizer)
    X_test = extract_conceptnet_features(test_data['review'], test_data['aspect_term'], test_data['sentiment_term'], model, tokenizer)
    
    print("Encoding labels...")
    le = LabelEncoder()
    le.fit(train_data['sentiment'].tolist() + dev_data['sentiment'].tolist() + test_data['sentiment'].tolist())
    y_train = le.transform(train_data['sentiment'])
    y_dev = le.transform(dev_data['sentiment'])
    y_test = le.transform(test_data['sentiment'])
    
    # perform feature selection to reduce dimensionality 
    print("Performing feature selection...")
    selector = SelectKBest(f_classif, k=min(1000, X_train.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_dev_selected = selector.transform(X_dev)
    X_test_selected = selector.transform(X_test)
    
    print("Training and tuning model...")
    best_classifier = train_and_tune_model(X_train_selected, y_train, X_dev_selected, y_dev)
    
    print("Evaluating model...")
    accuracy, report = evaluate_model(best_classifier, X_test_selected, y_test)
    
    return best_classifier, le, accuracy, report

if __name__ == "__main__":
    bert_model = BertModel.from_pretrained(r'D:\AIProject\Bert\model')
    bert_tokenizer = BertTokenizer.from_pretrained(r'D:\AIProject\Bert\tokenizer')

    # restuarant
    rest_train_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_train.tsv')
    rest_test_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_test.tsv')
    rest_dev_data = load_tsv_data(r'D:\AIProject\data\restaurant\rest16_quad_dev.tsv')
    
    print("\nRunning ConceptNet experiment...")
    conceptnet_classifier, conceptnet_le, conceptnet_accuracy, conceptnet_report = conceptnet_experiment(rest_train_data, rest_dev_data, rest_test_data, bert_model, bert_tokenizer)
    
    print("\nConceptnet Experiment Results:")
    print(f"Accuracy: {conceptnet_accuracy}")
    print("Classification Report:")
    print(conceptnet_report)

    print("\nConceptnet Experiments completed.")


Running ConceptNet experiment...
Preprocessing data...

Extracting features with ConceptNet enrichment...
Encoding labels...
Performing feature selection...
Training and tuning model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 2484, number of used features: 768
[LightGBM] [Info] Start training from score -1.220480
[LightGBM] [Info] Start training from score -3.263749
[LightGBM] [Info] Start training from score -0.405465
Best model parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 20, 'n_estimators': 300, 'subsample': 0.8}
Development set accuracy: 0.8467432950191571
Evaluating model...

Conceptnet Experiment Results:
Accuracy: 0.8504366812227074
Classification Report:
              precision    recall  f1-score   support

           0       0.70      