In [1]:
import torch
import pandas as pd
import numpy as np
from model import Model
from transformers import AutoTokenizer
import stanza
from copy import deepcopy
from transformers import pipeline
import json
import zeyrek
import nltk
from zemberek import (
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishSpellChecker
)

# Shared Zemberek morphology instance; the normalizer and the spell checker
# below are both constructed from it.
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
# Sentence splitter; remove_punc() uses it to break a paragraph into sentences.
extractor = TurkishSentenceExtractor()
spell_checker = TurkishSpellChecker(morphology)
from nltk.corpus import stopwords
import re
import nltk
# Make sure the NLTK stop-word corpus is available locally before reading it.
nltk.download('stopwords')
# Turkish stop words, stored as a set.
stops = set(stopwords.words('turkish'))

# Hugging Face NER pipeline; predict() calls it on each sentence and reads the
# "word" field of every entity it returns.
pipe = pipeline("ner", model="51la5/roberta-large-NER", aggregation_strategy="simple")

2024-08-08 20:28:59,742 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 6.42556619644165



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/musasina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at 51la5/roberta-large-NER were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def remove_punc(text:str) -> list:
    """Split ``text`` into sentences and strip punctuation from each sentence.

    Apostrophes are replaced by a space *before* sentence splitting, so a
    suffix attached with an apostrophe becomes its own token
    (``"Hepsiburada'ya"`` -> ``"Hepsiburada ya"``). The remaining characters
    of ``string.punctuation`` (ASCII punctuation only) are then deleted from
    every sentence.

    Args:
        text: Paragraph to clean.

    Returns:
        A list with one punctuation-free string per sentence, in the order
        produced by ``extractor.from_paragraph``.
    """
    import string

    text = text.replace("'", " ")
    sentences = extractor.from_paragraph(text)

    # A single translation table removes *all* punctuation characters at once.
    # Replacing one character at a time starting from the unmodified sentence
    # would keep only the effect of the final replacement.
    strip_table = str.maketrans("", "", string.punctuation)
    return [sentence.translate(strip_table) for sentence in sentences]

In [3]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Build the sentiment classifier and restore its trained weights.
model_1 = Model()
# NOTE(review): torch.load unpickles the checkpoint file, which can execute
# arbitrary code; only load checkpoints from a trusted source. The weights are
# first mapped to the CPU so the file loads on machines without a GPU.
model_1.load_state_dict(torch.load("/home/musasina/Desktop/projects/teknofest/msnet/model.pth",map_location=torch.device('cpu'))["model_state_dict"])
# Place the model on the selected device so it lives on the same device as the
# tensors it is fed at inference time (avoids a CPU/CUDA mismatch on GPU hosts).
model_1 = model_1.to(device)
# Inference only: eval mode turns off training-only behaviour such as dropout
# (torch.inference_mode alone does not do that).
model_1.eval()

# Tokenizer for the classifier inputs; sequences are padded on the left.
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")
tokenizer.padding_side = "left"
# NOTE(review): pad_token is assigned from eos_token and then immediately
# overridden with '[PAD]' on the next line — confirm the first assignment is
# still needed.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
  model_1.load_state_dict(torch.load("/home/musasina/Desktop/projects/teknofest/msnet/model.pth",map_location=torch.device('cpu'))["model_state_dict"])


0

In [5]:
# Class index predicted by model_1 -> Turkish sentiment label
# (olumsuz = negative, nötr = neutral, olumlu = positive).
sentiment_map_model_1 = {
    0: "olumsuz",
    1: "nötr",
    2: "olumlu",
}

In [6]:
def get_sentiment(text: str) -> str:
    """Classify ``text`` with ``model_1`` and return its sentiment label.

    The text is tokenized to a fixed length of 128 tokens (padded/truncated),
    the token ids are passed through ``model_1``, and the arg-max class index
    is translated through ``sentiment_map_model_1``.

    Args:
        text: A single sentence (or short text) to classify.

    Returns:
        One of the label strings stored in ``sentiment_map_model_1``.
    """
    # Send the inputs to wherever the model's weights actually are, so the
    # forward pass cannot fail with a CPU/CUDA device mismatch. Fall back to
    # the global `device` if the model exposes no parameters.
    first_param = next(model_1.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    encoded = tokenizer(
        [text],
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    ).to(target_device)

    # NOTE(review): only input_ids are forwarded; the attention mask produced
    # by the tokenizer is not used. Confirm whether Model.forward can accept it,
    # since padded positions are otherwise seen by the model.
    with torch.inference_mode():
        scores = model_1(encoded["input_ids"].view(1,-1))

    predicted_class = scores.argmax(dim=1).item()
    return sentiment_map_model_1[predicted_class]

In [7]:
def predict(item):
    """Extract named entities from ``item`` and attach a sentiment to each.

    The input is split into punctuation-free sentences by ``remove_punc``.
    Every sentence is classified once with ``get_sentiment`` and scanned with
    the NER pipeline ``pipe``. Each entity collected so far that occurs in the
    current sentence is assigned that sentence's sentiment, so the last
    sentence mentioning an entity determines its label.

    Args:
        item: Text to analyse.

    Returns:
        A dict with
        ``"entity_list"``: every entity word found, in order, duplicates kept;
        ``"results"``: one ``{"entity", "entity_sentiment"}`` dict per distinct
        entity, in order of first assignment.
    """
    sentences = remove_punc(deepcopy(item))
    entity_list = []
    sentiment_by_entity = {}

    for sentence in sentences:
        sentence_sentiment = get_sentiment(sentence)
        entity_list.extend(found["word"] for found in pipe(sentence))
        # Entities seen in earlier sentences are re-checked here as well.
        sentiment_by_entity.update(
            (name, sentence_sentiment) for name in entity_list if name in sentence
        )

    results = [
        {"entity": name, "entity_sentiment": label}
        for name, label in sentiment_by_entity.items()
    ]
    return {"entity_list": entity_list, "results": results}

In [8]:
def test_model_with_excel(excel_path='/home/musasina/Desktop/projects/teknofest/filtered_excel_file.xlsx'):
    """Evaluate ``predict`` against labelled rows of an Excel sheet and print a report.

    Each row must provide a ``'Generated Text'`` column (the input text) and an
    ``'Output JSON'`` column holding a JSON object whose ``"results"`` list
    contains dicts with ``"entity"`` and ``"sentiment"`` keys.

    An expected entity counts as recognised when it is a substring of some
    predicted entity; the first such predicted entity is used for the
    sentiment comparison. Both accuracies are computed over the total number
    of expected entities.

    Args:
        excel_path: Path of the evaluation spreadsheet. Defaults to the
            original hard-coded location.

    Returns:
        None. Per-entity comparisons and the summary are printed.
    """
    df = pd.read_excel(excel_path)

    total_entities = 0
    correct_entities = 0
    correct_sentiments = 0

    for _, row in df.iterrows():
        input_text = row['Generated Text']
        expected_output = json.loads(row['Output JSON'])

        model_output = predict(input_text)

        # Entity -> sentiment lookups for the gold labels and the predictions.
        expected_results = {item['entity']: item['sentiment'] for item in expected_output['results']}
        predicted_results = {item['entity']: item['entity_sentiment'] for item in model_output['results']}

        for entity, expected_sentiment in expected_results.items():
            total_entities += 1

            # First predicted entity that contains the expected one, if any.
            # Tracking the match explicitly keeps the report from showing the
            # sentiment of an unrelated entity (or failing on an undefined
            # loop variable) when nothing matched.
            matched_entity = next(
                (candidate for candidate in predicted_results if entity in candidate),
                None,
            )

            if matched_entity is None:
                predicted_sentiment = 'Not found'
            else:
                correct_entities += 1
                predicted_sentiment = predicted_results[matched_entity]
                if predicted_sentiment == expected_sentiment:
                    correct_sentiments += 1

            print(f"Input: {input_text}")
            print(f"Entity: {entity}")
            print(f"Expected Sentiment: {expected_sentiment}")
            print(f"Predicted Sentiment: {predicted_sentiment}")
            print("-----")

    # Guard against an empty sheet to avoid dividing by zero.
    entity_accuracy = correct_entities / total_entities if total_entities > 0 else 0
    sentiment_accuracy = correct_sentiments / total_entities if total_entities > 0 else 0

    print(f"Total Entities: {total_entities}")
    print(f"Correct Entities: {correct_entities}")
    print(f"Correct Sentiments: {correct_sentiments}")
    print(f"Entity Recognition Accuracy: {entity_accuracy:.2%}")
    print(f"Sentiment Analysis Accuracy: {sentiment_accuracy:.2%}")

In [9]:
# Run the spreadsheet evaluation; it prints one comparison per expected entity
# followed by the summary counts and accuracies.
test_model_with_excel()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Input: Hepsiburada'ya deneyimim nötr kaldı. Önce Amazon Türkiye'ye geçtim. Elektronik ürünlerini severek siparıs verdim, indirim kuponları da sıkıydı. Hepsiburada'ya da ev eşyaları fiyatından sonra memnun kaldım, hızlının teslimatlarıyla.
Entity: Hepsiburada
Expected Sentiment: nötr
Predicted Sentiment: olumlu
-----
Input: Hepsiburada'ya deneyimim nötr kaldı. Önce Amazon Türkiye'ye geçtim. Elektronik ürünlerini severek siparıs verdim, indirim kuponları da sıkıydı. Hepsiburada'ya da ev eşyaları fiyatından sonra memnun kaldım, hızlının teslimatlarıyla.
Entity: Amazon Türkiye
Expected Sentiment: olumlu
Predicted Sentiment: nötr
-----
Input: GittiGidiyor'da ev eşyaları aldım ama güvenli ödeme hizmeti dışında bir şey beğenmedim, ürünler beklediğim gibi değildi. Çiçeksepeti'den spor malzemeleri aldım ve hızlı teslimat gerçekten etkileyiciydi, memnun kaldım.
Entity: GittiGidiyor
Expected Sentiment: olumsuz
Predicted Sentiment: olumsuz
-----
Input: GittiGidiyor'da ev eşyaları aldım ama güvenli