In [94]:
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import pipeline

In [95]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nehan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nehan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nehan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [96]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 186.2 kB/s eta 0:01:09
     --------------------------------------- 0.0/12.8 MB 196.9 kB/s eta 0:01:05
     --------------------------------------- 0.1/12.8 MB 403.5 kB/s eta 0:00:32
     - -------------------------------------- 0.6/12.8 MB 2.0 MB/s eta 0:00:07
     --- ------------------------------------ 1.2/12.8 MB 4.0 MB/s eta 0:00:03
     ----- ---------------------------------- 1.6/12.8 MB 4.7 MB/s eta 0:00:03
     ------- -------------------------------- 2.2/12.8 MB 5

In [97]:
# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [98]:

def preprocess_text(text):
    sentences = sent_tokenize(text)
    preprocessed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [lemmatizer.lemmatize(word) for word in words if word.isalnum()]
        preprocessed_sentences.append(' '.join(words))
    return preprocessed_sentences

In [99]:
# Improved subtheme extraction function

def extract_subthemes(text):
    doc = nlp(text)
    subthemes = [chunk.text for chunk in doc.noun_chunks]
    return subthemes

In [100]:
sentiment_pipeline = pipeline("sentiment-analysis")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [101]:
# Sentiment analysis function

def analyze_sentiments(sentences):
    sentiments = [(sentence, sentiment_pipeline(sentence)[0]['label']) for sentence in sentences]
    return sentiments

In [102]:
# Improved matching of subthemes with sentiments

def get_subtheme_sentiments(review):
    preprocessed_sentences = preprocess_text(review)
    subthemes = extract_subthemes(review)
    sentiments = analyze_sentiments(preprocessed_sentences)
    
    subtheme_sentiments = {}
    for subtheme in subthemes:
        subtheme_sentiments[subtheme] = 'NEUTRAL'

    for sentence, sentiment in sentiments:
        for subtheme in subthemes:
            if subtheme in sentence or subtheme.lower() in sentence.lower():
                subtheme_sentiments[subtheme] = sentiment
    
    # Handle pronouns
    resolved_subtheme_sentiments = []
    for subtheme, sentiment in subtheme_sentiments.items():
        if subtheme.lower() in ['it', 'they', 'them']:
            # Find the previous subtheme that is not a pronoun
            for prev_subtheme, prev_sentiment in reversed(resolved_subtheme_sentiments):
                if prev_subtheme.lower() not in ['it', 'they', 'them']:
                    subtheme = prev_subtheme
                    break
        resolved_subtheme_sentiments.append((subtheme, sentiment))

    # Remove duplicates by keeping the first occurrence
    unique_subtheme_sentiments = []
    seen_subthemes = set()
    for subtheme, sentiment in resolved_subtheme_sentiments:
        if subtheme not in seen_subthemes:
            unique_subtheme_sentiments.append((subtheme, sentiment))
            seen_subthemes.add(subtheme)

    return unique_subtheme_sentiments

In [103]:
# Example review
review = "One tyre went missing, so there was a delay to get the two tyres fitted. The way garage dealt with it was fantastic."

subtheme_sentiments = get_subtheme_sentiments(review)
print(subtheme_sentiments)



[('One tyre', 'NEGATIVE'), ('a delay', 'NEGATIVE'), ('the two tyres', 'NEUTRAL'), ('The way garage', 'POSITIVE')]
