### Imports

In [6]:
!pip install keybert
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=dc4f8ae90dc3857dc789d64d4d4d67950a5ac9663a9a23aab8b2459c8a0576a7
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [8]:
import tqdm
import numpy as np
import pandas as pd
from pprint import pprint
import pyspark.pandas as pd
import re, nltk, spacy, gensim

from keybert import KeyBERT
import torch.nn.functional as torch_funcs
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

from wordcloud import WordCloud
from nltk.corpus import stopwords

In [None]:
# Download the small English spaCy pipeline; it is loaded later via spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

### Code - Aspect Based Sentiment Classification

In [None]:
class AspectBasedSentimentClassification:
    """
    Aspect-based sentiment analysis over a DataFrame that has a 'cleaned_text' column.

    Aspect terms are extracted from every text with KeyBERT, then each
    (text, aspect) pair is scored with a pretrained ABSA sequence classifier.

    NOTE(review): the import cell binds `pd` twice (pandas, then pyspark.pandas).
    The DataFrame code in this class is written against the plain pandas API -
    confirm which binding is intended.
    """

    # Hugging Face Hub id of the ABSA classifier. Hub repo names cannot end with ".".
    ABSA_MODEL_NAME = "yangheng/deberta-v3-base-absa-v1.1"

    # Label reported for each class index of the ABSA model output.
    # NOTE(review): order assumed to match absa_model.config.id2label - confirm.
    SENTIMENT_LABELS = ("negative", "neutral", "positive")

    def __init__(self):
        """
        Load the tokenizer/model pair used for ABSA, a general sentiment pipeline
        and the KeyBERT keyword extractor.

        `sentiment_model` is created here but not used by any method of this class.
        """

        self.absa_tokenizer = AutoTokenizer.from_pretrained(self.ABSA_MODEL_NAME)
        self.absa_model = AutoModelForSequenceClassification.from_pretrained(self.ABSA_MODEL_NAME)
        self.sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
        self.sentiment_model = pipeline("sentiment-analysis", model = self.sentiment_model_path, tokenizer = self.sentiment_model_path)
        self.term_extractor = KeyBERT()


    def identify_terms(self, txt_):
        """
        Aspect Term Identification using KeyBERT.

        Parameters:
            txt_: the text to extract single-word keywords from.

        Returns:
            list of keyword strings (the scores returned by KeyBERT are dropped).
        """

        terms = self.term_extractor.extract_keywords(txt_, keyphrase_ngram_range = (1, 1), stop_words = "english")
        return [keyword for keyword, *_ in terms]


    def get_aspects(self, df):
        """
        Identify aspect terms for every text in `df`.

        Parameters:
            df: DataFrame with a 'cleaned_text' column; other columns are ignored.

        Returns:
            DataFrame with columns ['cleaned_text', 'aspects'], one row per
            (text, aspect term) pair. Empty (but with both columns) when `df`
            has no rows or no terms are found.
        """

        # Collect plain records and build the frame once instead of
        # concatenating inside the loop; this also covers the empty case.
        records = [
            {'cleaned_text': txt, 'aspects': term}
            for txt in df['cleaned_text'].dropna().tolist()
            for term in self.identify_terms(txt)
        ]
        return pd.DataFrame(records, columns = ['cleaned_text', 'aspects'])


    def calculate_sentiment_polarity(self, df):
        """
        Use the pretrained ABSA model to calculate sentiment polarity, given the aspect terms.

        Parameters:
            df: DataFrame with 'cleaned_text' and 'aspects' columns.

        Returns:
            (sentiments, scores): two lists with one entry per row of `df` -
            the label with the highest softmax probability and that probability.
        """

        sentiments, scores = list(), list()
        # Read the two columns by name so that extra columns / any index are tolerated.
        text_aspect_pairs = zip(df['cleaned_text'].tolist(), df['aspects'].tolist())
        for txt_, asp_ in text_aspect_pairs:
            inputs = self.absa_tokenizer(f"[CLS] {txt_} [SEP] {asp_} [SEP]", return_tensors = "pt")
            outputs = self.absa_model(**inputs)
            probabilities = torch_funcs.softmax(outputs.logits, dim = 1)
            # Batch size is 1, so take the first (only) row of probabilities.
            probabilities = probabilities.detach().numpy()[0].tolist()
            score = max(probabilities)
            sentiments.append(self.SENTIMENT_LABELS[probabilities.index(score)])
            scores.append(score)

        return sentiments, scores


    def get_sentiments(self, df):
        """
        Calculate the sentiment polarity and score.

        Parameters:
            df: DataFrame with 'cleaned_text' and 'aspects' columns.

        Returns:
            New DataFrame with columns ['cleaned_text', 'aspects', 'sentiment', 'score'];
            the input frame is left unmodified.
        """
        sentiments, scores = self.calculate_sentiment_polarity(df)
        scored = df.assign(sentiment = sentiments, score = scores)
        return scored[['cleaned_text', 'aspects', 'sentiment', 'score']]


    def absa(self, data):
        """
        Identify aspect terms from the text.
        Calculate the sentiment polarity of the identified terms.

        Parameters:
            data: DataFrame with a 'cleaned_text' column.

        Returns:
            DataFrame with columns ['cleaned_text', 'aspects', 'sentiment', 'score'],
            one row per (text, aspect term) pair, in the row order of `data`.
        """

        # Both steps already work row by row, so they are called directly instead of
        # through groupby('cleaned_text').apply(...): after a first grouped apply,
        # 'cleaned_text' is both an index level and a column, which makes a second
        # groupby on that name ambiguous.
        aspects_df = self.get_aspects(data)
        return self.get_sentiments(aspects_df)

### Code - Topic Modelling

In [None]:
class TwitterTopicModelling:
    """
    Topic modelling (LDA) on a DataFrame of tweets held in a 'content' column.

    NOTE(review): the import cell binds `pd` twice (pandas, then pyspark.pandas).
    The DataFrame code in this class is written against the plain pandas API -
    confirm which binding is intended.
    """

    # (pattern, replacement) pairs applied in this order: the specific forms
    # ("won't", "can't", "shan't") must be expanded before the generic "n't".
    _CONTRACTIONS = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"shan't", "shall not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )

    def __init__(self):
        """
        Download the NLTK stopword list, build the stopword list used for
        filtering and load the spaCy pipeline used for lemmatization.
        """

        nltk.download('stopwords')
        self.all_stopwords = stopwords.words('english')
        # presumably the residue of the HTML entity "&amp;" - verify against the data
        self.all_stopwords.extend(['amp'])
        self.spacy_nlp = spacy.load("en_core_web_sm")


    def preprocess_tweets(self, tweet):
        """
        Preprocessing the tweets
        Block #01: Remove hyperlinks, split hashtags and remove mentions
        Block #02: Expand word contractions
        Block #03: Simple preprocessing using gensim and Lemmatization using spacy

        Parameters:
            tweet: raw tweet text.

        Returns:
            The lemmatized, stopword-free tokens joined by single spaces
            (an empty string when nothing is left).
        """

        # Block #01
        tweet = re.sub(r"http\S+", '', tweet)
        # Lower-case all-caps hashtags (also at the very end of the tweet) so the
        # next step keeps them as one word instead of splitting them into letters.
        tweet = re.sub(r"#[A-Z]+(?=\s|$)", lambda m: m.group().lower(), tweet)
        # Split CamelCase hashtags into their words and drop the leading '#'.
        tweet = re.sub(r"#[A-Za-z]\S*", lambda m: ' '.join(re.findall('[A-Z][^A-Z]*|[a-z][^A-Z]*', m.group().lstrip('#'))), tweet)
        tweet = re.sub(r"@\S*", '', tweet)

        # Block #02 - case-insensitive so that e.g. "Won't" is expanded as well;
        # everything is lower-cased by simple_preprocess below anyway.
        for pattern, replacement in self._CONTRACTIONS:
            tweet = re.sub(pattern, replacement, tweet, flags = re.IGNORECASE)

        # Block #03
        tweet = re.sub(r"[^A-Za-z0-9]", " ", tweet)
        tweet_tok = gensim.utils.simple_preprocess(str(tweet), deacc = True)
        stopword_set = set(self.all_stopwords)
        tweet = ' '.join([word for word in tweet_tok if word not in stopword_set])
        doc = self.spacy_nlp(tweet)
        tweet_tok = [token.lemma_ for token in doc]

        return ' '.join(tweet_tok)


    def build_corpus(self, df):
        """
        Build the corpus in the format required by the LDA model.

        Parameters:
            df: DataFrame with a 'preprocessed_text' column.

        Returns:
            (id2word, corpus, texts): the gensim Dictionary, the bag-of-words
            corpus and the tokenized unique tweets it was built from.
        """

        df = df[df.preprocessed_text != '']
        # De-duplicate while keeping first-occurrence order: a set() would give a
        # different document order on every run and defeat the fixed random_state.
        unique_tweets = list(dict.fromkeys(df.preprocessed_text.tolist()))
        texts = [tokens for tokens in (tweet.split() for tweet in unique_tweets) if tokens]
        id2word = gensim.corpora.Dictionary(texts)
        corpus = [id2word.doc2bow(text) for text in texts]

        return id2word, corpus, texts


    def build_lda_model(self, corpus_, id2word_, n_topics_, alpha_, eta_):
        """
        Builds the LDA model, given the corpus, BOW and other hyperparameters.

        random_state, chunksize and passes are fixed (100, 100 and 10).

        Returns:
            The trained gensim LdaMulticore model.
        """
        lda_model = gensim.models.LdaMulticore(corpus = corpus_, id2word = id2word_, num_topics = n_topics_,
                                               random_state = 100, chunksize = 100, passes = 10, alpha = alpha_, eta = eta_)
        return lda_model


    def compute_coherence(self, texts_, corpus_, id2word_, n_topics_, alpha_, eta_):
        """
        Train an LDA model with the given hyperparameters and return its
        'c_v' coherence computed against `texts_`.
        """
        lda_model = self.build_lda_model(corpus_ = corpus_, id2word_ = id2word_, n_topics_ = n_topics_, alpha_ = alpha_, eta_ = eta_)
        coherence_model = gensim.models.CoherenceModel(model = lda_model, texts = texts_, dictionary = id2word_, coherence = 'c_v')
        return coherence_model.get_coherence()


    def get_optimal_hyperparams(self, df):
        """
        Hyperparameter Tuning: grid search over the number of topics, alpha and eta.

        Every combination is evaluated on the first 75% of the corpus and on the
        full corpus.

        Parameters:
            df: DataFrame with a 'content' column. A 'preprocessed_text' column
                is added to it in place.

        Returns:
            DataFrame with columns
            ['Validation_Set', 'Topics', 'Alpha', 'Eta', 'Coherence'],
            one row per evaluated combination.
        """

        df['preprocessed_text'] = df['content'].apply(self.preprocess_tweets)
        id2word, corpus, texts = self.build_corpus(df)

        hyp_alpha = list(np.arange(0.01, 1, 0.3)) + ['symmetric', 'asymmetric']
        hyp_eta = list(np.arange(0.01, 1, 0.3)) + ['symmetric']
        topics_range = range(2, 11, 1)

        num_docs = len(corpus)
        corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_docs * 0.75)), corpus]
        corpus_title = ['75% Corpus', '100% Corpus']

        model_results = {'Validation_Set': [], 'Topics': [], 'Alpha': [], 'Eta': [], 'Coherence': []}
        total_runs = len(hyp_eta) * len(hyp_alpha) * len(topics_range) * len(corpus_title)
        # The context manager closes the progress bar even if a model run fails.
        with tqdm.tqdm(total = total_runs) as pbar:
            for title, corpus_subset in zip(corpus_title, corpus_sets):
                for k in topics_range:
                    for a in hyp_alpha:
                        for e in hyp_eta:
                            cv = self.compute_coherence(texts_ = texts, corpus_ = corpus_subset, id2word_ = id2word,
                                                        n_topics_ = k, alpha_ = a, eta_ = e)
                            model_results['Validation_Set'].append(title)
                            model_results['Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Eta'].append(e)
                            model_results['Coherence'].append(cv)
                            pbar.update(1)

        return pd.DataFrame(model_results)


    def perform_topic_modelling(self, df, n_topics_ = 5, alpha_ = 0.05, eta_ = 0.5):
        """
        Preprocess the tweets, build the corpus and train the LDA model.

        Parameters:
            df: DataFrame with a 'content' column. A 'preprocessed_text' column
                is added to it in place.
            n_topics_, alpha_, eta_: LDA hyperparameters (defaults 5, 0.05, 0.5).

        Returns:
            (lda_model, doc_topics): the trained model and the result of
            applying it to the corpus (`lda_model[corpus]`).
        """
        df['preprocessed_text'] = df['content'].apply(self.preprocess_tweets)
        id2word, corpus, texts = self.build_corpus(df)
        lda_model = self.build_lda_model(corpus_ = corpus, id2word_ = id2word, n_topics_ = n_topics_,
                                         alpha_ = alpha_, eta_ = eta_)
        doc_topics = lda_model[corpus]

        return lda_model, doc_topics


