# Import all required packages

In [36]:
import json
import os
import re
import time

import emoji
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import skopt
from IPython.display import display
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import RandomOverSampler, ADASYN, BorderlineSMOTE, SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from lightgbm import LGBMClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from scipy import sparse
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_recall_fscore_support, balanced_accuracy_score, roc_auc_score, \
    classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from wordcloud import WordCloud
from joblib import dump, load
import plotly.graph_objects as go
from cuml.neighbors import NearestNeighbors as cuNN
from imblearn.under_sampling import EditedNearestNeighbours

# EDA

### Load the dataset remotely with TensorFlow Datasets (`tfds`) and cache it as `data.csv`

In [37]:
# splits = ['train', 'test', 'validation']
# df_data = pd.DataFrame()
# 
# for split in splits:
#     df_data = pd.concat([df_data, tfds.as_dataframe(tfds.load('goemotions', split=split))])
# 
# df_data.reset_index(drop=True, inplace=True)
# df_data.to_csv('data.csv')

### Loads dataset from local

In [None]:
# Read the cached CSV and discard its first column (presumably the row index
# written when the file was exported - verify against the export step).
df_data = pd.read_csv('data.csv')
df_data.drop(columns=df_data.columns[0], inplace=True)

# Preview the first rows
df_data.head()

### Descriptive Analysis

In [None]:
# Show summary statistics, then count the missing values in every column.
summary_statistics = df_data.describe()
display(summary_statistics)
df_data.isnull().sum()

In [None]:
def plot_emotion_distribution(y):
    """Draw an interactive bar chart of how often each label occurs in ``y``.

    Args:
        y: pandas Series of labels; its ``value_counts()`` supplies the bar
            categories (x axis) and the bar heights (y axis).
    """
    counts = y.value_counts()

    figure = go.Figure(
        data=[
            go.Bar(
                x=counts.index,
                y=counts.values,
                # Each bar is coloured by its own count
                marker=dict(color=counts.values, colorscale='Viridis'),
            )
        ],
        layout=go.Layout(
            title="Distribution of Emotions in the Dataset",
            xaxis=dict(title="Emotions"),
            yaxis=dict(title="Count"),
            height=800,
        ),
    )
    figure.show()


# One label per comment: the name of the column holding the row-wise maximum
# among all columns other than 'comment_text'.
y = df_data.drop(columns=['comment_text']).idxmax(axis=1).rename('emotion')
plot_emotion_distribution(y)


In [None]:
X = df_data['comment_text']

# Join the comments with a space. With an empty separator the last word of one
# comment is glued to the first word of the next, which puts words into the
# cloud that never occurred in the corpus.
corpus_text = ' '.join(X.astype(str))
wordcloud = WordCloud(background_color='white', width=800, height=800).generate(corpus_text)

# Set the size of the figure
plt.figure(figsize=(15, 8))

# Display the WordCloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)

# Show the plot
plt.show()

# Preprocessing

In [None]:
# Fetch the NLTK resources needed for preprocessing: the stop-word list,
# the 'punkt' tokenizer data and the WordNet corpus.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

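### Actions performed for text cleaning:
- Decode byte-string representations and lowercase
- Convert emoji to text
- Remove URLs, HTML tags, special characters, punctuation, stopwords, extra whitespace and numbers
- Transform abbreviations (each token is replaced with its first WordNet lemma name, when one exists)
- Tokenization
- Stemming (Porter) followed by lemmatization (WordNet)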

In [43]:
stopword = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Patterns are compiled once because preprocess_text is applied to every row.
_URL_RE = re.compile(r'http[s]?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_text(text):
    """Clean one raw comment and return it as a space-separated token string.

    Steps, in order: decode a ``b'...'`` byte-string representation, lowercase,
    convert emoji to words, strip URLs and HTML tags, strip remaining
    punctuation/special characters and digits, tokenize, drop English
    stopwords, replace each token with its first WordNet lemma name (when a
    synset exists), stem, lemmatize, and collapse whitespace.

    Args:
        text: the raw comment as ``str``.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    import ast

    def get_full_form_from_wordnet(abbreviation):
        # First lemma of the first synset, or the token itself if WordNet
        # does not know it.
        synsets = wordnet.synsets(abbreviation)
        if synsets:
            return synsets[0].lemmas()[0].name()
        return abbreviation

    # Decode the text: the CSV may hold the repr of a bytes object ("b'...'").
    # ast.literal_eval only parses literals, so dataset text is never executed
    # as code (the previous eval() would run arbitrary expressions).
    if text.startswith(("b'", 'b"')):
        try:
            text = ast.literal_eval(text).decode('utf-8')
        except (ValueError, SyntaxError, AttributeError):
            # Not a valid bytes literal after all: keep the text unchanged.
            pass

    text = text.lower()

    # Emoji to text
    text = emoji.demojize(text, delimiters=(" ", ""))

    # URLs and HTML tags must be removed BEFORE punctuation: once '://', '.',
    # '<' and '>' have been stripped, these patterns can no longer match and
    # URL/tag fragments would leak into the tokens.
    text = _URL_RE.sub(' ', text)
    text = _HTML_TAG_RE.sub(' ', text)

    # Remove special characters, punctuation and numbers
    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGITS_RE.sub(' ', text)

    # Tokenize, drop stopwords, map through WordNet, then stem and lemmatize
    tokens = [word for word in word_tokenize(text) if word not in stopword]
    lemmatized_tokens = [
        lemmatizer.lemmatize(stemmer.stem(get_full_form_from_wordnet(word)))
        for word in tokens
    ]

    # Join the tokens and strip to single white space
    return _WHITESPACE_RE.sub(' ', ' '.join(lemmatized_tokens)).strip()

### Preprocess text and reverse encoding

In [None]:
# Clean every comment; the to_frame()/squeeze() round trip is kept from the
# original pipeline.
X = (
    df_data['comment_text']
    .apply(preprocess_text)
    .to_frame()
    .squeeze()
)

# Target: name of the column with the row-wise maximum among the label columns
y = df_data.drop(columns=['comment_text']).idxmax(axis=1)
y.name = 'emotion'

# Stratified 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=3013,
)

### Vectorize Corpus

In [None]:
# Fit the TF-IDF vectorizer on the training split only, then transform both
# splits with the fitted vocabulary.
vectorizer = TfidfVectorizer().fit(X_train)
X_train_vectorized, X_test_vectorized = (
    vectorizer.transform(split) for split in (X_train, X_test)
)
X_train_vectorized

# Model Evaluation

### Functions for model evaluation

In [None]:
def save_resampled_data(X, y, filename_prefix):
    """Persist a resampled training set next to ``filename_prefix``.

    Args:
        X: SciPy sparse feature matrix, written to ``<prefix>_X.npz``.
        y: label array, written to ``<prefix>_y.npy``.
        filename_prefix: path prefix shared by both output files.
    """
    features_path = f'{filename_prefix}_X.npz'
    labels_path = f'{filename_prefix}_y.npy'

    sparse.save_npz(features_path, X)
    np.save(labels_path, y)


def evaluate_model(X_train, X_test, y_train, y_test, encoder, classifiers, resamplers=None):
    """Benchmark every classifier, optionally once per resampling strategy.

    Args:
        X_train, X_test: feature matrices handed to ``fit`` / ``predict``.
            With ``resamplers`` the resampled training matrix is cached via
            ``scipy.sparse.save_npz``, so it is expected to be sparse.
        y_train, y_test: encoded target labels.
        encoder: fitted label encoder; ``inverse_transform`` is used to report
            class names.
        classifiers: list of ``(name, estimator)`` pairs.
        resamplers: optional list of ``(name, sampler)`` pairs; each sampler
            must provide ``fit_resample``.

    Returns:
        Without resamplers: ``df_performance`` with one column per classifier.
        With resamplers: ``(df_resampler, df_performance)``. ``df_resampler``
        has one column per sampler (resample time, resulting size and per-class
        counts); ``df_performance`` has one column per
        ``"<sampler> & <classifier>"`` combination.
    """
    df_performance = pd.DataFrame(
        index=['Accuracy', 'Precision', 'Recall', 'F1 score', 'ROC', 'Training Time', 'Testing time'])

    if not resamplers:
        for clf_name, clf in classifiers:
            df_performance[clf_name] = evaluate_performance(
                X_train, X_test, y_train, y_test, encoder, (clf_name, clf))
        return df_performance

    df_resampler = pd.DataFrame(
        index=['Resample Time', 'Size'] + list(pd.unique(encoder.inverse_transform(y_train))))
    os.makedirs('resampled_data', exist_ok=True)

    for spl_name, spl_obj in resamplers:
        cache_prefix = os.path.join('resampled_data', f'{spl_name}_resampled_data')
        resampled_data_file_X = f'{cache_prefix}_X.npz'
        resampled_data_file_y = f'{cache_prefix}_y.npy'

        if os.path.exists(resampled_data_file_X) and os.path.exists(resampled_data_file_y):
            # Reuse the cached resampling result. 'Resample Time' stays NaN
            # because nothing was measured in this run.
            X_train_resampled = sparse.load_npz(resampled_data_file_X)
            y_train_resampled = np.load(resampled_data_file_y, allow_pickle=True)
        else:
            # Apply resampling and store the results
            start_time = time.time()
            X_train_resampled, y_train_resampled = spl_obj.fit_resample(X_train, y_train)
            df_resampler.loc['Resample Time', spl_name] = time.time() - start_time
            save_resampled_data(X_train_resampled, y_train_resampled, cache_prefix)

        # Size and class distribution are recorded for cached data as well;
        # previously a cache hit left the sampler's column empty.
        df_resampler.loc['Size', spl_name] = len(y_train_resampled)
        # A Series (not a DataFrame) keeps plain class names as index keys;
        # DataFrame.value_counts() would yield one-element tuples instead.
        class_counts = pd.Series(encoder.inverse_transform(y_train_resampled)).value_counts()
        for class_name, count in class_counts.items():
            df_resampler.loc[class_name, spl_name] = count

        for clf_name, clf in classifiers:
            label = f'{spl_name} & {clf_name}'
            df_performance[label] = evaluate_performance(
                X_train_resampled, X_test, y_train_resampled, y_test, encoder, (label, clf))

    return df_resampler, df_performance


def evaluate_performance(X_train, X_test, y_train, y_test, encoder, clf):
    """Fit one classifier, score it on the test split and print a report.

    Args:
        X_train, y_train: training features and encoded labels.
        X_test, y_test: test features and encoded labels.
        encoder: fitted label encoder used to print class names in the report.
        clf: ``(name, estimator)`` pair; the estimator must offer ``fit``,
            ``predict`` and ``predict_proba``.

    Returns:
        ``[balanced accuracy, macro precision, macro recall, macro F1,
        macro one-vs-rest ROC AUC, training seconds, prediction seconds]``.
    """
    name, estimator = clf[0], clf[1]

    # Training the model
    fit_started = time.time()
    estimator.fit(X_train, y_train)
    training_time = time.time() - fit_started

    # Making predictions on the test set (only predict() is timed)
    predict_started = time.time()
    predictions = estimator.predict(X_test)
    testing_time = time.time() - predict_started

    accuracy = balanced_accuracy_score(y_test, predictions)
    precision, recall, f1score, _ = precision_recall_fscore_support(
        y_test, predictions, average='macro', zero_division=0)
    probabilities = estimator.predict_proba(X_test)
    roc = roc_auc_score(y_test, probabilities, average='macro', multi_class='ovr')

    print(f'\n\n-----------------------{name} done evaluation.')
    report = classification_report(
        encoder.inverse_transform(y_test),
        encoder.inverse_transform(predictions),
        zero_division=0)
    print(report)

    return [accuracy, precision, recall, f1score, roc, training_time, testing_time]


### Encode target labels

In [None]:
# Fit the label encoder on the training labels only, then map both splits to
# their encoded form.
encoder = LabelEncoder()
encoder.fit(y_train)
y_train_encoded, y_test_encoded = (encoder.transform(labels) for labels in (y_train, y_test))

### Classifiers and resamplers involved

In [None]:
# (name, sampler) pairs compared in the benchmark. Under-samplers shrink every
# class except the minority; over-samplers grow every class except the majority
# (ADASYN only the minority). Neighbour searches are delegated to cuML's
# NearestNeighbors.
#
# The SMOTE instances nested inside SMOTETomek / SMOTEENN carry their own
# random_state: imbalanced-learn uses a user-supplied `smote` object as given
# and does not forward the combiner's random_state to it, so without the
# explicit seed those two samplers are not reproducible.
resamplers = [
    (
        'RandomUnderSampler',
        RandomUnderSampler(
            sampling_strategy='not minority',
            random_state=3013),
    ),
    (
        'NearMiss',
        NearMiss(
            sampling_strategy='not minority',
            version=3,
            n_jobs=-1),
    ),
    (
        'ADASYN',
        ADASYN(
            sampling_strategy='minority',
            random_state=3013,
            n_neighbors=cuNN()),
    ),
    (
        'SMOTE',
        SMOTE(
            sampling_strategy='not majority',
            random_state=3013,
            k_neighbors=cuNN()),
    ),
    (
        'BorderlineSMOTE',
        BorderlineSMOTE(
            sampling_strategy='not majority',
            random_state=3013,
            k_neighbors=cuNN(),
            m_neighbors=cuNN(n_neighbors=10)),
    ),
    (
        'SMOTETomek',
        SMOTETomek(
            sampling_strategy='not majority',
            random_state=3013,
            n_jobs=-1,
            smote=SMOTE(random_state=3013, k_neighbors=cuNN())),
    ),
    (
        'SMOTEENN',
        SMOTEENN(
            sampling_strategy='not majority',
            random_state=3013,
            smote=SMOTE(random_state=3013, k_neighbors=cuNN()),
            enn=EditedNearestNeighbours(n_neighbors=cuNN())),
    ),
    (
        'RandomOverSampler',
        RandomOverSampler(
            sampling_strategy='not majority',
            random_state=3013),
    ),
]

# Base estimators: all seeded and trained with balanced class weights.
lgbm_classifier = LGBMClassifier(
    random_state=3013, class_weight='balanced', n_jobs=-1, verbose=-1)
sgd_logistic_regression = SGDClassifier(
    random_state=3013, loss='log_loss', class_weight='balanced', n_jobs=-1)
# The hinge-loss model is wrapped in an isotonic calibrator; the evaluation
# code calls predict_proba on every classifier.
sgd_linear_svc = CalibratedClassifierCV(
    SGDClassifier(random_state=3013, loss='hinge', class_weight='balanced', n_jobs=-1),
    method='isotonic')

classifiers = [
    ('LGBMClassifier', lgbm_classifier),
    ('SGDLogisticRegression', sgd_logistic_regression),
    ('SGDLinearSVC', sgd_linear_svc),
]

# Soft-voting ensemble over the very same estimator objects. With
# fit_base_estimators=False it does not refit them, so it must be evaluated
# after the base classifiers - hence it is appended last.
voting_classifier = EnsembleVoteClassifier(
    clfs=[estimator for _, estimator in classifiers],
    voting='soft',
    fit_base_estimators=False,
    use_clones=False)

classifiers.append(('Voting Classifier', voting_classifier))

### General performance comparison between classifiers and classifiers with resampling

In [None]:
import warnings

# Silence UserWarning and FutureWarning for the rest of the session.
for ignored_category in (UserWarning, FutureWarning):
    warnings.simplefilter("ignore", category=ignored_category)

In [None]:
# Baseline: every classifier trained on the original, un-resampled training set.
df_performance_clfs = evaluate_model(
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
    encoder=encoder,
    classifiers=classifiers,
)

df_performance_clfs

In [None]:
# Every classifier re-trained once per resampling strategy.
df_resample, df_performance_reclfs = evaluate_model(
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_train_encoded,
    y_test=y_test_encoded,
    encoder=encoder,
    classifiers=classifiers,
    resamplers=resamplers,
)

display(df_resample)
df_performance_reclfs

### General performance comparison between classifiers and classifiers with resampling after grouping emotions

In [None]:
# Coarse emotion groups: group name -> fine-grained emotions it absorbs.
emotion_groups = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    'neutral': ['neutral'],
    "joy": ["admiration", "amusement", "approval", "caring", "desire", "excitement", "gratitude", "joy", "love",
            "optimism", "pride", "relief"],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["confusion", "curiosity", "realization", "surprise"]
}

# Reverse lookup emotion -> group. Groups are visited in reverse so that, were
# an emotion listed twice, the first group in `emotion_groups` wins - the same
# result as scanning the groups in order and stopping at the first match.
emotion_to_group = {
    emotion: group
    for group, emotions in reversed(list(emotion_groups.items()))
    for emotion in emotions
}

# Labels that belong to no group are kept unchanged.
y_train_grouped = y_train.apply(lambda label: emotion_to_group.get(label, label))
y_test_grouped = y_test.apply(lambda label: emotion_to_group.get(label, label))
y_grouped = pd.concat([y_train_grouped, y_test_grouped])
plot_emotion_distribution(y_grouped)

y_grouped.value_counts()

### Encode grouped target labels

In [None]:
# Fit a separate encoder on the grouped training labels, then map both splits
# to their encoded form.
gp_encoder = LabelEncoder()
gp_encoder.fit(y_train_grouped)
y_gp_train_encoded, y_gp_test_encoded = (
    gp_encoder.transform(labels) for labels in (y_train_grouped, y_test_grouped)
)

In [None]:
# Same resampling strategies as for the fine-grained labels, under 'GP_' names
# so their cached results are stored in separate files.
#
# The SMOTE instances nested inside SMOTETomek / SMOTEENN carry their own
# random_state: imbalanced-learn uses a user-supplied `smote` object as given
# and does not forward the combiner's random_state to it, so without the
# explicit seed those two samplers are not reproducible.
resamplers = [
    (
        'GP_RandomUnderSampler',
        RandomUnderSampler(
            sampling_strategy='not minority',
            random_state=3013),
    ),
    (
        'GP_NearMiss',
        NearMiss(
            sampling_strategy='not minority',
            version=3,
            n_jobs=-1),
    ),
    (
        'GP_ADASYN',
        ADASYN(
            sampling_strategy='minority',
            random_state=3013,
            n_neighbors=cuNN()),
    ),
    (
        'GP_SMOTE',
        SMOTE(
            sampling_strategy='not majority',
            random_state=3013,
            k_neighbors=cuNN()),
    ),
    (
        'GP_BorderlineSMOTE',
        BorderlineSMOTE(
            sampling_strategy='not majority',
            random_state=3013,
            k_neighbors=cuNN(),
            m_neighbors=cuNN(n_neighbors=10)),
    ),
    (
        'GP_SMOTETomek',
        SMOTETomek(
            sampling_strategy='not majority',
            random_state=3013,
            n_jobs=-1,
            smote=SMOTE(random_state=3013, k_neighbors=cuNN())),
    ),
    (
        'GP_SMOTEENN',
        SMOTEENN(
            sampling_strategy='not majority',
            random_state=3013,
            smote=SMOTE(random_state=3013, k_neighbors=cuNN()),
            enn=EditedNearestNeighbours(n_neighbors=cuNN())),
    ),
    (
        'GP_RandomOverSampler',
        RandomOverSampler(
            sampling_strategy='not majority',
            random_state=3013),
    ),
]

# Fresh, unfitted base estimators for the grouped-label experiments.
gp_lgbm_classifier = LGBMClassifier(
    random_state=3013, class_weight='balanced', n_jobs=-1, verbose=-1)
gp_sgd_logistic_regression = SGDClassifier(
    random_state=3013, loss='log_loss', class_weight='balanced', n_jobs=-1)
# The hinge-loss model is wrapped in an isotonic calibrator; the evaluation
# code calls predict_proba on every classifier.
gp_sgd_linear_svm = CalibratedClassifierCV(
    SGDClassifier(random_state=3013, loss='hinge', class_weight='balanced', n_jobs=-1),
    method='isotonic')

classifiers = [
    ('GP_LGBMClassifier', gp_lgbm_classifier),
    ('GP_SGDLogisticRegression', gp_sgd_logistic_regression),
    ('GP_SGDLinearSVM', gp_sgd_linear_svm),
]

# Soft-voting ensemble over the estimators above; it does not refit them, so
# it is appended last and evaluated after they have been fitted.
voting_classifier = EnsembleVoteClassifier(
    clfs=[estimator for _, estimator in classifiers],
    voting='soft',
    fit_base_estimators=False)

classifiers.append(('Voting Classifier', voting_classifier))

In [None]:
# Baseline on the grouped labels: no resampling.
df_performance_gp_clfs = evaluate_model(
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_gp_train_encoded,
    y_test=y_gp_test_encoded,
    encoder=gp_encoder,
    classifiers=classifiers,
)

df_performance_gp_clfs

In [None]:
# Grouped labels, every classifier re-trained once per resampling strategy.
df_gp_resample, df_performance_gp_reclfs = evaluate_model(
    X_train=X_train_vectorized,
    X_test=X_test_vectorized,
    y_train=y_gp_train_encoded,
    y_test=y_gp_test_encoded,
    encoder=gp_encoder,
    classifiers=classifiers,
    resamplers=resamplers,
)

display(df_gp_resample)
df_performance_gp_reclfs

# Parameter Tuning using BayesSearchCV

In [27]:
def searchCV(X_train, y_train, pipeline, params):
    """Run a Bayesian hyper-parameter search for one pipeline and persist it.

    Args:
        X_train: training inputs passed to ``BayesSearchCV.fit``.
        y_train: training labels.
        pipeline: ``(name, estimator)`` pair; ``name`` is used in the printed
            summary and in the dump file name.
        params: search space, or list of search spaces, for ``BayesSearchCV``.

    Returns:
        The fitted ``BayesSearchCV`` instance, which is also written to
        ``tuned_<name>.pkl`` with ``skopt.dump``.
    """
    clf_name, estimator = pipeline[0], pipeline[1]

    bscv = BayesSearchCV(
        estimator,
        params,
        n_iter=100,
        n_points=4,
        scoring='f1_macro',
        cv=StratifiedKFold(n_splits=5, shuffle=False),
        verbose=0,
        n_jobs=3,
        pre_dispatch='2*n_jobs',
        random_state=3013,
        error_score=0,
    )

    # Compatibility shim kept from the original code.
    # NOTE(review): presumably needed because the installed scikit-optimize
    # still references the `np.int` alias removed from recent NumPy - confirm.
    np.int = int

    start_time = time.time()
    bscv.fit(X_train, y_train)
    search_time = time.time() - start_time
    best_parameters = bscv.best_params_

    # Show the per-candidate results; the previous `display(pd.DataFrame)`
    # only displayed the DataFrame class itself.
    display(pd.DataFrame(bscv.cv_results_))

    print(f'------------------{clf_name} tuning done.')
    print(f'Search time: {search_time}')
    print(f'Best parameters: {best_parameters}')
    print(f'Best CV score: {bscv.best_score_}')
    skopt.dump(bscv, f'tuned_{clf_name}.pkl')

    return bscv


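`CustomTfidfVectorizer` is required to run the Bayesian search over `ngram_range` with skopt, because a search space made of a list of tuples is not supported; the range is therefore exposed as two integer parameters, `ngram_lower` and `ngram_upper`.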

In [18]:
class CustomTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose n-gram range is two scalar hyper-parameters.

    ``ngram_lower`` and ``ngram_upper`` replace the ``ngram_range`` tuple so
    that each bound can be searched as a plain integer dimension. All other
    arguments are forwarded unchanged to ``TfidfVectorizer``.

    ``ngram_range`` is a property derived from the two bounds. Hyper-parameter
    searches configure estimators through ``set_params``, which only sets the
    attribute and never re-runs ``__init__``; a tuple stored once in
    ``__init__`` would therefore stay at its initial value and tuning
    ``ngram_upper`` would have no effect on the extracted n-grams.
    """

    def __init__(
            self,
            ngram_lower=1,
            ngram_upper=1,
            input="content",
            encoding="utf-8",
            decode_error="strict",
            strip_accents=None,
            lowercase=True,
            preprocessor=None,
            tokenizer=None,
            analyzer="word",
            stop_words=None,
            token_pattern=r"(?u)\b\w\w+\b",
            max_df=1.0,
            min_df=1,
            max_features=None,
            vocabulary=None,
            binary=False,
            dtype=np.float64,
            norm="l2",
            use_idf=True,
            smooth_idf=True,
            sublinear_tf=False,
    ):
        self.ngram_lower = ngram_lower
        self.ngram_upper = ngram_upper
        super().__init__(
            # Routed through the property setter below, which stores the very
            # same two objects again.
            ngram_range=(ngram_lower, ngram_upper),
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            analyzer=analyzer,
            stop_words=stop_words,
            token_pattern=token_pattern,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
            norm=norm,
            use_idf=use_idf,
            smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf,
        )

    @property
    def ngram_range(self):
        """Current ``(ngram_lower, ngram_upper)`` pair, always up to date."""
        return (self.ngram_lower, self.ngram_upper)

    @ngram_range.setter
    def ngram_range(self, value):
        # Keeps direct assignments (including the one in the parent
        # constructor) in sync with the two scalar bounds.
        self.ngram_lower, self.ngram_upper = value

The search below was run on a cloud platform, which is why this notebook contains no output for it.

In [None]:
# Search spaces for BayesSearchCV. Keys follow the pipeline step names used
# below: 'vect' (vectorizer), 'spl' (sampler) and 'clf' (classifier).

# --- Sampler alternatives: each dict swaps a concrete sampler into 'spl' ---
param_grid_random_over_sampler = {
    'spl': [RandomOverSampler(random_state=3013)],
    'spl__sampling_strategy': ['not majority', 'minority'],
}

param_grid_smote = {
    'spl': [SMOTE(random_state=3013, k_neighbors=cuNN())],
    # Tunes n_neighbors of the nested nearest-neighbours object
    'spl__k_neighbors__n_neighbors': Integer(3, 15),
    'spl__sampling_strategy': ['not majority', 'minority'],
}

param_ADASYN = {
    'spl': [ADASYN(random_state=3013, n_neighbors=cuNN(), sampling_strategy='minority')],
    # Tunes n_neighbors of the nested nearest-neighbours object
    'spl__n_neighbors__n_neighbors': Integer(3, 15),
}

# --- Classifier spaces ---
# SGD logistic regression
param_SGDLR = {
    'clf__loss': ['log_loss'],
    'clf__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__eta0': Real(0.00001, 0.2),
    'clf__max_iter': Integer(1000, 5000),
    'clf__alpha': Real(0.0001, 0.1),
    'clf__l1_ratio': Real(0.1, 0.5),
    'clf__class_weight': ['balanced', None],
}

# LightGBM
# NOTE(review): 'n_estimators' and 'num_iterations' look like aliases of the
# same LightGBM setting, so one of the two dimensions may be redundant - confirm.
param_LGBM = {
    'clf__max_depth': Integer(5, 50),
    'clf__min_child_samples': Integer(20, 100),
    'clf__boosting_type': ['gbdt', 'dart', 'rf'],
    'clf__learning_rate': Real(0.00001, 0.2),
    'clf__n_estimators': Integer(80, 200),
    'clf__num_iterations': Integer(100, 500),
    'clf__num_leaves': Integer(25, 60),
    'clf__bagging_freq': Integer(1, 10),
    'clf__bagging_fraction': Real(0.01, 0.99),
    'clf__feature_fraction': Real(0.01, 0.99),
    'clf__class_weight': ['balanced', None],
}

# Calibrated hinge-loss SGD: 'clf__estimator__*' reaches the wrapped
# SGDClassifier, while 'clf__method' / 'clf__cv' configure the calibrator.
param_SGDSVC = {
    'clf__estimator__loss': ['hinge'],
    'clf__estimator__penalty': ['l1', 'l2', 'elasticnet'],
    'clf__estimator__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'clf__estimator__eta0': Real(0.00001, 0.2),
    'clf__estimator__max_iter': Integer(1000, 5000),
    'clf__estimator__alpha': Real(0.0001, 0.1),
    'clf__estimator__class_weight': ['balanced', None],
    'clf__estimator__l1_ratio': Real(0.1, 0.5),
    'clf__method': ['sigmoid', 'isotonic'],
    'clf__cv': Integer(2, 10),
}

def _build_tuning_pipeline(classifier):
    """Return a vectorizer -> sampler -> ``classifier`` pipeline for tuning."""
    return Pipeline([
        ('vect', CustomTfidfVectorizer(ngram_lower=1)),
        ('spl', RandomOverSampler()),  # Dummy Sampler
        ('clf', classifier),
    ])


# (classifier search space, (display name, pipeline)) triples consumed by the
# tuning loop.
pipelines = [
    (
        param_SGDLR,
        ('Logistic Regression', _build_tuning_pipeline(
            SGDClassifier(random_state=3013, n_jobs=-1, loss='log_loss'))),
    ),
    (
        param_SGDSVC,
        ('Linear SVM', _build_tuning_pipeline(
            CalibratedClassifierCV(estimator=SGDClassifier(random_state=3013, n_jobs=-1, loss='hinge')))),
    ),
    (
        param_LGBM,
        ('LGBM Classifier', _build_tuning_pipeline(LGBMClassifier())),
    ),
]

cvinstances = []

# Sampler alternatives tried for every classifier; the last entry disables
# resampling altogether.
sampler_spaces = [
    param_grid_smote,
    param_grid_random_over_sampler,
    param_ADASYN,
    {'spl': ['passthrough']},
]

for param, pipeline in pipelines:

    # Vectorizer dimensions shared by every search space of this pipeline
    param_vectorizer = {
        'vect__max_features': Integer(2500, 50000),
        'vect__max_df': Real(0.9, 1.0),
        'vect__min_df': Real(0.0, 0.1),
        'vect__ngram_upper': Integer(1, 3),
    }

    # One search space per sampler: vectorizer + sampler + classifier dimensions
    param_grids = [
        {**param_vectorizer, **sampler_space, **param}
        for sampler_space in sampler_spaces
    ]

    bscv = searchCV(X_train, y_gp_train_encoded, pipeline, param_grids)
    cvinstances.append((pipeline[0], bscv))

### Rebuild models with best parameters

In [None]:
# Pipelines rebuilt from the best hyper-parameters found by the search. Each
# one starts from raw (preprocessed) text and embeds its own vectorizer.
classifiers = [
    (
        'SGDLogisticRegression',
        Pipeline([
            ('vect', TfidfVectorizer(max_df=0.9679849809437826, max_features=26641, min_df=0.0, ngram_range=(1, 1))),
            ('clf', SGDClassifier(
                alpha=0.0001,
                class_weight='balanced',
                eta0=0.05000915524973822,
                l1_ratio=0.1420892145814889,
                learning_rate='optimal',
                loss='log_loss',
                max_iter=3163,
                n_jobs=-1,
                penalty='elasticnet',
                random_state=3013,
            ))
        ])
    ),
    (
        'SGDLinearSVM',
        Pipeline([
            # max_df must be the float 1.0 (a proportion, as in the search
            # space Real(0.9, 1.0)). The integer 1 is an absolute count and
            # would drop every term occurring in more than one document.
            ('vect', TfidfVectorizer(max_df=1.0, max_features=22119, min_df=0.0, ngram_range=(1, 3))),
            ('clf', CalibratedClassifierCV(
                # `estimator=` matches the tuning pipeline and the
                # 'clf__estimator__*' search-space keys.
                estimator=SGDClassifier(
                    alpha=0.0001,
                    class_weight=None,
                    eta0=0.02558639455287378,
                    l1_ratio=0.47339370383296875,
                    learning_rate='adaptive',
                    loss='hinge',
                    max_iter=5000,
                    penalty='l2',
                    random_state=3013,
                    n_jobs=-1,
                ),
                cv=10,
                method='isotonic'
            ))
        ])
    ),
    (
        'LGBMClassifier',
        Pipeline([
            ('vect', TfidfVectorizer(max_df=0.980786564471343, max_features=50000, min_df=0.0)),
            ('spl', SMOTE(
                sampling_strategy='not majority',
                random_state=3013,
                k_neighbors=cuNN(n_neighbors=10, n_jobs=-1),
            )),
            ('clf', LGBMClassifier(
                bagging_fraction=0.99,
                bagging_freq=4,
                boosting_type='dart',
                class_weight=None,
                feature_fraction=0.99,
                learning_rate=0.1736279711857161,
                n_estimators=123,
                n_jobs=-1,
                num_iterations=117,
                num_leaves=60,
                random_state=3013
            ))
        ])
    )
]

# Soft-voting ensemble over the pipelines above; it does not refit them, so it
# is appended last and evaluated after they have been fitted.
voting_classifier = EnsembleVoteClassifier(clfs=[clf[1] for clf in classifiers], voting='soft',
                                           fit_base_estimators=False)

classifiers.append(('Voting Classifier', voting_classifier))

evaluate_model(X_train, X_test, y_gp_train_encoded, y_gp_test_encoded, gp_encoder, classifiers)

### Save best model

In [None]:
best_model_name = 'Voting Classifier'

best_model = next((classifier for name, classifier in classifiers if name == best_model_name), None)

# Fail loudly instead of silently writing `None` to disk when the name is
# misspelled or the classifier list has changed.
if best_model is None:
    raise ValueError(f'No classifier named {best_model_name!r} in `classifiers`; nothing to save.')

dump(best_model, 'best_model.joblib')

# Model Prediction

In [None]:
text = input('Enter text for emotion classification: ')

emo_clf = load('best_model.joblib')
# Classify what the user typed; the previous code passed the literal string
# 'text' and therefore always predicted on the same input.
emo_clf.predict([preprocess_text(text)])

# Intent Classification using Rasa DIETClassifier

### Model training with intents data in [nlu.yml](/data/nlu.yml)

In [None]:
# Install Rasa, then scaffold a new Rasa project (shell commands run through
# IPython).
!pip install rasa
!rasa init

In [None]:
# Train the Rasa model from the project's training data.
!rasa train

### Load trained model

In [None]:
from rasa.core.agent import Agent

# The dangling `from rasa.core` statement that followed the import above was an
# incomplete import (a SyntaxError that stopped the cell from running); it
# imported nothing and has been removed.

# Load the trained Rasa model archive.
nlu_interpreter = Agent.load("/content/drive/MyDrive/PROJECT_PATTERN/models/20240203-225351-fast-median.tar.gz")

In [None]:
from rasa.model import get_latest_model
from rasa.cli.utils import get_validated_path
from rasa.cli.scaffold import create_initial_project

# NOTE(review): `get_latest_model` presumably returns the path of the newest
# model archive under '/models' rather than a loaded model object - confirm.
# If so, the `model.get_interpreter()` call below cannot work as written.
model = get_latest_model('/models')


# Define a function to get the response
def get_rasa_response(user_input):
    """Parse ``user_input`` and return the 'text' entry of the parse result.

    NOTE(review): this relies on ``model`` exposing ``get_interpreter()`` and
    on the interpreter exposing ``parse_message``; neither is established by
    this notebook - verify against the installed Rasa version.
    """
    interpreter = model.get_interpreter()
    response = interpreter.parse_message(user_input)
    return response['text']


# Example usage
user_input = "Hello, how are you?"
rasa_response = get_rasa_response(user_input)
print("Assistant:", rasa_response)


In [None]:
def classify_intent(text):
    """Return the intent predicted by the Rasa agent for ``text``.

    The function stays synchronous because its callers invoke it without
    ``await``. The previous body used ``await`` inside a plain ``def`` (a
    SyntaxError) and applied ``['intent']`` to the coroutine instead of to its
    result.

    NOTE(review): assumes ``nlu_interpreter.parse_message`` is a coroutine
    function, as the original ``await`` implies - confirm for the installed
    Rasa version.

    Args:
        text: the user message to classify.

    Returns:
        The ``'intent'`` entry of the parse result.
    """
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    def _parse():
        # asyncio.run creates and closes a fresh event loop in the calling thread
        return asyncio.run(nlu_interpreter.parse_message(message_data=text))

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread: run the coroutine directly.
        parsed = _parse()
    else:
        # A loop is already running here (as in a notebook cell), where
        # asyncio.run would raise; run the coroutine on a worker thread instead.
        with ThreadPoolExecutor(max_workers=1) as pool:
            parsed = pool.submit(_parse).result()

    return parsed['intent']

In [None]:
from pprint import pprint

# Ask for a message and pretty-print the intent predicted for it.
user_message = input('Enter input')
pprint(classify_intent(user_message))

# Response generation based on ChatGPT 3.5

In [None]:
from openai import OpenAI

# Use the key already exported in the environment when there is one; the
# placeholder is only a fallback and must be replaced before real use.
# Assigning it unconditionally would overwrite a valid exported key, and
# pasting a real key here risks committing it with the notebook.
os.environ.setdefault('OPENAI_API_KEY', "YOUR_KEY")

client = OpenAI()

In [None]:
def generateResponse(emotion, intent, ori_text, user_info, suggestion_reply=None):
    """Ask the chat model for a counselling-style reply to one user message.

    The detected emotion and intent, the original text, the user profile and
    an optional suggested reply are serialised to JSON and sent as the user
    message, preceded by a fixed system prompt.

    Args:
        emotion: detected emotion for the message.
        intent: detected intent for the message.
        ori_text: the user's original text.
        user_info: profile information about the user.
        suggestion_reply: optional reply suggestion (default ``None``).

    Returns:
        The reply text with newlines replaced by spaces.
    """
    system_prompt = "You are a supportive virtual counseling assistant, empathetic and understanding. Your goal is to assist users in expressing their feelings, providing guidance, and offering a comforting presence."

    payload = {
        'emotion': emotion,
        'intent': intent,
        'user_input': ori_text,
        'suggestion_reply': suggestion_reply,
        'user_info': user_info,
    }
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": json.dumps(payload)},
    ]

    completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=conversation)
    reply = completion.choices[0].message.content
    return reply.replace('\n', ' ')

# Frontend for Chatbot

In [None]:
!ngrok authtoken '2YgweMYFLveRiLfPxFkJPnCAQxo_5K6Y6536DzswvQgZfBHDW'

In [None]:
from flask import render_template, Flask, request
from flask_ngrok import run_with_ngrok

# Create the Flask application that serves the chatbot front end.
app = Flask(__name__)

# Register the app with flask_ngrok before it is started.
run_with_ngrok(app)

# Seeds of the DiceBear "bottts" avatars offered in the UI, in display order.
_AVATAR_SEEDS = (
    'Bear', 'Fluffy', 'Jasmine', 'Sassy', 'Angel', 'Boots', 'Cookie', 'Gizmo',
    'Leo', 'Whiskers', 'Midnight', 'Willow', 'Maggie', 'Misty', 'Gracie',
    'Pumpkin', 'Pepper', 'Boo', 'Cali',
)

# One avatar URL per seed; every other query parameter is identical.
avatars = [
    f'https://api.dicebear.com/7.x/bottts/svg?seed={seed}&scale=90&backgroundColor=b6e3f4,c0aede,d1d4f9,ffd5dc,ffdfbf'
    for seed in _AVATAR_SEEDS
]


@app.route("/")
def index():
    """Serve the chat page, handing the avatar URLs to the template."""
    page = render_template('index.html', avatars=avatars)
    return page


@app.route('/get_response', methods=['POST'])
def get_response():
    """Answer one chat message.

    Expects a JSON body with the keys ``'text'`` (the user message) and
    ``'profile'`` (user information). The message is classified for intent and
    emotion, and both are passed to ``generateResponse``.

    Returns:
        The generated reply text.
    """
    data = request.get_json()
    text = data['text']
    profile = data['profile']
    intent = classify_intent(text)

    # The model expects a sequence of documents, not a bare string, so the
    # cleaned message is wrapped in a list.
    encoded_emotion = emo_clf.predict([preprocess_text(text)])
    # Translate the encoded prediction back to its emotion name. The plain
    # string is also JSON-serialisable, unlike the array returned by predict,
    # which generateResponse could not pass to json.dumps.
    emotion = str(gp_encoder.inverse_transform(encoded_emotion)[0])

    return generateResponse(emotion, intent, text, profile)


# Start the Flask server (debug mode on) when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)