# Import requirements

In [None]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline 
from embetter.text import SentenceEncoder
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import (classification_report, 
                             confusion_matrix, 
                             accuracy_score)
import os
import pickle
import time
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Step 1: Open csv

In [None]:
# Location of the tweets CSV, relative to this notebook.
path_to_csv = "../dataset/Tweets.csv"
dataframe = pd.read_csv(path_to_csv)

# Keep only rows whose sentiment confidence is at least 0.75, then drop
# rows that are exact duplicates across all columns.
is_confident = dataframe['airline_sentiment_confidence'] >= 0.75
dataframe = dataframe.loc[is_confident].drop_duplicates()

# Retain just the tweet text and its sentiment label.
filtered_df = dataframe.filter(items=["text", "airline_sentiment"])
filtered_df

# Step 2: Extract phrases and labels

In [None]:
# Convert the text column (model inputs) and the sentiment column (targets)
# to NumPy arrays in one pass.
training_phrases, training_labels = (
    filtered_df[column].to_numpy() for column in ("text", "airline_sentiment")
)

# Step 3: Plot label distribution

In [None]:
def plot_label_dist(labels) -> None:
    """
    Show a bar chart with the number of occurrences of each distinct label.

    params: array-like of labels
    returns: None (the figure is displayed via plt.show())
    """
    # Distinct labels (sorted by numpy.unique) and how often each one occurs
    unique_labels, occurrences = numpy.unique(labels, return_counts=True)

    figure = plt.figure()
    # Axes rectangle [left, bottom, width, height] spanning the whole figure
    axes = figure.add_axes([0, 0, 1, 1])
    # Labels are cast to strings so they are used as categorical tick names
    axes.bar(unique_labels.astype("str"), occurrences)
    axes.set_title('Label distribution')
    axes.set_ylabel('Count')
    plt.show()

# Visualise how the sentiment classes are balanced in the filtered dataset.
plot_label_dist(labels=training_labels)

# Step 4: Split data in train and test

In [None]:
# Hold out 30% of the samples for validation. The split is stratified on the
# labels so both parts keep the class proportions, and seeded so it is
# reproducible.
split_options = dict(test_size=0.30, random_state=1, stratify=training_labels)
x_train, x_val, y_train, y_val = train_test_split(
    training_phrases,
    training_labels,
    **split_options,
)

# =============== Classification algorithm ===============

# Step 5: Make & fit pipeline

In [None]:
# Step 1 of the pipeline: turn raw text into sentence embeddings.
sentence_encoder = SentenceEncoder("distiluse-base-multilingual-cased-v2")
# Step 2: linear SVM; probability=True so predict_proba is available later.
svm_classifier = SVC(kernel='linear', probability=True)

classifier = make_pipeline(sentence_encoder, svm_classifier)

classifier.fit(x_train, y_train)

# Step 6: Model evaluation

In [None]:
# Predict the held-out validation set and report the usual metrics.
y_pred = classifier.predict(x_val)
validation_accuracy = accuracy_score(y_val, y_pred)

# Accuracy is shown both before and after the detailed reports.
print(validation_accuracy)
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print()
print(validation_accuracy)

# =============== Save model to disk ===============

# Paths

In [None]:
# Directory (relative to this notebook) in which the trained model is stored.
path_to_model = "../models/"
# File name of the pickled model inside that directory.
model_name = "tfm_svm"

# Save as pickle

In [None]:
def save_as_pickle(obj, dir_name, file_name):
    """
    Serialize an object with pickle to ``file_name`` inside ``dir_name``.

    params:
        obj: any picklable object
        dir_name: target directory; it is created (including parents) when
                  missing. A trailing path separator is optional. An empty
                  string means the current working directory.
        file_name: name of the file to write inside ``dir_name``
    returns: None
    raises: OSError when the directory cannot be created or the file cannot
            be opened for writing
    """
    # os.makedirs("") raises, so only create a directory when one was given.
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    # os.path.join adds the separator only when dir_name does not end with
    # one, so "../models" and "../models/" both resolve to the same file.
    target_path = os.path.join(dir_name, file_name)

    # The context manager guarantees the handle is flushed and closed.
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)
    
# Persist the fitted pipeline under the configured directory and file name.
save_as_pickle(obj=classifier, dir_name=path_to_model, file_name=model_name)
print("model object is successfully saved...")

# =================== Test saved model ===================

In [None]:
# Load the pickled model back from disk.
# NOTE: unpickling can execute arbitrary code, so only load model files that
# were produced by this notebook or come from a trusted source.
# The context manager closes the file handle once the model is loaded.
with open("../models/tfm_svm", 'rb') as model_file:
    model = pickle.load(model_file)

# Classification function

In [None]:
def classification_model(text):
    """
    Predict the sentiment of a single text with the module-level ``model``.

    params: text (string) - the phrase to classify
    returns: dict with the keys
        user_input    - the text that was passed in
        prediction    - the class returned by model.predict
        confidence    - the probability assigned to the predicted class
        class_ranking - every class mapped to its probability, ordered
                        from most to least probable
    """
    # The model works on batches, so wrap the single text in a list
    batch = [text]
    predicted_class = model.predict(batch)[0]
    probabilities = model.predict_proba(batch)[0]

    # Pair each class with its probability; model.classes_ is in the same
    # order as the columns returned by predict_proba
    scored_classes = zip(model.classes_, probabilities)
    # Highest probability first
    ranked = sorted(scored_classes, key=lambda pair: pair[1], reverse=True)
    class_ranking = dict(ranked)

    return {
        "user_input": text,
        "prediction": predicted_class,
        "confidence": class_ranking[predicted_class],
        "class_ranking": class_ranking,
    }

# Predict sentiment

In [None]:
print("Test model...")
# Dutch for "I am angry"
text = "ik ben boos"

# Time a single prediction
start_time = time.time()
result = classification_model(text)
elapsed = time.time() - start_time

# Attach the execution time to the result before printing it
result["execution_time"] = "%s seconds" % elapsed
print(f"predicted results for input: '{text}'\n\n{result}")