# Import requirements

In [None]:
import numpy
import pandas as pd
import string
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import (classification_report, 
                             confusion_matrix, 
                             accuracy_score)
import os
import pickle
import time
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)

# Step 1: Open csv

In [None]:
# Read the tweets CSV from disk.
path_to_csv = "../dataset/Tweets.csv"
dataframe = pd.read_csv(path_to_csv)

# Keep rows whose label confidence is at least 0.75, then drop rows that are
# exact duplicates across every column.
dataframe = dataframe.loc[dataframe['airline_sentiment_confidence'] >= 0.75]
dataframe = dataframe.drop_duplicates()

# Only the tweet text and its sentiment label are used further on.
filtered_df = dataframe.filter(items=["text", "airline_sentiment"])
filtered_df

# Step 2: Extract phrases and labels

In [None]:
# Raw tweet texts (model input) as a NumPy array
training_phrases = filtered_df["text"].to_numpy()
# Sentiment label of each tweet (model target), aligned with training_phrases
training_labels = filtered_df["airline_sentiment"].to_numpy()

# Step 3: Plot label distribution

In [None]:
def plot_label_dist(labels):
    """
    Draw a bar chart with the number of occurrences of each distinct label.

    param: array-like of labels
    returns: None (the figure is displayed with plt.show())
    """
    unique_labels, counts = numpy.unique(labels, return_counts=True)

    figure = plt.figure()
    # Axes rectangle is [left, bottom, width, height] in figure fractions
    axes = figure.add_axes([0, 0, 1, 1])
    # Labels are cast to str before being used as bar positions
    axes.bar(unique_labels.astype("str"), counts)
    axes.set_ylabel('Count')
    axes.set_title('Label distribution')
    plt.show()

# Show how many examples each sentiment label has
plot_label_dist(training_labels)

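# Step 4: Text preprocessor
- lowercase
- remove punctuation (not done by the preprocessor code itself; `CountVectorizer`'s default tokenizer ignores punctuation later on)
- collapse repeated spaces and strip leading/trailing whitespace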

In [None]:
def text_preprocessor(training_phrases):
    """
    Clean every sentence in an iterable of strings.

    Each sentence is lowercased, runs of consecutive spaces are collapsed
    into a single space, and leading/trailing whitespace is stripped.

    input: array of strings
    output: list of cleaned strings, in the same order as the input
    """
    return [
        re.sub(' +', ' ', phrase.lower()).strip()
        for phrase in training_phrases
    ]

# Clean all training phrases in one pass
clean_tps = text_preprocessor(training_phrases)

# Sanity check: show one example (index 20) before and after cleaning
print(f"Before preprocessor:\n{training_phrases[20]}\n\nAfter preprocessor: \n{clean_tps[20]}")

# ======================= Featurization =======================

# Step 5: Countvectorizer

In [None]:
# Bag-of-words featurizer limited to the 5000 most frequent terms.
# NOTE(review): the vocabulary is learned from every cleaned phrase, i.e.
# before the train/validation split, so validation text influences the
# feature set.
vectorizer = CountVectorizer(max_features=5000)
# Learn the vocabulary and build the document-term matrix in a single pass
sparse_features = vectorizer.fit_transform(clean_tps)

# Checkpoint 

In [None]:
# Inspect the feature matrix: (number of phrases, vocabulary size)
print(f"Shape of features: {sparse_features.shape}")
# Dense view of the first phrase's count vector
print(f"Vectorized example: {sparse_features[0].toarray()}")

# Step 6: Split data in train and test

In [None]:
# Hold out 30% of the examples for validation. The fixed random_state makes
# the split reproducible, and stratify keeps the label proportions of
# training_labels in both parts.
x_train, x_val, y_train, y_val = train_test_split(sparse_features, 
                                                  training_labels,
                                                  test_size = 0.30, 
                                                  random_state = 1, 
                                                  stratify = training_labels)

# =============== Classification algorithm ===============

# Step 7: SVM model

In [None]:
# Linear-kernel SVM; probability=True enables predict_proba, which is used
# later to rank the classes. fit() returns the fitted estimator itself.
clf = SVC(kernel='linear', probability=True).fit(x_train, y_train)

# Quick accuracy check on the held-out validation split
y_pred = clf.predict(x_val)
print(accuracy_score(y_val, y_pred))

# Step 8: Model evaluation

In [None]:
# Per-class precision, recall and F1 on the validation split
print(classification_report(y_val,y_pred))
# Confusion matrix of true labels (y_val) against predicted labels (y_pred)
print(confusion_matrix(y_val,y_pred))
print()
# Overall fraction of correctly classified validation examples
print(accuracy_score(y_val,y_pred))

# =============== Save objects to disk ===============
- count vectorizer
- trained model

## Paths

In [None]:
# Output directories, relative to the current working directory. Both end
# with a separator because they are concatenated directly with a file name.
path_to_model = "../models/"
path_to_features = "../featurizers/"
# File name used for the pickled vectorizer
feature_name = 'sparse_features'
# File name used for the pickled classifier
model_name = 'svm'

## Save as pickle

In [None]:
def save_as_pickle(obj, dir_name, file_name):
    """
    Pickle an object to file_name inside dir_name, creating the directory if needed.

    params:
        obj: any picklable object
        dir_name: target directory, with or without a trailing separator;
                  an empty string means the current working directory
        file_name: name of the file to write inside dir_name
    returns: None
    raises: OSError if the directory or file cannot be created,
            pickle.PicklingError if obj cannot be pickled
    """
    if dir_name:
        # exist_ok avoids the check-then-create race of os.path.exists()
        os.makedirs(dir_name, exist_ok=True)
    # Equals dir_name + file_name when dir_name ends with a separator, and
    # still lands inside the directory when it does not
    target_path = os.path.join(dir_name, file_name)
    # The context manager guarantees the handle is flushed and closed
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle)
    
# Persist the fitted vectorizer so new text can be featurized the same way
save_as_pickle(vectorizer, path_to_features, feature_name)
print("feature object is succesfully saved...")
# Persist the trained classifier
save_as_pickle(clf, path_to_model, model_name)
print("model object is successfully saved...")

# =================== Test saved model ===================

In [None]:
# Load model and vectorizer
# NOTE: unpickling can execute arbitrary code; only load files written by
# this notebook or another trusted source.
# Context managers close the file handles once the objects are loaded.
with open("../models/svm", 'rb') as model_file:
    model = pickle.load(model_file)
with open("../featurizers/sparse_features", 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Step 1: Preprocessing

In [None]:
def text_preprocessor(sentence):
    """
    this function cleans a single text
    param: string
    returns: lowercased string with runs of spaces collapsed to one space
             and leading/trailing whitespace stripped
    """
    lowered = sentence.lower()
    # Collapse every run of consecutive spaces into a single space
    single_spaced = re.sub(' +', ' ', lowered)
    return single_spaced.strip()

# Step 2: Feature extraction

In [None]:
def count_vectorizer(clean_text):
    """
    this function vectorizes a single text with the module-level `vectorizer`
    params: string
    returns: the result of vectorizer.transform for a one-element batch
    """
    # transform() expects a collection of documents, hence the one-item list
    return vectorizer.transform([clean_text])

# Step 3: Classification

In [None]:
def classification_model(text, clean_text, vectorized_text):
    """
    this function predicts the sentiment of a text with the module-level `model`
    params:
        text: the raw user input (string)
        clean_text: the preprocessed text (string)
        vectorized_text: vectorized form of the text, holding a single row
    returns: dict with keys user_input, clean_text, prediction, confidence
             and class_ranking (class -> probability, highest first)
    """
    # Predicted label for the single input row
    predicted_label = model.predict(vectorized_text)[0]
    # Class probabilities for that row, paired below with model.classes_
    probabilities = model.predict_proba(vectorized_text)[0]
    # Pair every class with its probability, most likely class first
    ranked_pairs = sorted(zip(model.classes_, probabilities),
                          key=lambda pair: pair[1],
                          reverse=True)
    class_ranking = dict(ranked_pairs)

    # Confidence is the probability assigned to the predicted label
    return {
        "user_input": text,
        "clean_text": clean_text,
        "prediction": predicted_label,
        "confidence": class_ranking[predicted_label],
        "class_ranking": class_ranking,
    }

# Combine all steps

In [None]:
# Example input for the end-to-end run
text = "I am sad"
start_time = time.time()

# Pipeline: clean the text -> vectorize it -> classify it
clean_text = text_preprocessor(text)
vectorized_text = count_vectorizer(clean_text)
result = classification_model(text, clean_text, vectorized_text)

# Record the wall-clock time the three steps took
result["execution_time"] = "%s seconds" % (time.time() - start_time)
result