# Visualizing

In [5]:
import pickle

In [7]:
import pandas as pd
import numpy as np
import os

#Basic cleaning
import string
import re

#ML tokenizing, lemmatizing and vectorizing
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


def basic_cleaning(text):
    """
    Normalise a raw string so it is ready for the later preprocessing steps.
    Every text goes through this universal step.

    The steps run in this order: trim surrounding whitespace, lowercase,
    drop standalone digit runs, drop ASCII punctuation, tokenize with NLTK,
    lemmatize each token and re-join the tokens with single spaces.

    Input: String
    Output: String
    """
    # Trim the edges and lowercase in one go
    cleaned = text.strip().lower()

    # Numbers that stand on their own (digits glued to letters are kept)
    cleaned = re.sub(r'\b\d+\b', '', cleaned)

    # Every character listed in string.punctuation
    punctuation_class = rf"[{re.escape(string.punctuation)}]"
    cleaned = re.sub(punctuation_class, '', cleaned)

    # Tokenize first, then lemmatize token by token
    tokens = word_tokenize(cleaned)
    lemmatize = WordNetLemmatizer().lemmatize

    return " ".join(lemmatize(token) for token in tokens)

def preprocess_series(X):
    """
    Pipeline helper: apply basic_cleaning to every text in X.

    This function has to be available in the namespace whenever the
    pipeline is loaded.

    Input: Iterable of strings
    Output: List of cleaned strings, in the same order as X
    """
    cleaned_texts = []
    for raw_text in X:
        cleaned_texts.append(basic_cleaning(raw_text))
    return cleaned_texts

In [8]:
# Load the pickled preprocessing pipeline from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
# NOTE(review): the path is absolute and machine-specific; this cell fails on
# any machine that lacks this exact directory layout.
with open ("/Users/johannesb/code/Jojo2813/SentiScope/preprocessing_pipelines/preproc_pipeline_ml.pkl", 'rb') as file:
    pipe = pickle.load(file)

In [17]:
# Pull the step stored under the 'vectorizer' key out of the loaded pipeline;
# its feature names are needed later to map matrix columns back to tokens.
vectorizer = pipe['vectorizer']

In [None]:
# Example review, wrapped in a one-element Series so that the pipeline
# receives an iterable of texts rather than a bare string.
review = pd.Series("This is a very bad review but the delivery was good")
# Run the whole pipeline on the review; the result is a sparse feature matrix
# with one row per input text.
vector_review = pipe.transform(review)

In [22]:
# Echo the transformed review so the notebook shows its shape and sparsity.
vector_review

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [None]:
import joblib

In [12]:
# Load the trained classifier from disk (presumably a logistic regression,
# judging by the file name; confirm against the training code).
# NOTE(review): joblib.load unpickles the file, so only load trusted files.
# NOTE(review): the path is absolute and machine-specific.
model = joblib.load("/Users/johannesb/code/Jojo2813/SentiScope/model/logreg_full.pkl")

In [23]:
# Predict the label for the vectorized example review.
sentiment = model.predict(vector_review)

In [24]:
# First row of the model's coefficient matrix.
# NOTE(review): assumes a binary classifier whose coef_ has a single row;
# confirm before reusing this with a multi-class model.
coefs = model.coef_[0]
# Feature names of the vectorizer, indexed by matrix column.
feature_names = vectorizer.get_feature_names_out()

In [25]:
# Column indices of the features that are non-zero for this review
# (nonzero() returns (rows, cols); only the columns are needed).
input_indices = vector_review.nonzero()[1]
# Densify the single row and pick the weights of exactly those features.
tfidf_values = vector_review.toarray()[0][input_indices]
# Translate the column indices back into the tokens / n-grams they stand for.
input_tokens = [feature_names[i] for i in input_indices]
# Model coefficients of the same features, aligned with input_tokens.
word_coefs = coefs[input_indices]

In [26]:
 # Compute word contributions
# Element-wise product: feature weight in the review times model coefficient.
contributions = tfidf_values * word_coefs
# Token -> contribution lookup; both sequences share the input_indices order.
contrib_dict = dict(zip(input_tokens, contributions))

In [27]:
# Sort contributions to find top positives and negatives.
# Ascending order: the most negative word comes first, the most positive last.
sorted_items = sorted(contrib_dict.items(), key=lambda x: x[1])
# Keep at most two words per side, and only words whose contribution really
# has that sign. Without the sign check a review with fewer than four known
# tokens would list the same word as both "top negative" and "top positive",
# and an all-positive review would still report "negative" words.
top_negative = [w for w, c in sorted_items[:2] if c < 0]
top_positive = [w for w, c in sorted_items[-2:] if c > 0]

In [28]:
# Echo the per-token contributions, in input_tokens order.
contributions

array([-2.77107717,  2.52566384, -0.12305621,  0.18185113,  0.52103514,
       -0.14851631,  0.88507473,  0.10844661, -2.83168388])

In [29]:
# Echo the selected words: (most negative, most positive).
top_negative, top_positive

(['very bad', 'bad'], ['this is', 'bad review'])