## Beyond Merrity zadatak

In [None]:
# Imports
import pandas as pd
import re
from deep_translator import GoogleTranslator
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Show DataFrames in full: no truncation of columns, rows or cell contents
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Folder holding train.xlsx and test.xlsx -- change this one path to match
# the machine the notebook runs on
DATA_DIR = 'D:/Documents/Posao/Prijave/Spona_code/Zadatak'

# Read data
train = pd.read_excel(f'{DATA_DIR}/train.xlsx')
test = pd.read_excel(f'{DATA_DIR}/test.xlsx')

In [None]:
# Hand-built Croatian sentiment lexicon. Each set groups the words/phrases for
# one score level; get_sentiment() adds -3 for minus_three ... +5 for five.

minus_three = {
    'stalno', 'previše', 'nema', 'nema ravnoteže', 'neravnoteže',
    'na strani poslovnog', 'nefleksibilnost',
}

minus_two = {'neravnoteža', 'loše', 'narušen'}

minus_one = {
    'manje privatnog života', 'nije najbolje', 'covid loše',
    'manjak home-office', 'izazove', 'dosta malo privatnog',
}

# Neutral expressions; get_sentiment() does not consult this set.
zero = {'kako kad', 'nije kod svih isto'}

one = {'može bolje', 'raditi sam na tome', 'kompromisi', 'ovisno', 'ovisi'}

# 'balansirano' is also listed in `four`; get_sentiment() checks `two` first.
two = {'ovisi o meni', 'podnošljivo', 'povremeno optimalno', 'balansirano'}

three = {
    'pozitivno', 'pozitivno iskustvo',
    'zadovoljna', 'zadovoljan',
    'ok', 'ok je', 'fair',
    'balans', 'postoji balans',
    'dobro', 'dobar',
    'fleksibilnost',
}

four = {
    'odlično', 'vrlo dobro', 'vrlo zadovoljna',
    'ravnoteža u potpunosti', 'dobra ravnoteža',
    'uravnoteženo', 'balansirano',
    'fleksibilno', 'otvorenog uma', 'prihvatljivo',
}

five = {'sklad', 'super'}

In [None]:
# Determine sentiment on predefined categories

def get_sentiment(sentence):
    """Score a sentence against the hand-built sentiment lexicon.

    The sentence is tokenized with ``nltk.word_tokenize`` and lower-cased.
    Scanning left to right, the longest run of consecutive tokens that
    (joined by single spaces) appears in one of the lexicon sets is scored
    and consumed; tokens that start no lexicon entry are skipped. Matching
    whole phrases is what allows multi-word entries such as 'vrlo dobro' or
    'nije najbolje' to contribute -- a per-token lookup can never equal them.

    Parameters
    ----------
    sentence : str
        Text to score. Any non-string value (for example a missing cell)
        scores 0.

    Returns
    -------
    int
        Running sum of the matched weights, clamped to [-5, 5] after every
        match.
    """
    # Listed in lookup-priority order: an entry that appears in more than one
    # set takes the weight of the first set that contains it.
    weighted_lexicons = (
        (minus_three, -3),
        (minus_two, -2),
        (minus_one, -1),
        (one, 1),
        (two, 2),
        (three, 3),
        (four, 4),
        (five, 5),
    )

    if not isinstance(sentence, str):
        return 0

    tokens = [token.lower() for token in nltk.word_tokenize(sentence)]

    # Longest lexicon entry, counted in whitespace-separated words; bounds the
    # window that has to be tried at each position.
    longest_phrase = max(
        (len(phrase.split())
         for lexicon, _ in weighted_lexicons
         for phrase in lexicon),
        default=1,
    )

    sent = 0
    position = 0
    while position < len(tokens):
        consumed = 1  # no match: move on by a single token
        widest = min(longest_phrase, len(tokens) - position)
        # Try the longest candidate first so 'ok je' wins over 'ok' and the
        # same words are never counted twice.
        for size in range(widest, 0, -1):
            candidate = ' '.join(tokens[position:position + size])
            weight = next(
                (value for lexicon, value in weighted_lexicons
                 if candidate in lexicon),
                None,
            )
            if weight is not None:
                # Clamp the running total after each match.
                sent = max(-5, min(5, sent + weight))
                consumed = size
                break
        position += consumed
    return sent

In [None]:
# Score every test review with the hand-built lexicon
test['my_sentiment'] = test['review'].map(get_sentiment)
test

- nedostaje puno riječi, pa je puno rezultata 0
- za riječi iz rječnika relativno dobri rezultati

## Preprocessing

In [None]:
# Translate to english
translator = GoogleTranslator(source='hr', target='en')

# Translate each review in row order with a single column assignment; this
# does not depend on the frame having a 0..n-1 index.
train['review_en'] = train['review'].apply(translator.translate)

In [None]:
# Clean, remove punctuation, digits...
def clean(text):
    """Replace every run of characters other than A-Z / a-z with one space.

    Digits, punctuation, whitespace and non-ASCII letters all count as
    "other". Leading or trailing runs become a single leading or trailing
    space; nothing is stripped.
    """
    non_letter_run = re.compile('[^A-Za-z]+')
    return non_letter_run.sub(' ', text)

# Add a letters-only version of each translated review
train['clean_reviews'] = train['review_en'].map(clean)
train

In [None]:
# Remove stopwords and tag parts of speech

# NLTK's English stopword list
sw = stopwords.words('english')
# Words taken back out of the removal list, so they stay in the text
not_sw = {'not', 'no', 'can', 'do', 'very'}
new_sw = set(sw) - not_sw


# First letter of a pos_tag() tag -> matching WordNet part-of-speech constant
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}
def token_stop_pos(text):
    """Tokenize and POS-tag ``text``, dropping stopwords.

    Returns a list of ``(word, wordnet_pos)`` tuples in the original word
    order. ``wordnet_pos`` is looked up in ``pos_dict`` by the first letter
    of the tag and is ``None`` when that letter has no entry. A word is
    dropped when its lower-cased form is in ``new_sw``; the word itself is
    returned with its original casing.
    """
    return [
        (token, pos_dict.get(tag[0]))
        for token, tag in pos_tag(word_tokenize(text))
        if token.lower() not in new_sw
    ]

# (word, WordNet POS) pairs for every cleaned training review
train['pos'] = train['clean_reviews'].map(token_stop_pos)
train

In [None]:
# Create lemmas
# Single WordNet lemmatizer instance, used by lemmatize() for every word
# that has a part-of-speech hint.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize(pos_data):
    """Join the lemmas of ``(word, pos)`` pairs into one string.

    Words whose ``pos`` is falsy (no WordNet mapping) are kept as-is; all
    others are passed through ``wordnet_lemmatizer`` with that ``pos``.
    The result starts with one space and every lemma is preceded by another
    space, so non-empty output begins with two spaces and empty input
    yields ``" "``.
    """
    pieces = [" "]
    for token, wn_pos in pos_data:
        if wn_pos:
            token = wordnet_lemmatizer.lemmatize(token, pos=wn_pos)
        pieces.append(" " + token)
    return "".join(pieces)

# Lemmatized text rebuilt from the tagged tokens
train['lemma'] = train['pos'].map(lemmatize)
train

## TextBlob

In [None]:
# TextBlob polarity
def getPolarity(review):
    """Return the TextBlob sentiment polarity of ``review``."""
    blob = TextBlob(review)
    return blob.sentiment.polarity

In [None]:
# TextBlob polarity of the lemmatized training reviews
train['polarity'] = train['lemma'].map(getPolarity)
train.head()

In [None]:
# Rescale polarity to get in range of sentiment (-5 - 5 instead of -1 - 1)

def rescale(col):
    """Map a polarity score on the -1..1 scale to an integer sentiment -5..5.

    Buckets::

        col <  -0.8           -> -5
        -0.8 <= col <= -0.6   -> -4
        -0.6 <  col <= -0.4   -> -3
        -0.4 <  col <= -0.2   -> -2
        -0.2 <  col <  0      -> -1
        col == 0              ->  0
        0    <  col <= 0.2    ->  1
        0.2  <  col <= 0.4    ->  2
        0.4  <  col <= 0.6    ->  3
        0.6  <  col <  0.8    ->  4
        col >= 0.8            ->  5

    Scores outside [-1, 1] are clamped to the nearest end bucket, so a value
    below -1 yields -5. A NaN score is returned unchanged so that a missing
    polarity stays missing.

    NOTE(review): the end buckets are not mirror images -- exactly -0.8 maps
    to -4 while exactly 0.8 maps to 5. Kept as-is so existing results do not
    shift; confirm which side is intended.
    """
    if col != col:  # NaN is the only value that differs from itself
        return col
    if col == 0:
        return 0

    if col < 0:
        if col < -0.8:
            return -5
        if col <= -0.6:
            return -4
        if col <= -0.4:
            return -3
        if col <= -0.2:
            return -2
        return -1

    if col <= 0.2:
        return 1
    if col <= 0.4:
        return 2
    if col <= 0.6:
        return 3
    if col < 0.8:
        return 4
    return 5

# TextBlob polarity expressed on the -5..5 sentiment scale
train['blob_sentiment'] = train['polarity'].map(rescale)
train

## TextBlob test

In [None]:
# Run the test set through the same steps as the training set:
# translate -> clean -> tag/stopwords -> lemmatize -> TextBlob polarity -> rescale

# Translate each review in row order with a single column assignment; this
# does not depend on the frame having a 0..n-1 index.
test['review_en'] = test['review'].apply(translator.translate)

test['clean_reviews'] = test['review_en'].apply(clean)

test['pos'] = test['clean_reviews'].apply(token_stop_pos)

test['lemma'] = test['pos'].apply(lemmatize)

test['polarity'] = test['lemma'].apply(getPolarity)
test['blob_sentiment'] = test['polarity'].apply(rescale)

test

- također puno nula, kontekst se ne uzima u obzir 

## VADER

In [None]:
# Create vader analyzer, add new relevant words and weights

analyzer = SentimentIntensityAnalyzer()

# Domain-specific additions to the VADER lexicon (word -> valence).
# NOTE(review): 'no balance' is a two-word key; VADER appears to look up
# individual tokens, so confirm this entry can ever match.
new_words = {
    # negative
    'overtime': -3.0,
    'no balance': -3.0,
    'inflexibility': -3.0,
    'inflexible': -3.0,
    'disturbed': -2.0,
    'imbalance': -2.0,
    # neutral
    'depends': 0.0,
    # positive
    'compromise': 1.0,
    'compromises': 1.0,
    'tolerably': 2.0,
    'tolerable': 2.0,
    'balanced': 3.0,
    'flexibility': 3.0,
    'flexible': 3.0,
    'established': 4.0,
    'open-minded': 4.0,
    'harmony': 5.0,
}

analyzer.lexicon.update(new_words)
 
def vadersentimentanalysis(review):
    """Return the 'compound' entry of the analyzer's polarity scores for ``review``."""
    return analyzer.polarity_scores(review)['compound']

# VADER compound score of each translated (not cleaned) training review
train['vader_polarity'] = train['review_en'].map(vadersentimentanalysis)

In [None]:
# Rescale for easier comparison with sentiment
train['vader_sentiment'] = train['vader_polarity'].map(rescale)
train

## VADER test

In [None]:
# Apply VADER to test set and rescale
test['vader_polarity'] = test['review_en'].map(vadersentimentanalysis)
test['vader_sentiment'] = test['vader_polarity'].map(rescale)
test

- bolje od textblob-a pošto se mogu dodati custom riječi

## Vectorizer, Supervised Learning

In [None]:
# Create Vectorizers: unigram counts and unigram TF-IDF, each capped at 100 features
cvect = CountVectorizer(max_features=100, ngram_range=(1, 1))
tfvect = TfidfVectorizer(max_features=100, ngram_range=(1, 1))

# Fit both on the translated training reviews
csparse = cvect.fit_transform(train['review_en'])
tfsparse = tfvect.fit_transform(train['review_en'])

In [None]:
# Transform sparse matrix to dataframe
# get_feature_names() is gone from current scikit-learn; prefer its
# replacement and fall back only on releases that do not have it yet.
if hasattr(cvect, 'get_feature_names_out'):
    feature_names = cvect.get_feature_names_out()
else:
    feature_names = cvect.get_feature_names()

train_df = pd.DataFrame(csparse.toarray(), columns=feature_names)
train_df['sentiment'] = train.sentiment
train_df.shape

In [None]:
# Train models and make predictions

# Features are the word-count columns; the target is the labelled sentiment
X = train_df.drop('sentiment', axis=1)
y = train_df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# fit() returns the estimator itself, so each model is created and trained in one step
bayes = MultinomialNB().fit(X_train, y_train)
svc = LinearSVC().fit(X_train, y_train)

y_pred_b = bayes.predict(X_test)
y_pred_s = svc.predict(X_test)

# Predictions of both models, followed by the true labels of the held-out split
print(y_pred_b)
print(y_pred_s)
print(list(y_test))

In [None]:
# Accuracy on the held-out split: Naive Bayes first, then LinearSVC
print(
    accuracy_score(y_test, y_pred_b),
    accuracy_score(y_test, y_pred_s),
    sep='\n',
)

## Vectorizer, Supervised Learning test

In [None]:
# Vectorize, transform to dataframe, predict
# Reuse the vocabulary fitted on the training reviews. transform() yields the
# same columns, in the same order, that the classifier was trained on;
# refitting on the test reviews would give a different feature set.
csparse_test = cvect.transform(test.review_en)

# get_feature_names() is gone from current scikit-learn; prefer its
# replacement and fall back only on releases that do not have it yet.
if hasattr(cvect, 'get_feature_names_out'):
    test_feature_names = cvect.get_feature_names_out()
else:
    test_feature_names = cvect.get_feature_names()

test_df = pd.DataFrame(csparse_test.toarray(), columns=test_feature_names)
test_y_pred = bayes.predict(test_df)
test['bayes_sentiment'] = test_y_pred
test

- vrlo nizak accuracy, bilo bi bolje s većim setom za treniranje, a i onda bi se isplatilo namještati detalje modela i preprocessing-a

## Usporedba

In [None]:
# Original text, translation, true label and both lexicon-based predictions
train_pred = train.loc[
    :,
    ['review', 'review_en', 'sentiment', 'blob_sentiment', 'vader_sentiment'],
]
train_pred

In [None]:
# Every possible sentiment value, so all panels share the same x axis
sentiment_range = list(range(-5, 6))

# Side by side: true labels, TextBlob predictions, VADER predictions
fig = plt.figure(figsize=(15, 5))
ax1 = plt.subplot(1, 3, 1)
sns.countplot(data=train, x='sentiment', order=sentiment_range, ax=ax1)
ax2 = plt.subplot(1, 3, 2, sharey=ax1)
sns.countplot(data=train, x='blob_sentiment', order=sentiment_range, ax=ax2)
ax3 = plt.subplot(1, 3, 3, sharey=ax1)
sns.countplot(data=train, x='vader_sentiment', order=sentiment_range, ax=ax3)

In [None]:
# Exact-match accuracy of each rescaled prediction against the labelled sentiment
print('Blob Accuracy:',
      accuracy_score(y_true=train['sentiment'], y_pred=train['blob_sentiment']))
print('VADER Accuracy',
      accuracy_score(y_true=train['sentiment'], y_pred=train['vader_sentiment']))

In [None]:
# Original text, translation and the prediction of every method for the test set
test_pred = test.loc[
    :,
    ['review', 'review_en', 'my_sentiment', 'blob_sentiment',
     'vader_sentiment', 'bayes_sentiment'],
]
test_pred

In [None]:
# Side by side: hand-built lexicon, TextBlob, VADER and Naive Bayes predictions
fig = plt.figure(figsize=(15, 5))
ax1 = plt.subplot(1, 4, 1)
sns.countplot(data=test, x='my_sentiment', order=sentiment_range, ax=ax1)
ax2 = plt.subplot(1, 4, 2, sharey=ax1)
sns.countplot(data=test, x='blob_sentiment', order=sentiment_range, ax=ax2)
ax3 = plt.subplot(1, 4, 3, sharey=ax1)
sns.countplot(data=test, x='vader_sentiment', order=sentiment_range, ax=ax3)
ax4 = plt.subplot(1, 4, 4, sharey=ax1)
sns.countplot(data=test, x='bayes_sentiment', order=sentiment_range, ax=ax4)

In [None]:
#train_pred.to_excel('D:/Documents/Posao/Prijave/Spona_code/Zadatak/train_pred.xlsx')
#test_pred.to_excel('D:/Documents/Posao/Prijave/Spona_code/Zadatak/test_pred.xlsx')

### Zaključak
- Određivanje sentimenta samo preko ručno definiranih kategorija riječi bi bilo bolje s više definiranih riječi, ali vjerojatno bi bilo lakše koristiti preddefinirane (TextBlob, VADER...), iako u tom slučaju utječe i kvaliteta prijevoda
- TextBlob nizak accuracy jer ne uzima u obzir work-life balance kontekst
- Za supervised learning premalo uzoraka 
- Iako VADER daje najbolje rezultate jer je moguće dodati vlastite riječi i weights još uvijek je accuracy samo 30%
    - bilo bi bolje s više uzoraka i više dodanih riječi s preciznije određenim weights-ima
- čišćenje teksta (tokenizacija, lematizacija...) ili nije imalo utjecaja na rezultate ili su rezultati čak bili lošiji
- isprobati preprocessing s paketima za hrvatski jezik i fuzzy matching