In [None]:
import os
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [None]:
def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit, or whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        ``text`` with all other characters (punctuation, symbols, underscores,
        non-ASCII letters) removed; nothing is inserted in their place.
    """
    # Negated character class: matches anything outside a-z, A-Z, 0-9 and whitespace.
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)

In [None]:
# Download the NLTK data packages used by this notebook.
# 'punkt' presumably backs word_tokenize (used for the embeddings below).
# NOTE(review): the tagger and WordNet packages are not referenced directly in
# the visible cells — confirm they are still required (e.g. by TextBlob).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; `nlp` is the module-level pipeline
# that POStag_extracting calls via nlp.pipe.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the three datasets. The first two use pandas' default separator;
# test.csv is parsed as tab-separated despite its .csv extension.
main_data = pd.read_csv('data/emotion_data_merged.csv')
kaggle_data = pd.read_csv('data/simplified_emotions_f.csv')
test = pd.read_csv('data/test.csv', delimiter='\t')

In [None]:
# Overwrite the 'Sentences' column of the main and Kaggle frames with the cleaned
# text (everything except letters, digits and whitespace removed).
# NOTE(review): `test` is not cleaned here — confirm that is intentional.
main_data['Sentences'] = main_data['Sentences'].apply(remove_special_characters)
kaggle_data['Sentences'] = kaggle_data['Sentences'].apply(remove_special_characters)

# Part of speech

In [None]:
def POStag_extracting(data, column='Sentence'):
    """Add per-row lists of the verbs and adjectives found by spaCy.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text to tag. It is modified in place.
    column : str, default 'Sentence'
        Name of the column in ``data`` that contains the sentences.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with two added columns, 'Verbs' and
        'Adjectives', each holding a list of token texts for that row.

    Notes
    -----
    Uses the module-level spaCy pipeline ``nlp``.
    """
    verbs_per_sentence = []
    adjectives_per_sentence = []

    for doc in nlp.pipe(data[column]):
        # Emit exactly one entry per row. Skipping documents that lack POS
        # annotation would leave these lists shorter than `data`, and the
        # column assignments below would then fail with a length mismatch.
        if doc.has_annotation('POS'):
            verbs = [token.text for token in doc if token.pos_ == 'VERB']
            adjectives = [token.text for token in doc if token.pos_ == 'ADJ']
        else:
            verbs, adjectives = [], []

        verbs_per_sentence.append(verbs)
        adjectives_per_sentence.append(adjectives)

    data['Verbs'] = verbs_per_sentence
    data['Adjectives'] = adjectives_per_sentence

    return data

In [None]:
# Tag each dataset. The column is passed explicitly: the function's default
# ('Sentence') does not match the 'Sentences' column used for the main and Kaggle
# frames elsewhere in this notebook, nor the 'sentence' column of the test set.
kaggle_data = POStag_extracting(kaggle_data, 'Sentences')
main_data = POStag_extracting(main_data, 'Sentences')
test = POStag_extracting(test, 'sentence')

In [None]:
# Inspect the verb and adjective lists added to the Kaggle frame.
kaggle_data[['Verbs', 'Adjectives']]

In [None]:
# Inspect the verb and adjective lists added to the main frame.
main_data[['Verbs', 'Adjectives']]

In [None]:
# Inspect the verb and adjective lists added to the test frame.
test[['Verbs', 'Adjectives']]

# Sentiment analysis

In [None]:
def analyze_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    str
        'Positive' when the polarity is above zero, 'Negative' when it is
        below zero, and 'Neutral' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

In [None]:
# Label every sentence with its TextBlob polarity class.
kaggle_data['sentiment'] = kaggle_data['Sentences'].apply(analyze_sentiment)
main_data['sentiment'] = main_data['Sentences'].apply(analyze_sentiment)
# The test frame's text column is 'sentence' (as used by the POS-tagging and
# embedding cells for this frame), not 'Sentences'.
test['sentiment'] = test['sentence'].apply(analyze_sentiment)

In [None]:
# Inspect the sentiment labels assigned to the Kaggle frame.
kaggle_data['sentiment']

In [None]:
# Inspect the sentiment labels assigned to the main frame.
main_data['sentiment']

In [None]:
# Inspect the sentiment labels assigned to the test frame.
test['sentiment']

# Word embedding

In [None]:
# Location of the pretrained word2vec binary. Set the WORD2VEC_MODEL_PATH
# environment variable to point at a local copy of the file; the original
# machine-specific path is kept as the fallback so existing setups still work.
model_path = os.environ.get(
    "WORD2VEC_MODEL_PATH",
    r"C:\Users\mened\OneDrive\Desktop\GoogleNews-vectors-negative300.bin",
)

In [None]:
def word_embeddings(sentence, model):
    """Return the mean word vector of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed; it is lower-cased and tokenised with ``word_tokenize``.
    model : object
        Source of the word vectors. Either a keyed-vectors object that
        supports ``word in model`` and ``model[word]`` directly, or a model
        that exposes such an object as ``model.wv``. Must provide
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Array of length ``model.vector_size`` holding the average of the
        vectors of the tokens found in the vocabulary. All zeros when no
        token is in the vocabulary.
    """
    # A KeyedVectors object (e.g. from load_word2vec_format) is the lookup
    # table itself, while full models keep it under `.wv`; accept both
    # instead of failing with AttributeError on the former.
    vectors = getattr(model, 'wv', model)

    total_vector = np.zeros(model.vector_size)
    known_words = 0
    for word in word_tokenize(sentence.lower()):
        if word in vectors:
            total_vector += vectors[word]
            known_words += 1

    # Average over the tokens that actually contributed a vector. Dividing by
    # the full token count would shrink the mean for every unknown token.
    if known_words > 0:
        total_vector /= known_words
    return total_vector

In [None]:
def add_embeddings_to_dataset(model, dataset, column):
    """Attach a sentence embedding to every row of ``dataset``.

    Each value of ``dataset[column]`` is passed to ``word_embeddings`` together
    with ``model``, and the results are stored in a new 'embedding' column.
    The frame is modified in place and nothing is returned.
    """
    dataset['embedding'] = [
        word_embeddings(text, model) for text in dataset[column]
    ]

In [None]:
# Load the pretrained vectors from the binary word2vec-format file at `model_path`.
# The result is a KeyedVectors object, which is what gets passed as `model` below.
word2vec_model = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
# Adds an 'embedding' column to kaggle_data in place (the function returns None).
add_embeddings_to_dataset(word2vec_model, kaggle_data, 'Sentences')

In [None]:
# Adds an 'embedding' column to main_data in place (the function returns None).
add_embeddings_to_dataset(word2vec_model, main_data, 'Sentences')

In [None]:
# Adds an 'embedding' column to test in place; its text column is 'sentence'.
add_embeddings_to_dataset(word2vec_model, test, 'sentence')

In [None]:
# Inspect the sentence embeddings stored on the Kaggle frame.
kaggle_data['embedding']

In [None]:
# Inspect the sentence embeddings stored on the main frame.
main_data['embedding']

In [None]:
# Inspect the sentence embeddings stored on the test frame.
test['embedding']