In [42]:
import pandas as pd
import numpy as np

In [43]:
emotion=pd.read_csv('dataset\combined_emotion.csv')
sentiment=pd.read_csv('dataset\combined_sentiment_data.csv')

In [44]:
emotion.head()

Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear


In [45]:
sentiment.head()

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,negative
1,"Good case, Excellent value.",positive
2,Great for the jawbone.,positive
3,Tied to charger for conversations lasting more...,negative
4,The mic is great.,positive


In [46]:
print(emotion.shape)
print(sentiment.shape)

(422746, 2)
(3309, 2)


In [47]:
print('Emotions are: ', emotion['emotion'].unique())
print('Sentiments are: ', sentiment['sentiment'].unique())

Emotions are:  ['fear' 'sad' 'love' 'joy' 'suprise' 'anger']
Sentiments are:  ['negative' 'positive']


In [48]:
emotion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422746 entries, 0 to 422745
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   sentence  422746 non-null  object
 1   emotion   422746 non-null  object
dtypes: object(2)
memory usage: 6.5+ MB


In [49]:
sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3309 entries, 0 to 3308
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentence   3309 non-null   object
 1   sentiment  3309 non-null   object
dtypes: object(2)
memory usage: 51.8+ KB


In [50]:
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

[nltk_data] Error loading stopwords: <urlopen error [WinError 10053]
[nltk_data]     An established connection was aborted by the software
[nltk_data]     in your host machine>


In [51]:
stop_words = set(stopwords.words('english'))  # Load stopwords


def preprocess_text(text):
    words = []
    sent_token = sent_tokenize(text)  # Sentence tokenization
    for sent in sent_token:
        tokens = simple_preprocess(sent)  # Word tokenization & preprocessing
        filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords and lemmatize
        words.extend(filtered_tokens)
    return words

In [52]:

words = [preprocess_text(sentence) for sentence in emotion['sentence']]


In [53]:
words

[['feel', 'really', 'helpless', 'heavy', 'hearted'],
 ['ive',
  'enjoyed',
  'able',
  'slouch',
  'relax',
  'unwind',
  'frankly',
  'needed',
  'last',
  'week',
  'around',
  'end',
  'uni',
  'expo',
  'lately',
  'started',
  'find',
  'feeling',
  'bit',
  'listless',
  'never',
  'really',
  'good',
  'thing'],
 ['gave', 'internship', 'dmrg', 'feeling', 'distraught'],
 ['dont', 'know', 'feel', 'lost'],
 ['kindergarten',
  'teacher',
  'thoroughly',
  'weary',
  'job',
  'taken',
  'university',
  'entrance',
  'exam',
  'suffered',
  'anxiety',
  'week',
  'want',
  'carry',
  'work',
  'study',
  'alternative'],
 ['beginning', 'feel', 'quite', 'disheartened'],
 ['would',
  'think',
  'whomever',
  'would',
  'lucky',
  'enough',
  'stay',
  'suite',
  'must',
  'feel',
  'like',
  'romantic',
  'place',
  'earth'],
 ['fear',
  'ever',
  'feel',
  'delicious',
  'excitement',
  'christmas',
  'eve',
  'least',
  'way',
  'remember'],
 ['im', 'forever', 'taking', 'time', 'lie', 

In [54]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X = words
y = emotion["emotion"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:

from gensim.models import Word2Vec
model = Word2Vec(sentences=X_train, vector_size=100, window=5, min_count=1, workers=4)


In [56]:
# Define Function to Get Sentence Vector
def avg_word2vec(doc):
    vectors = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [57]:
# Convert X_train & X_test into Word Embeddings
X_train_vectors = np.array([avg_word2vec(sentence) for sentence in X_train])
X_test_vectors = np.array([avg_word2vec(sentence) for sentence in X_test])

### Model training

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Initialize models
logistic = LogisticRegression(max_iter=500, multi_class="multinomial", solver="lbfgs")

logistic.fit(X_train_vectors, y_train)  # Train the model
    
# Make predictions
y_train_pred = logistic.predict(X_train_vectors)
y_test_pred = logistic.predict(X_test_vectors)

# Evaluate performance on training data
model_train_accuracy = accuracy_score(y_train, y_train_pred)
model_train_f1 = f1_score(y_train, y_train_pred, average="weighted")
model_train_precision = precision_score(y_train, y_train_pred, average="weighted")
model_train_recall = recall_score(y_train, y_train_pred, average="weighted")

# Evaluate performance on test data
model_test_accuracy = accuracy_score(y_test, y_test_pred)
model_test_f1 = f1_score(y_test, y_test_pred, average="weighted")
model_test_precision = precision_score(y_test, y_test_pred, average="weighted")
model_test_recall = recall_score(y_test, y_test_pred, average="weighted")

# Print model performance

print("Model performance for Training set")
print(f"- Accuracy: {model_train_accuracy:.4f}")
print(f"- F1 score: {model_train_f1:.4f}")
print(f"- Precision: {model_train_precision:.4f}")
print(f"- Recall: {model_train_recall:.4f}")
print("-" * 35)

print("Model performance for Test set")
print(f"- Accuracy: {model_test_accuracy:.4f}")
print(f"- F1 score: {model_test_f1:.4f}")
print(f"- Precision: {model_test_precision:.4f}")
print(f"- Recall: {model_test_recall:.4f}")
print("=" * 35)
print("\n")




Model performance for Training set
- Accuracy: 0.6776
- F1 score: 0.6653
- Precision: 0.6683
- Recall: 0.6776
-----------------------------------
Model performance for Test set
- Accuracy: 0.6755
- F1 score: 0.6631
- Precision: 0.6662
- Recall: 0.6755




In [61]:
# Example Prediction
example_sentences =  [
        "The weather is wonderful today!",
        "I am feeling really sad and lonely.",
        "What an exciting game we had last night!",
        "I can't believe how much I've learned this year!",
        "I was so angry during the meeting today.",
        "I can't believe how everything turned out today, it's a bit overwhelming.",
        "I guess I'm doing okay, but it's not as good as I hoped.",
        "That was such a fantastic presentation, even though I made some mistakes.",
        "I'm so glad I made it through the day, but I'm exhausted!",
        "I can't stop thinking about how much better things could have been.",
        "It was a long day, but I had a lot of fun!"

    ]
example_preprocessed = [preprocess_text(sentence) for sentence in example_sentences]
example_vectors = np.array([avg_word2vec(sentence) for sentence in example_preprocessed])

# Predict the emotions for the example sentences
example_predictions = logistic.predict(example_vectors)

for sentence, emotion in zip(example_sentences, example_predictions):
    print(f"Sentence: '{sentence}' --> Predicted Emotion: {emotion}")

Sentence: 'The weather is wonderful today!' --> Predicted Emotion: joy
Sentence: 'I am feeling really sad and lonely.' --> Predicted Emotion: sad
Sentence: 'What an exciting game we had last night!' --> Predicted Emotion: fear
Sentence: 'I can't believe how much I've learned this year!' --> Predicted Emotion: joy
Sentence: 'I was so angry during the meeting today.' --> Predicted Emotion: anger
Sentence: 'I can't believe how everything turned out today, it's a bit overwhelming.' --> Predicted Emotion: suprise
Sentence: 'I guess I'm doing okay, but it's not as good as I hoped.' --> Predicted Emotion: joy
Sentence: 'That was such a fantastic presentation, even though I made some mistakes.' --> Predicted Emotion: sad
Sentence: 'I'm so glad I made it through the day, but I'm exhausted!' --> Predicted Emotion: sad
Sentence: 'I can't stop thinking about how much better things could have been.' --> Predicted Emotion: joy
Sentence: 'It was a long day, but I had a lot of fun!' --> Predicted Emot

In [62]:
## Saving model

model.save("word2vec_model.bin")

In [64]:
import pickle

with open("emotion_classifier_logistic.pkl", "wb") as file:
    pickle.dump(logistic, file)


In [68]:
import joblib

# Saving the model efficiently
joblib.dump(model, 'word2vec_model.joblib')


['word2vec_model.joblib']

In [69]:
joblib.dump(logistic,'emotion_classifier_logistic.joblib')

['emotion_classifier_logistic.joblib']