In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install pandas scikit-learn
import pandas as pd

# Load the dataset
df = pd.read_parquet('/content/drive/MyDrive/Project_dataset/Textdata_emotion.parquet')

# Preview the first few rows
print(df.head())

                                                text  label
0  i feel awful about it too because it s my job ...      0
1                              im alone i feel awful      0
2  ive probably mentioned this before but i reall...      1
3           i was feeling a little low few days back      0
4  i beleive that i am much more sensitive to oth...      2


In [4]:
# Check the structure of the data
print(df.info())

# Check the distribution of the target labels (emotions)
print(df['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    416809 non-null  object
 1   label   416809 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.4+ MB
None
label
1    141067
0    121187
3     57317
4     47712
2     34554
5     14972
Name: count, dtype: int64


In [5]:
# Map the labels to emotion names for better readability
emotion_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
df['emotion'] = df['label'].map(emotion_map)

# Preview the updated dataframe
print(df[['text', 'emotion']].head())

                                                text  emotion
0  i feel awful about it too because it s my job ...  sadness
1                              im alone i feel awful  sadness
2  ive probably mentioned this before but i reall...      joy
3           i was feeling a little low few days back  sadness
4  i beleive that i am much more sensitive to oth...     love


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Split data
X = df['text']
y = df['label'] # Assuming 'label' column contains emotion labels (0-5)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vec, y_train)


# Predict and evaluate
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.9025455243396272
              precision    recall  f1-score   support

           0       0.94      0.94      0.94     24504
           1       0.92      0.94      0.93     28247
           2       0.81      0.77      0.79      6853
           3       0.90      0.90      0.90     11339
           4       0.85      0.86      0.86      9376
           5       0.82      0.71      0.76      3043

    accuracy                           0.90     83362
   macro avg       0.87      0.85      0.86     83362
weighted avg       0.90      0.90      0.90     83362



In [7]:
import joblib

# Save the Logistic Regression model
joblib.dump(model, 'text_emotion_model.pkl')

# Save the TfidfVectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [8]:
import joblib

# Load the Logistic Regression model
model = joblib.load('text_emotion_model.pkl')

# Load the TfidfVectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [7]:
# Get user input
user_input = input("Enter a text: ")

# Vectorize the user input
user_input_vec = vectorizer.transform([user_input])

# Predict the emotion
predicted_label = model.predict(user_input_vec)[0]

# Map the predicted label to the emotion name
predicted_emotion = emotion_map[predicted_label]

# Print the predicted emotion
print(f"Predicted emotion: {predicted_emotion}")

Enter a text: i am very happy
Predicted emotion: joy
