In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import set_config
import matplotlib.pyplot as plt

In [5]:
# Common English function words removed from every document. Built once when
# the cell runs instead of being re-created on each call.
_BASIC_STOPWORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for',
    'from', 'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on',
    'that', 'the', 'to', 'was', 'were', 'will', 'with',
})

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalize a raw text snippet for vectorization.

    The text is lower-cased, every character other than ASCII letters and
    whitespace is deleted (so "I'm" becomes "im"), the result is split on
    whitespace, and tokens found in the basic stopword list are dropped.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for
            empty CSV cells) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left.
    """
    # `text != text` is only true for NaN, so no numpy/pandas call is needed.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    letters_only = _NON_LETTER_RE.sub('', text.lower())
    return ' '.join(
        token for token in letters_only.split()
        if token not in _BASIC_STOPWORDS
    )

In [6]:
# Announce the start of the load + clean step.
print("Loading and preprocessing data...")

# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere until this points at a local copy of the CSV.
data = pd.read_csv(
    '/Users/kejnain/Downloads/Semiotics 4/Clean semiotics/data/goemotions.csv'
)

# Model input: the 'text' column after cleaning with preprocess_text.
features = data['text'].apply(preprocess_text)

# Model output: every column from position 9 onwards.
# Presumably one indicator column per emotion; verify against the CSV header.
targets = data.iloc[:, 9:]

# Hold out 20% of the rows for testing. The split is stratified on the first
# target column only, not on the full multi-label combination.
first_target_column = targets.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=first_target_column,
)

Loading and preprocessing data...


In [7]:
# Step 1: turn cleaned text into TF-IDF features over unigrams and bigrams.
# Terms must appear in at least 2 documents and in at most 95% of them.
tfidf_step = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Step 2: scale features without centering them (with_mean=False).
scaler_step = StandardScaler(with_mean=False)

# Step 3: a random forest, wrapped below so it can handle several target
# columns at once.
forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('scaler', scaler_step),
    ('classifier', MultiOutputClassifier(forest)),
])

# Render estimators as an interactive diagram, then display the pipeline.
set_config(display='diagram')
pipeline

In [8]:
print("Setting up GridSearchCV...")

# Candidate values for the search. Keys follow the '<step>__<parameter>'
# convention; 'classifier__estimator__' reaches the forest inside the
# multi-output wrapper.
param_grid = {
    # Vectorizer settings.
    'tfidf__max_features': [10000, 15000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    # Forest size and depth.
    'classifier__estimator__n_estimators': [100, 200],
    'classifier__estimator__max_depth': [15, 20],
    # Forest split / leaf constraints.
    'classifier__estimator__min_samples_split': [2, 5],
    'classifier__estimator__min_samples_leaf': [1, 2]
}

# Two options for each of six parameters gives 64 combinations, each
# evaluated with 3-fold cross-validation. error_score='raise' makes a
# failing fit abort the search instead of being scored as NaN.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

Setting up GridSearchCV...


In [9]:
print("Training the model...")
# Runs the full grid search on the training split. fit() returns the fitted
# search object, so `history` refers to the same object as `grid_search`.
history = grid_search.fit(X=X_train, y=y_train)

Training the model...
Fitting 3 folds for each of 64 candidates, totalling 192 fits


In [1]:
# One row per parameter combination tried by the grid search.
results = pd.DataFrame(grid_search.cv_results_)

# Mean cross-validated accuracy, in the order the combinations were tried.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(results['mean_test_score'], label='Mean Test Accuracy')
ax.set_xlabel('Parameter Combination Index')
ax.set_ylabel('Mean Accuracy')
ax.set_title('GridSearchCV Mean Test Accuracy')
ax.legend()
plt.show()

NameError: name 'pd' is not defined

In [None]:
print("\nBest parameters:", grid_search.best_params_)

# Predict on the held-out split with the search's best estimator.
print("Making predictions...")
y_pred = grid_search.predict(X_test)

# Report across all target columns, labelled with the target column names.
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, target_names=targets.columns)
print(report_text)

# Accuracy computed over the full multi-column label matrix.
overall_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy Score:", overall_accuracy)

# Accuracy for each target column taken on its own.
print("\nPer-emotion accuracy:")
per_emotion_accuracy = {}
for column_position, emotion in enumerate(targets.columns):
    true_column = y_test.iloc[:, column_position]
    predicted_column = y_pred[:, column_position]
    per_emotion_accuracy[emotion] = accuracy_score(true_column, predicted_column)
    print(f"{emotion}: {per_emotion_accuracy[emotion]:.4f}")

In [None]:
print("\nSaving model...")
# Persist the best pipeline found by the search (vectorizer, scaler and
# classifier together) to the current working directory.
with open('sentiment_model.pkl', 'wb') as pickle_out:
    best_pipeline = grid_search.best_estimator_
    pickle.dump(best_pipeline, pickle_out)

In [None]:
def predict_emotions(text, model, emotion_labels=None):
    """Return the emotion labels that `model` predicts for one piece of text.

    The text is cleaned with `preprocess_text`, passed to `model.predict` as
    a single-element batch, and every label whose predicted value equals 1
    is collected.

    Args:
        text: Raw input text.
        model: Fitted estimator whose `predict` accepts a list of strings and
            returns one row of per-label values per input.
        emotion_labels: Label names in the same order as the model's outputs.
            Defaults to the notebook-level `targets.columns`. Pass it
            explicitly when using a model reloaded in a session where
            `targets` does not exist.

    Returns:
        List of label names predicted for the text, in label order.

    Raises:
        ValueError: If the number of labels differs from the number of values
            the model predicted.
    """
    if emotion_labels is None:
        # Fall back to the training-time target columns defined earlier in
        # the notebook.
        emotion_labels = targets.columns

    processed_text = preprocess_text(text)
    predicted_row = model.predict([processed_text])[0]

    # A mismatch would silently mislabel or drop predictions, so fail loudly.
    if len(emotion_labels) != len(predicted_row):
        raise ValueError(
            f"Expected {len(predicted_row)} emotion labels, "
            f"got {len(emotion_labels)}"
        )

    return [
        emotion
        for emotion, flag in zip(emotion_labels, predicted_row)
        if flag == 1
    ]



In [None]:
print("\nTesting model with sample text...")

# Smoke test: run one hand-written sentence through the best pipeline found
# by the grid search.
sample_text = "I'm so sad today how unfortunate"
best_model = grid_search.best_estimator_
emotions = predict_emotions(sample_text, best_model)

# Show the input and the labels predicted for it, one per line.
print(
    f"Sample text: {sample_text}",
    f"Predicted emotions: {emotions}",
    sep="\n",
)

print("\nModel training and evaluation completed successfully!")