In [None]:
pip install SpeechRecognition PyAudio

In [None]:
pip install keyboard

In [9]:
import pyaudio
import wave
import speech_recognition as sr

# Audio settings
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK = 1024
WAVE_OUTPUT_FILENAME = "recording.wav"

def record_audio():
    """Record from the default input device until interrupted, then save a WAV file.

    Audio is captured with the module-level FORMAT / CHANNELS / RATE / CHUNK
    settings and written to WAVE_OUTPUT_FILENAME.  Recording stops when a
    KeyboardInterrupt is delivered (Ctrl+C in a terminal, "Interrupt kernel"
    in Jupyter); the interrupt is swallowed so the caller can continue with
    the saved file.

    Whatever was captured is written out even if the read loop fails with
    another exception; that exception still propagates after the file is saved.
    """
    audio = pyaudio.PyAudio()
    try:
        # Ask for the sample width while the PyAudio session is still open,
        # instead of calling a method on an already-terminated instance.
        sample_width = audio.get_sample_size(FORMAT)
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)
        frames = []
        try:
            # The loop below only ends on KeyboardInterrupt, so say so.
            print("Recording... Interrupt the kernel (Ctrl+C) to stop.")
            while True:
                frames.append(stream.read(CHUNK))
        except KeyboardInterrupt:
            # Expected way to stop recording; not an error.
            pass
        finally:
            stream.stop_stream()
            stream.close()

            # Save audio file; the context manager closes it even if a write fails.
            with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(sample_width)
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            print("Recording saved as", WAVE_OUTPUT_FILENAME)
    finally:
        # Always release PortAudio, including when opening the stream failed.
        audio.terminate()

def transcribe_audio(filename):
    """Transcribe an audio file with the Google Web Speech recognizer.

    Args:
        filename: Path to an audio file readable by ``sr.AudioFile``.

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition request failed (a message is printed
        in either case).
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(filename) as source:
        audio_data = recognizer.record(source)

    # Bind the result up front so the failure paths return None instead of
    # raising UnboundLocalError at the final `return`.
    text = None
    try:
        text = recognizer.recognize_google(audio_data)
        print("Transcription:", text)
    except sr.UnknownValueError:
        print("Could not understand audio")
    except sr.RequestError as e:
        print("API request error:", e)
    return text

# Entry point for this cell: wait for the user, record to WAVE_OUTPUT_FILENAME,
# then transcribe that file.
# NOTE(review): in a notebook kernel __name__ is presumably always "__main__",
# so this guard runs on every execution of the cell — confirm.
if __name__ == "__main__":
    input("Press Enter to start recording...")
    try:
        record_audio()
    except KeyboardInterrupt:
        # record_audio() already catches KeyboardInterrupt around its read
        # loop; this outer handler covers an interrupt raised elsewhere in it.
        pass
    # `text` is read again further down by scorer.score_sentence(text).
    text = transcribe_audio(WAVE_OUTPUT_FILENAME)




Recording... Press Enter to stop.
Recording saved as recording.wav
Transcription: ok so let's use some identifiable words so that it can match calm fearful scared annoyed irritated as irritated that enough


In [3]:
from tensorflow import keras
import joblib

# Load trained model (path is relative to the notebook's working directory).
# `model` is used by the prediction cell below via model.predict(...).
model = keras.models.load_model("../models/speech_emotion_recognition_model.h5")

# Load the label encoder you fit during training; it is used below to map the
# predicted class index back to a label with inverse_transform.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load encoder files from a trusted source.
label_encoder = joblib.load("../models/speech_label_encoder.pkl")


In [10]:
import librosa
import numpy as np

def preprocess_audio(file_path, max_len=174, n_mfcc=40, sample_rate=None):
    """Turn an audio file into a fixed-size MFCC array with a leading batch axis.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.
        max_len: Number of frames kept along the second MFCC axis; shorter
            inputs are zero-padded at the end, longer ones are truncated.
        n_mfcc: Number of MFCC coefficients to extract.
        sample_rate: Target sampling rate passed to ``librosa.load``.  The
            default ``None`` keeps the file's native rate (no resampling).
            NOTE(review): if the model was trained on audio loaded at a fixed
            rate, pass that rate here — confirm against the training code.

    Returns:
        ``numpy.ndarray`` of shape ``(1, n_mfcc, max_len)``.
    """
    # Local names deliberately avoid `sr`, which elsewhere in this notebook is
    # the alias of the speech_recognition module.
    # 'kaiser_fast' is the valid resampler name (the previous 'kaiser_fase'
    # would fail as soon as resampling was actually requested).
    signal, rate = librosa.load(file_path, sr=sample_rate, res_type='kaiser_fast')

    # Extract MFCCs (same n_mfcc as training)
    mfccs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)

    # Pad or truncate the frame axis to a fixed length
    n_frames = mfccs.shape[1]
    if n_frames < max_len:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, max_len - n_frames)), mode="constant")
    else:
        mfccs = mfccs[:, :max_len]

    # Add batch dimension
    return np.expand_dims(mfccs, axis=0)


In [11]:
# Preprocess new audio file (the recording written by record_audio above)
X_new = preprocess_audio("recording.wav")

# Predict probabilities
# NOTE(review): assumes the model accepts the (1, 40, 174) array produced by
# preprocess_audio and returns one row of class scores per sample — confirm.
y_pred = model.predict(X_new)

# Get class index: argmax over axis 1, then take the only row in the batch.
# `pred_class` is decoded to a label in the next cell.
pred_class = np.argmax(y_pred, axis=1)[0]




In [12]:
# Convert index back to label: inverse_transform takes a sequence, so wrap the
# single index in a list and unwrap the single result.
pred_label = label_encoder.inverse_transform([pred_class])[0]

print("Predicted class:", pred_label)


Predicted class: calm


In [13]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer


class RobertaAnchorScorer:
    """Score a sentence against a hypo / hyper / flow anchor-word lexicon.

    Each alphabetic, non-stopword token of a sentence is embedded and compared
    (cosine similarity) with every lexicon word.  Every (token, anchor) pair
    whose similarity reaches ``threshold`` adds the anchor's state weights,
    scaled by the similarity, to the running state scores.

    Despite the class name, embeddings come from the SentenceTransformer model
    ``all-MiniLM-L6-v2``, not from RoBERTa.

    Attributes:
        lexicon: ``{word: {"hypo": float, "hyper": float, "flow": float}}``.
        anchor_words: Lexicon words, in lexicon order.
        anchor_embs: Embeddings of ``anchor_words`` (one row per word).
        model: The SentenceTransformer used for encoding.
        threshold: Minimum cosine similarity for a token/anchor match.
        cache: ``{text: embedding}`` memo shared by anchors and tokens.
    """

    _STATES = ("hypo", "hyper", "flow")

    def __init__(self, lexicon_csv_path, threshold=0.7):
        """Load the lexicon CSV and precompute anchor embeddings.

        Args:
            lexicon_csv_path: CSV with a ``word`` column and optional ``hypo``,
                ``hyper`` and ``flow`` weight columns (missing or empty
                weights count as 0.0).
            threshold: Minimum cosine similarity for a match.

        Raises:
            KeyError: If the CSV has rows but no ``word`` column.
        """
        # Load lexicon in format: word,hypo,hyper,flow
        df = pd.read_csv(lexicon_csv_path)
        self.lexicon = {}
        for record in df.to_dict("records"):
            word = record["word"]
            # Empty cells are read as NaN (a float); skip them instead of
            # crashing on `.lower()`.
            if not isinstance(word, str) or not word.strip():
                continue
            self.lexicon[word.lower().strip()] = {
                state: self._as_weight(record.get(state, 0.0))
                for state in self._STATES
            }

        # Consider all words in lexicon
        self.anchor_words = list(self.lexicon.keys())

        # Load SentenceTransformer (better semantic embeddings than raw RoBERTa)
        self.model = SentenceTransformer("all-MiniLM-L6-v2")
        self.threshold = threshold

        # Cache for embeddings
        self.cache = {}

        # Stopword set, filled on first use by _stopword_set()
        self._stopwords = None

        # Precompute embeddings for all anchor words
        self.anchor_embs = self._encode_texts(self.anchor_words)

    @staticmethod
    def _as_weight(value):
        """Convert a lexicon cell to float, treating missing (NaN) cells as 0.0."""
        return 0.0 if pd.isna(value) else float(value)

    def _stopword_set(self):
        """Return the English stopword set, building it once per instance."""
        cached = getattr(self, "_stopwords", None)
        if cached is None:
            cached = set(stopwords.words("english"))
            self._stopwords = cached
        return cached

    def _encode_texts(self, texts):
        """Return embeddings for ``texts`` (same order), encoding only uncached ones.

        Repeated texts are sent to the model once; results are memoized in
        ``self.cache``.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order.
        missing = list(dict.fromkeys(t for t in texts if t not in self.cache))
        if missing:
            embs = self.model.encode(missing, convert_to_numpy=True)
            for t, e in zip(missing, embs):
                self.cache[t] = e
        return np.array([self.cache[t] for t in texts])

    def score_sentence(self, sentence):
        """Score one sentence and report the state mix.

        Args:
            sentence: Text to score.  A non-string value (e.g. ``None`` from a
                failed transcription) is treated as having no tokens.

        Returns:
            dict with keys ``"percentages"`` (state -> share of the total
            score, rounded to 2 decimals), ``"dominant"`` (state with the
            largest share, or ``None``), ``"label"`` (human-readable summary)
            and ``"matched"`` (list of ``{"token", "matched_anchor",
            "similarity"}`` dicts, one per match; repeated tokens produce
            repeated matches).
        """
        if isinstance(sentence, str):
            stop_set = self._stopword_set()
            tokens = [t for t in word_tokenize(sentence.lower())
                      if t.isalpha() and t not in stop_set]
        else:
            tokens = []

        if not tokens:
            return {"percentages": {"hypo": 0, "hyper": 0, "flow": 0},
                    "dominant": None,
                    "label": "No valid tokens",
                    "matched": []}

        scores = {"hypo": 0, "hyper": 0, "flow": 0}
        matched = []

        # With an empty lexicon there is nothing to compare against (and the
        # empty anchor matrix is not a valid cosine_similarity input).
        if self.anchor_words:
            token_embs = self._encode_texts(tokens)

            # Compare every token with ALL anchors in a single call:
            # row i holds the similarities of tokens[i] to each anchor.
            sim_matrix = cosine_similarity(token_embs, self.anchor_embs)
            for tok, sims in zip(tokens, sim_matrix):
                for idx, sim in enumerate(sims):
                    if sim >= self.threshold:
                        anchor = self.anchor_words[idx]
                        contrib = self.lexicon[anchor]
                        for state, val in contrib.items():
                            scores[state] += val * sim  # weight by similarity
                        matched.append({
                            "token": tok,
                            "matched_anchor": anchor,
                            "similarity": float(sim)
                        })

        # Normalize into percentages
        total = sum(scores.values())
        if total == 0:
            percentages = {s: 0 for s in scores}
            dominant = None
            label = "No relevant keywords found"
        else:
            percentages = {s: round(v / total * 100, 2) for s, v in scores.items()}
            dominant = max(percentages, key=percentages.get)
            label = (f"Mixed: {percentages['hypo']}% hypo, "
                     f"{percentages['hyper']}% hyper, "
                     f"{percentages['flow']}% flow "
                     f"(dominant: {dominant})")

        return {
            "percentages": percentages,
            "dominant": dominant,
            "label": label,
            "matched": matched
        }


In [None]:
pip install --upgrade torch torchvision torchaudio

In [None]:
!pip install -U sentence-transformers


In [None]:
pip install --upgrade transformers


In [14]:
# Build the scorer from the lexicon CSV (path relative to the working directory).
scorer = RobertaAnchorScorer("../data/processed/lexicon_for_sentences.csv")

# `text` is the transcription assigned in the recording cell above, so that
# cell must have been run first.
result = scorer.score_sentence(text)

# Pretty printing
print("\n=== Emotion State Analysis ===")
print(f"Hypoarousal:   {result['percentages']['hypo']}%")
print(f"Hyperarousal: {result['percentages']['hyper']}%")
print(f"Flow:         {result['percentages']['flow']}%")
print(f"Dominant State: {result['dominant']}")
print("\n--- Matched Words ---")
# One line per (token, anchor) pair that met the scorer's similarity threshold.
for m in result["matched"]:
    print(f"Token: {m['token']:>10} | Anchor: {m['matched_anchor']:>12} | Similarity: {m['similarity']:.3f}")



=== Emotion State Analysis ===
Hypoarousal:   0.0%
Hyperarousal: 88.54%
Flow:         11.46%
Dominant State: hyper

--- Matched Words ---
Token:       calm | Anchor:         calm | Similarity: 1.000
Token:    fearful | Anchor:      anxious | Similarity: 0.702
Token:    fearful | Anchor:      fearful | Similarity: 1.000
Token:     scared | Anchor:      fearful | Similarity: 0.765
Token:    annoyed | Anchor:      annoyed | Similarity: 1.000
Token:    annoyed | Anchor:    irritated | Similarity: 0.779
Token:  irritated | Anchor:      annoyed | Similarity: 0.779
Token:  irritated | Anchor:    irritated | Similarity: 1.000
Token:  irritated | Anchor:      annoyed | Similarity: 0.779
Token:  irritated | Anchor:    irritated | Similarity: 1.000
