# Setup

In [None]:
%pip install groq

Collecting groq
  Downloading groq-0.13.0-py3-none-any.whl.metadata (13 kB)
Downloading groq-0.13.0-py3-none-any.whl (108 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/108.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.13.0


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably the directory stored in DIR_PATH lives on this
# mount — confirm against the secret's value.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Dependencies

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, TFBertModel
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input

from sklearn.metrics import classification_report
import keras

## Data Loading

In [None]:
from google.colab import userdata

In [None]:
# Base directory for the dataset CSV and the saved topic model, read from the
# Colab secret named 'DIR_PATH' so the path is not hardcoded in the notebook.
DIR_PATH = userdata.get('DIR_PATH')

In [None]:
# Load the dataset and shift `topic_category` down by one
# (presumably converting 1-based labels to 0-based class indices — confirm).
data = (
    pd.read_csv(DIR_PATH + "/seq2seq_data.csv")
    .assign(topic_category=lambda frame: frame["topic_category"] - 1)
)
data

Unnamed: 0,topic_category,original_text,base_word_text
0,8.0,what makes friendship click?,what make friendship click
1,1.0,why does zebras have stripes?,why zebra stripe
2,3.0,what did the itsy bitsy sipder climb up?,what itsy bitsy sipder climb up
3,3.0,what is the difference between a bachelors and...,what difference between bachelor and master de...
4,2.0,why do women get pms?,why woman get pm
...,...,...,...
174712,8.0,imperative: tell me what guys only guys must do!,tell me what guy only guy must
174713,8.0,tell me the story of any fantasy figure i'd ch...,tell me story of any fantasy figure i d choose
174714,7.0,imperative: reveal a secret about life.,reveal secret about life
174715,5.0,imperative: demande à domenech ce qu'il en est...,demande à domenech ce quil en est de son méti...


# Inference

## Assembler

In [None]:
from groq import Groq

# The Groq API key is read from the Colab secret store, not hardcoded.
API_KEY = userdata.get('GROQ_API_KEY')

# Identifier of the chat model requested from Groq for sentence assembly.
MODEL_ID = "gemma2-9b-it"

# One shared client instance; it is handed to the pipeline as `groq_client`.
client = Groq(api_key=API_KEY)

def generate_sentence(base_word, groq_model, groq_client):
  """Ask a Groq chat model to assemble base words into one coherent sentence.

  Args:
      base_word (str): Base words, in the order they must appear in the sentence.
      groq_model (str): Model identifier forwarded as the `model` argument.
      groq_client: Client object exposing `chat.completions.create(...)`.

  Returns:
      str: Content of the first completion choice with leading/trailing
      whitespace removed (the model tends to append a space and newline).
      If the reply has no content, that empty/None value is returned unchanged.
  """
  system_prompt = "You are a linguist that try to make a full coherent sentence out of list of basic words."
  # The indentation of the continuation lines is part of the prompt text that
  # is sent to the model, so it is kept exactly as-is.
  user_prompt = f"""Please create a full coherent sentence out of this words,
                            the order of the word that appear in the sentence have to be in sequence just like how the word appeared in the list.
                            Please just output the word without any other text. You can add punctuation to the generated sentence to make the sentence more natural.
                            Here are the words: [{base_word}]"""

  chat_completion = groq_client.chat.completions.create(
      messages=[
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_prompt},
      ],
      model=groq_model,
      temperature=0.8,
      max_tokens=1024,
      top_p=1,
      stop=None,
      stream=False,
  )

  reply = chat_completion.choices[0].message.content
  # Strip the trailing space/newline the model emits so downstream printing
  # and classification receive a clean sentence.
  return reply.strip() if reply else reply

## Topic Classification

In [None]:
def predict_topic(text, model, tokenizer, max_length=128):
    """
    Predict the topic category name for a single text input.

    Args:
        text (str): Input text to classify
        model (BertForSequenceClassification): Trained BERT model
        tokenizer (BertTokenizer): Tokenizer used during training
        max_length (int): Maximum sequence length; the input is padded or
            truncated to exactly this many tokens

    Returns:
        str: Name of the predicted topic category (the entry of the
        `categories` list at the arg-max class index), not the integer index.
    """
    # Index position must line up with the class ids the model was trained on.
    categories = [
        "Society & Culture",
        "Science & Mathematics",
        "Health",
        "Education & Reference",
        "Computers & Internet",
        "Sports",
        "Business & Finance",
        "Entertainment & Music",
        "Family & Relationships",
        "Politics & Government",
    ]

    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Encode the text. Calling the tokenizer directly replaces the deprecated
    # `encode_plus` method and yields the same fields for a single string.
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1)

    return categories[prediction.item()]

In [None]:
# Directory the topic model and its tokenizer are loaded from; it is the same
# directory as DIR_PATH.
save_topic_directory = DIR_PATH

In [None]:
# Load the BERT sequence-classification model and its tokenizer from the same
# directory so the vocabulary matches the model weights.
topic_model = BertForSequenceClassification.from_pretrained(save_topic_directory)
topic_tokenizer = BertTokenizer.from_pretrained(save_topic_directory)

In [None]:
# Sample question used to smoke-test the topic classifier.
test_text = "what makes friendship click"

In [None]:
# Smoke test: displays the category name predicted for `test_text`.
predict_topic(test_text, topic_model, topic_tokenizer)

'Family & Relationships'

## Intent Classification


In [None]:
# Load the saved Keras intent classifier.
# NOTE(review): the path is relative to the current working directory, unlike
# the topic model which is loaded from DIR_PATH — confirm the file location.
intent_classification_model = keras.models.load_model("cnn_model.keras")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer, tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json

# Load the tokenizer from the JSON file
# NOTE(review): the parsed value is passed straight to tokenizer_from_json,
# so the file presumably stores the output of Tokenizer.to_json() wrapped with
# json.dump (json.load then yields the JSON string) — verify against the
# notebook that saved it.
with open('cnn_tokenizer.json', 'r') as json_file:
    tokenizer_json = json.load(json_file)

# Rebuild the Keras Tokenizer used by predict_intent.
intent_tokenizer = tokenizer_from_json(tokenizer_json)


def predict_intent(text, model, tokenizer, max_length=100):
  """Classify the sentence type of a single text with the Keras model.

  The text is turned into a token-id sequence, padded to `max_length`, and
  scored by `model`; the label of the highest-scoring class is returned.

  Args:
      text (str): Sentence to classify.
      model: Model exposing `predict(batch)`.
      tokenizer: Tokenizer exposing `texts_to_sequences(list_of_texts)`.
      max_length (int): Sequence length passed to `pad_sequences` as `maxlen`.

  Returns:
      str: One of "declarative", "imperative" or "interrogative".
  """
  intent_names = {0: "declarative", 1: "imperative", 2: "interrogative"}

  # texts_to_sequences works on a batch, so the single text is wrapped in a list.
  batch = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_length)
  scores = model.predict(batch)

  # Only one row was submitted, so take the arg-max of the first row.
  best_class = np.argmax(scores, axis=-1)[0]
  return intent_names[best_class]

In [None]:
# Smoke test: displays the intent label predicted for a sample question.
predict_intent("why is her dance so beautiful", intent_classification_model, intent_tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step


'interrogative'

## Pipeline

In [None]:
def run_pipeline(base_words, groq_model, groq_client, topic_model, topic_tokenizer, intent_classification_model, intent_tokenizer):
  """Run the full demo: assemble a sentence, then classify its topic and intent.

  Each stage's result is printed as soon as it is available; nothing is
  returned.

  Args:
      base_words (str): Base words handed to `generate_sentence`.
      groq_model: Model identifier for the Groq client.
      groq_client: Groq client used for sentence generation.
      topic_model, topic_tokenizer: Passed to `predict_topic`.
      intent_classification_model, intent_tokenizer: Passed to `predict_intent`.
  """
  generated = generate_sentence(base_words, groq_model, groq_client)
  print("Sentence:", generated)

  # Both classifiers receive the generated sentence, not the raw base words.
  topic_label = predict_topic(generated, topic_model, topic_tokenizer)
  print("Topic:", topic_label)

  intent_label = predict_intent(generated, intent_classification_model, intent_tokenizer)
  print("Intent:", intent_label)
  return None

In [None]:
# Draw a reproducible 30-row sample from the loaded dataset. The frame is
# named `data` in this notebook; `df` was never defined and raised NameError
# on a fresh kernel.
sample_data = data.sample(n=30, random_state=42)

# Hand-written example run through the whole pipeline.
base_words = "government decision bad citizen"

run_pipeline(base_words, MODEL_ID, client, topic_model, topic_tokenizer, intent_classification_model, intent_tokenizer)

Sentence: The government's decision is bad for citizens. 

Topic: Politics & Government
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Intent: declarative
