# Install necessary libraries

In [None]:
%pip install -q -U bitsandbytes
%pip install -q -U transformers
%pip install -q -U accelerate

In [None]:
%pip install psycopg2-binary

## Connect to the Postgresql database

In [None]:
import psycopg2
import os
import sys
def connect():
  """ Connect to database """
  conn = None
  try:
    print("Connecting…")
    conn = psycopg2.connect(
                  host="ep-delicate-river-a5cq94ee-pooler.us-east-2.aws.neon.tech",
                  database="Vetassist",
                  user="niphemi.oyewole",
                  password="W7bHIgaN1ejh")
  except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
  print("All good, Connection successful!")
  return conn

In [None]:
def sql_to_dataframe(conn, query, column_names):
   """
   Import data from a PostgreSQL database using a SELECT query
   """
   cursor = conn.cursor()
   try:
      cursor.execute(query)
   except (Exception, psycopg2.DatabaseError) as error:
      print("Error: %s" % error)
      cursor.close()
      return 1
   # The execute returns a list of tuples:
   tuples_list = cursor.fetchall()
   cursor.close()
   # Now we need to transform the list into a pandas DataFrame:
   df = pd.DataFrame(tuples_list, columns=column_names)
   return df

In [None]:
import pandas as pd
#creating a query variable to store our query to pass into the function
query = """ SELECT *
            FROM reddit_usernames_comments
        """
#creating a list with columns names to pass into the function
column_names = ["username", "comments"]
#opening the connection
conn = connect()
#loading our dataframe
df = sql_to_dataframe(conn, query, column_names)
#closing the connection
conn.close()
# Let’s see if we loaded the df successfully
df.head()

In [2]:
training_data = df.sample(frac=0.7)

In [3]:
df[~df.isin(training_data)].dropna(ignore_index=True)

Unnamed: 0,username,comments,output,classified_label
0,tikitessie,He makes us laugh every day :) favorite coworker,"Other. Explanation: This comment was labeled ""...",Other
1,theophania808,I've worked in the vet industry for years and ...,Veterinarian. Explanation: This comment was la...,Veterinarian
2,paxbanana0,"Those are long, probably stressful days at wor...",Medical Doctor or Veterinarian:\n\n This comm...,Medical Doctor
3,drawntage,"Hi, LVT here. Are they nitpicking or are they ...",Veterinarian. Explanation: This comment was la...,Veterinarian
4,Belikus,But does it mean that something is wrong with ...,"Other. Explanation: This comment was labeled ""...",Other
...,...,...,...,...
978,Specialist-Maize-666,This is what happened at my clinic.. I was the...,"Other. Explanation: This comment was labeled ""...",Other
979,Forgottenpassword7,I’m a rep who lurks this sub and has worked wi...,"Other. Explanation: This comment was labeled ""...",Other
980,Daktari2018,Good for you for sticking to standards of care...,"Other. Explanation: This comment was labeled ""...",Other
981,Sheepb1,"Yes feel free to ask someone to double check, ...","Other. Explanation: This comment was labeled ""...",Other


In [11]:
training_data["classified_label"].shape

(2293,)

## Clean comments column
Found out some of the comments was repeated in the string.

In [None]:
def clean_comments(comment: str):
  list_comments = comment.split("|")
  unique_comments = list(dict.fromkeys(list_comments))

  return "".join(unique_comments)

In [None]:
df["comments"] = df["comments"].apply(clean_comments)

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
import gc

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
)

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
#Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id =  tokenizer.unk_token_id
tokenizer.padding_side = 'left'


In [None]:
#Quantization configuration
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
)

In [None]:
#Load the model and quantize it on the fly
model = AutoModelForCausalLM.from_pretrained(
          model_name, quantization_config=bnb_config, device_map="auto"
)

In [None]:
from transformers import GenerationConfig

def generate(instruction):
    prompt = "[INST] "+instruction+" [/INST]\n"
#     inputs = tokenizer(prompt, return_tensors="pt")
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
            input_ids=input_ids,
            generation_config=GenerationConfig(pad_token_id=tokenizer.pad_token_id, temperature=1.0, top_p=1.0, top_k=50, num_beams=1),
            # return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=32
    )
    del input_ids
    decoded = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
    return decoded[0].split("[/INST]")[-1].strip().replace("</s>", "")


In [None]:
def generate_label(comment: str):
  gc.collect()
  torch.cuda.empty_cache()
  prompt = f"""You will act as an expert in professional identification based on textual data. Your task is to determine the profession (label) of the author of the reddit comment.
      The labels are Medical Doctor or Veterinarian, if you cannot determine the profession of the author, return Other.
      Use the following guidelines to assign the correct label:
      - Medical Doctor
          This label should only include practicing doctors
      - Veterinarian
          This label should only include practicing vets
      - Other
          This label should include Vet students or vet techs.
          This label should include medical school students or nurses or medical professionals who aren’t doctors.
          When it is not possible to determine the profession of the author based on the provided comment.
      So for example,
      Reddit Comment:
      Female, Kentucky.  4 years out. Work equine only private practice. Base salary $85k plus bonuses/production which was $20k 2023. 6 days a week Jan-June/July then variable in the off season. No limit on PTO - took ~5 weeks last year. One paid conference a year (registration/travel/ 1/2 hotel/ transportation) or online CE program. All licensures & professional group fees covered. Cell phone allowance and mileage reimbursement.
      Label:[/INST]
      Veterinarian. Explanation: This comment was labeled "Veterinarian" because it specifically mentions working in a private practice with equines, indicating the author is a practicing veterinarian.</s>
      [INST] Reddit Comment: I tried that to start with but I still couldn’t reach the web-app running on 127.0.0.1 ---> /r/MysteriumNetwork/comments/vadoe5/proxy_127001_to_client_program/ic2phpd/
      Label:[/INST]
      Other. Explanation: This comment was labeled "Other" as it does not provide any information related to medical or veterinary practice, making it impossible to determine the profession of the author.</s>
      [INST] Reddit Comment: "There's a point where the knot won't tighten any further and a double ligate won't make a difference if the first is well put. It's a rule, not a norm. I usually ligate the ovary artery with a single because it's sturdy enough to withstand the pressure of the knot; I double ligate (not always) the uterus since that tissue can be very fragile and the first knot is just to be able to tighten properly the second ligate. It's things you pick up as you go, but do what makes you the most comfortable."
      Label:[/INST]
      Medical Doctor. Explanation: This comment was labeled "Medical Doctor" because it discusses specific surgical techniques related to human anatomy, indicating the author is a practicing medical doctor.
      Reddit Comment: "{comment}"
      Label:
  """
  output = generate(prompt)

  return output

In [None]:
df.loc[1500:, "output"] = df["comments"][1500:].apply(generate_label)

# Tried out Batch inferencing but got Cuda error.
I tried out different values for the batch size but still got cuda (out of memory) error

In [None]:
def add_prompt(comment: str):
    prompt = f"""You will act as an expert in professional identification based on textual data. Your task is to determine the profession (label) of the author of the reddit comment.
      The labels are Medical Doctor or Veterinarian, if you cannot determine the profession of the author, return Other.
      Use the following guidelines to assign the correct label:
      - Medical Doctor
          This label should only include practicing doctors
      - Veterinarian
          This label should only include practicing vets
      - Other
          This label should include Vet students or vet techs.
          This label should include medical school students or nurses or medical professionals who aren’t doctors.
          When it is not possible to determine the profession of the author based on the provided comment.
      So for example,
      Reddit Comment:
      Female, Kentucky.  4 years out. Work equine only private practice. Base salary $85k plus bonuses/production which was $20k 2023. 6 days a week Jan-June/July then variable in the off season. No limit on PTO - took ~5 weeks last year. One paid conference a year (registration/travel/ 1/2 hotel/ transportation) or online CE program. All licensures & professional group fees covered. Cell phone allowance and mileage reimbursement.
      Label:[/INST]
      Veterinarian. Explanation: This comment was labeled "Veterinarian" because it specifically mentions working in a private practice with equines, indicating the author is a practicing veterinarian.</s>
      [INST] Reddit Comment: I tried that to start with but I still couldn’t reach the web-app running on 127.0.0.1 ---> /r/MysteriumNetwork/comments/vadoe5/proxy_127001_to_client_program/ic2phpd/
      Label:[/INST]
      Other. Explanation: This comment was labeled "Other" as it does not provide any information related to medical or veterinary practice, making it impossible to determine the profession of the author.</s>
      [INST] Reddit Comment: "There's a point where the knot won't tighten any further and a double ligate won't make a difference if the first is well put. It's a rule, not a norm. I usually ligate the ovary artery with a single because it's sturdy enough to withstand the pressure of the knot; I double ligate (not always) the uterus since that tissue can be very fragile and the first knot is just to be able to tighten properly the second ligate. It's things you pick up as you go, but do what makes you the most comfortable."
      Label:[/INST]
      Medical Doctor. Explanation: This comment was labeled "Medical Doctor" because it discusses specific surgical techniques related to human anatomy, indicating the author is a practicing medical doctor.
      Reddit Comment: "{comment}"
      Label:
      """
    return "[INST] "+prompt+" [/INST]\n"

In [None]:
final_batch = []

In [None]:
def generate_list_from_dataframe(df):
    num_rows = len(df)
    BATCH_SIZE = 2
    for i in range(0, num_rows, BATCH_SIZE):
        # Slice DataFrame to get 10 rows
        subset_df = df.iloc[i:i+BATCH_SIZE]
        # Convert DataFrame subset to list
        subset_list = subset_df.tolist()
        yield subset_list

# Iterate over generated lists
for i, batch in enumerate(generate_list_from_dataframe(df["comments"])):  
    print(f"Batch {i}")
    gc.collect()
    torch.cuda.empty_cache()
    final_batch.extend(generate(batch))

# Cleaned out the output from the model using Regex

In [None]:
import re

def extract_profession(sentence):
    # Define the regex pattern to match "Medical Doctor", "Veterinarian", or "Other"
    pattern = r"(Medical Doctor|Veterinarian|Other)"

    # Search for the pattern in the sentence
    match = re.search(pattern, sentence)

    # If a match is found, return the matched profession, otherwise return Other
    if match:
        return match.group(0)
    else:
        return "Other"

# Example usage:
sentence = """'Other. Explanation: This comment was labeled "Other" as it does not provide any information related to medical or veterinary practice, making it impossible'"""
profession = extract_profession(sentence)
print("Extracted profession:", profession)

In [None]:
df.loc[:, "classified_label"] = df1.loc[:, "output"].apply(extract_profession)

In [None]:
df.to_csv("reddit_comments.csv", index=False)

# Train classifier model

In [1]:
import pandas as pd

df = pd.read_csv("/kaggle/working/reddit_comments.csv")

In [2]:
df.head()

Unnamed: 0,username,comments,output,classified_label
0,LoveAGoodTwist,"Female, Kentucky. 4 years out. Work equine on...",Veterinarian. Explanation: This comment was la...,Veterinarian
1,wahznooski,"As a woman of reproductive age, fuck Texas","Other. Explanation: This comment was labeled ""...",Other
2,Churro_The_fish_Girl,what makes you want to become a vet?,"Other. Explanation: This comment was labeled ""...",Other
3,abarthch,"I see of course there are changing variables, ...","Other. Explanation: This comment was labeled ""...",Other
4,VoodooKing,I have 412+ and faced issues because wireguard...,"Other. Explanation: This comment was labeled ""...",Other


In [3]:
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm
from typing import List
import torch
import torch.nn.functional as F

# Download Embedding model

In [4]:
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-en-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-en-v1.5')
model.eval()

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [5]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [30]:
def get_embeddings(texts: List[str], batch_size: int):
    all_embeddings = []
    print(f"Total number of records: {len(texts)}")
    print(f"Num batches: {(len(texts) // batch_size) + 1}")
    
    # Extract embeddings for the texts in batches
    for start_index in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start_index:start_index + batch_size]

        # Generate tokens and move input tensors to GPU
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            model_output = model(**inputs.to(device))
            # Perform pooling. In this case, cls pooling.
            sentence_embeddings = model_output[0][:, 0]

        # Get the last hidden stated and pool them into a mean vector calculated across the sequence length dimension
        # This will reduce the output vector from [batch_size, sequence_length, hidden_layer_size]
        # to [batch_size, hidden_layer_size] thereby generating the embeddings for all the sequences in the batch
        sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

        # Append to the embeddings list
        all_embeddings.extend(sentence_embeddings.tolist())

    return all_embeddings

In [None]:
from sklearn.model_selection import train_test_split
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != "classified_label"], df["classified_label"],
                                                    test_size=0.3,
                                                    stratify=df["classified_label"])

# Get embeddings for the training and test set
train_embeddings = get_embeddings(texts=X_train["comments"].tolist(), batch_size=256)
train_embeddings_df = pd.DataFrame(train_embeddings)

test_embeddings = get_embeddings(texts=X_test["comments"].tolist(), batch_size=256)
test_embeddings_df = pd.DataFrame(test_embeddings)

Total number of records: 2293
Num batches: 9


100%|██████████| 9/9 [03:54<00:00, 26.08s/it]


Total number of records: 983
Num batches: 4


 75%|███████▌  | 3/4 [01:19<00:26, 26.45s/it]

In [26]:
X_train.to_csv("train-data.csv", index=False)

In [32]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
encoded_y_train = le.fit_transform(y_train)
encoded_y_test = le.transform(y_test)

# xgboost model

In [33]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def train_model(data: pd.DataFrame, labels: pd.Series):
    if torch.cuda.is_available():
        boost_device = "cuda"
    else:
        boost_device = "cpu"

    # Initialize the XGBoost Classifier
    xgb_clf = xgb.XGBClassifier(objective="multi:softmax",
                                device=boost_device,
                                random_state=3137,
                               early_stopping_rounds=10,
                               eval_metric=accuracy_score)
    xgb_clf.fit(data, labels, eval_set=[(test_embeddings_df,encoded_y_test )])
    
    return xgb_clf

In [34]:
le_name_mapping = dict(zip(le.transform(le.classes_), le.classes_))

In [35]:
le_name_mapping

{0: 'Medical Doctor', 1: 'Other', 2: 'Veterinarian'}

In [36]:
from sklearn.metrics import classification_report

# Train model
xgb_model = train_model(data=train_embeddings_df, labels=encoded_y_train)

# Predict from model
y_pred = xgb_model.predict(test_embeddings_df)
y_pred_labels = [le_name_mapping[x] for x in y_pred]

# Evaluate model
y_test_labels = [le_name_mapping[x] for x in encoded_y_test]
print(f"Classification report:\n{classification_report(y_test_labels, y_pred_labels)}")

[0]	validation_0-mlogloss:0.92320	validation_0-accuracy_score:0.68973
[1]	validation_0-mlogloss:0.81544	validation_0-accuracy_score:0.72940
[2]	validation_0-mlogloss:0.74969	validation_0-accuracy_score:0.72635
[3]	validation_0-mlogloss:0.70277	validation_0-accuracy_score:0.73652
[4]	validation_0-mlogloss:0.67132	validation_0-accuracy_score:0.73855
[5]	validation_0-mlogloss:0.64500	validation_0-accuracy_score:0.74059
[6]	validation_0-mlogloss:0.62673	validation_0-accuracy_score:0.73550
[7]	validation_0-mlogloss:0.61763	validation_0-accuracy_score:0.73754
[8]	validation_0-mlogloss:0.60887	validation_0-accuracy_score:0.74568
[9]	validation_0-mlogloss:0.60577	validation_0-accuracy_score:0.74161
Classification report:
                precision    recall  f1-score   support

Medical Doctor       0.33      0.14      0.20        99
         Other       0.80      0.81      0.80       608
  Veterinarian       0.53      0.63      0.58       276

      accuracy                           0.69      

In [24]:
accuracy_score(encoded_y_test, y_pred)

0.7029501525940997

In [39]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(train_embeddings_df, encoded_y_train)
accuracy_score(encoded_y_test, clf.predict(test_embeddings_df))

# Predict from model
svc_y_pred = clf.predict(test_embeddings_df)
svc_y_pred_labels = [le_name_mapping[x] for x in svc_y_pred]

# Evaluate model
svc_y_test_labels = [le_name_mapping[x] for x in encoded_y_test]
print(f"Classification report:\n{classification_report(svc_y_test_labels, svc_y_pred_labels)}")

Classification report:
                precision    recall  f1-score   support

Medical Doctor       0.50      0.09      0.15        99
         Other       0.87      0.84      0.85       608
  Veterinarian       0.61      0.84      0.71       276

      accuracy                           0.76       983
     macro avg       0.66      0.59      0.57       983
  weighted avg       0.76      0.76      0.74       983



# SVM perfomed very well for this task

In [40]:
# SAVE-LOAD using pickle 
import pickle

# save
with open('model.pkl','wb') as f:
    pickle.dump(clf,f)