<a href="https://colab.research.google.com/github/Kile-kun/ML-Projects/blob/main/ML_Assessment(finch).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#required library to be installed
#!pip install -q -U google-generativeai,
!pip install emoji



In [2]:
# load all required library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import psycopg2
import emoji
import pickle
import os

#import google.generativeai as genai
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [3]:
# from google.colab import userdata
# GOOGLE_API_KEY = userdata.get("GOOGLE_API_KEY")

In [4]:
# all functions

def clean_text(text):
  """
  This function cleans the text data for better classification.

  The text is lowercased, every emoji is replaced by its textual alias via
  emoji.demojize (the emoji is converted to words, not dropped), and all
  characters listed in string.punctuation are removed. The alias delimiters
  and underscores are punctuation as well, so an alias ends up as one
  run-together word.

  Args:
      text: A string containing the text to be cleaned. Non-string values
          (for example None or NaN coming from a NULL database column) are
          treated as empty text.

  Returns:
      A string containing the cleaned text; "" for non-string input.
  """
  import string

  # NULL comments arrive as None/NaN; return empty text instead of failing
  # with AttributeError on .lower().
  if not isinstance(text, str):
    return ""

  # Lowercase the text
  text = text.lower()

  # Convert emojis to their text aliases
  text = emoji.demojize(text)

  # Remove punctuation in a single pass
  text = text.translate(str.maketrans("", "", string.punctuation))

  # Additional cleaning steps (e.g., stop-word removal, stemming,
  # lemmatization) can be added here

  return text
# The Gemini API kept exceeding its rate limit, so the SQL-generated category is used as the label instead.
# def categorize_text(text, model_name="gemini-pro"):
#   """
#   This function categorizes the text based on your labeling scheme.

#   Args:
#       text: The text to be categorized.
#       model_name: The name of the Gemini model to use (default: "gemini-pro").

#   Returns:
#       The category ("Medical Doctor", "Veterinarian", or "Other").
#   """
#   # Configure the API with your API key
#   genai.configure(api_key=GOOGLE_API_KEY)

#   # Load the desired Gemini model
#   model = genai.GenerativeModel(model_name)

#   # Craft a prompt for Gemini to analyze the text and suggest a category
#   prompt = f""" Classify the provided text snippet into one of the following categories:
#   Medical Doctor: This category includes text written by or directly relevant to practicing medical doctors and consultants who advise doctors or clinics.
#   Examples: Doctors discussing diagnoses, treatments, or patient care specific to humans. Consultants providing medical advice to doctors or clinics.
#   Not Included: Medical students, nurses, or other healthcare professionals who are not medical doctors.
#   Veterinarian : This category includes text written by or directly relevant to practicing veterinarians and consultants who advise veterinarians or clinics.
#   Examples: Veterinarians discussing diagnoses, treatments, or animal care. Consultants providing medical advice to veterinarians or clinics.
#   Not Included: Veterinary students, veterinary technicians, or other animal healthcare professionals who are not veterinarians.
#   Other: This category includes any text that doesn't fit into the "Medical Doctor" or "Veterinarian" categories, but might still be relevant to health or medicine in a broader sense.
#   Examples: Text written by medical or veterinary students, nurses, technicians, or other healthcare professionals (who are not doctors or veterinarians).
#   Discussions of work-life balance or mental health in the context of healthcare professions. Information about healthcare industries or challenges faced by healthcare professionals (without mentioning specific medical practices).
#   Personal anecdotes or general health information not specific to humans or animals.
#   Remember:

#   If a text snippet mentions both human and animal healthcare, classify it based on the dominant focus.
#   Focus on the content and intended audience of the text, not the author's profession (unless the profession is directly relevant to the content).


#   Text: {text}
#   """

#   # Generate content using the prompt
#   response = model.generate_content(prompt)

#   # Extract the suggested category from the response (assuming specific format)
#   category = response.text.strip()

#   return category

# def categorize_series(text_series, model_name="gemini-pro"):
#   """
#   This function categorizes each text in a Pandas Series using Gemini LLM.

#   Args:
#       text_series: A Pandas Series containing text data.
#       model_name: The name of the Gemini model to use (default: "gemini-pro").

#   Returns:
#       A new Series containing the categorized labels.
#   """
#   return text_series.apply(categorize_text, args=(model_name,))


def build_classifier(X_train, y_train):
  """
  Fit a TF-IDF + logistic-regression text classifier on the training data.

  Args:
      X_train: An iterable of strings with the training texts; each one is
          passed through clean_text before vectorization.
      y_train: The labels corresponding to X_train.

  Returns:
      A tuple (model, vectorizer): the fitted LogisticRegression and the
      fitted TfidfVectorizer needed to transform text for it.
  """
  cleaned_docs = list(map(clean_text, X_train))

  # The vectorizer is fitted here and must be reused for any later prediction
  vectorizer = TfidfVectorizer(max_features=1000)
  features = vectorizer.fit_transform(cleaned_docs)

  model = LogisticRegression(solver='lbfgs')
  model.fit(features, y_train)

  return model, vectorizer

def evaluate_classifier(model, vectorizer, X_test, y_test):
  """
  Report how a fitted classifier performs on held-out data.

  Args:
      model: The trained classifier model.
      vectorizer: The fitted TF-IDF vectorizer that produced the model's
          training features.
      X_test: An iterable of strings with the testing texts; each one is
          passed through clean_text before being vectorized.
      y_test: The labels corresponding to X_test.

  Prints:
      The accuracy (four decimal places) followed by the classification report.
      Nothing is returned.
  """
  cleaned_docs = list(map(clean_text, X_test))

  # Only transform here: the vectorizer was already fitted on the training data
  predictions = model.predict(vectorizer.transform(cleaned_docs))

  score = accuracy_score(y_test, predictions)
  print(f"Accuracy: {score:.4f}")
  print(classification_report(y_test, predictions))

def build_and_save_pipeline(X_train, y_train, filename="my_text_classifier.pkl"):
  """
  This function builds, trains, and saves a text classification pipeline.

  The pipeline consists of a TF-IDF vectorizer followed by logistic
  regression. It does not call clean_text itself, so the texts passed here,
  and later to the saved pipeline's predict, should already be cleaned the
  same way.

  Args:
      X_train: A list of strings containing the training data (text).
      y_train: A list of labels corresponding to the training data.
      filename: Name of the pickle file to write. A relative name is placed
          in the current working directory; an absolute path is used as is.
          Defaults to "my_text_classifier.pkl".

  Prints:
      The full path the pipeline was saved to.
  """
  # Define the pipeline steps
  pipeline = Pipeline([
      ('tfidf', TfidfVectorizer(max_features=1000)),
      ('logistic_regression', LogisticRegression(solver='lbfgs'))
  ])

  # Train the pipeline
  pipeline.fit(X_train, y_train)

  # os.path.join keeps an absolute filename unchanged and otherwise anchors it
  # in the current working directory
  save_path = os.path.join(os.getcwd(), filename)

  # Save the pipeline using pickle
  with open(save_path, 'wb') as f:
    pickle.dump(pipeline, f)
  print(f"Pipeline saved to: {save_path}")

In [5]:
# load the data

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. PGUSER and PGPASSWORD must be set before running this
# cell (a KeyError is raised otherwise); the other settings have defaults.
# NOTE(review): the password that used to be hard-coded here has been published
# with the notebook and should be rotated.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "Vetassist"),
    user=os.environ["PGUSER"],
    password=os.environ["PGPASSWORD"],
    host=os.environ.get("PGHOST", "ep-delicate-river-a5cq94ee-pooler.us-east-2.aws.neon.tech"),
    port=os.environ.get("PGPORT", "5432")
)

try:
    # The with-block closes the cursor once the rows have been fetched
    with conn.cursor() as cur:
        # Fetch every comment with its subreddit and a keyword-based category.
        # The category is a heuristic label: '%vet%' already matches everything
        # that '%veterinarian%' matches, and also words that merely contain "vet".
        cur.execute("""
            select a.username, a.comments, b.subreddit as comment_label,
                    CASE WHEN LOWER(a.comments) LIKE '%doctor%' OR LOWER(a.comments) LIKE '%medical%' OR LOWER(a.comments) LIKE '%nurse%' THEN 'Medicine'
                    WHEN LOWER(a.comments) LIKE '%veterinarian%' OR LOWER(a.comments) LIKE '%vet%' THEN 'Veterinary'
                    ELSE 'Others'
                    END AS category
            from "Vetassist".public.reddit_usernames_comments a
            join "Vetassist".public.reddit_usernames b on a.username = b.username
            order by a.username
            """)

        rows = cur.fetchall()
finally:
    # Always release the connection, even if the query fails
    conn.close()

# Define column names for the DataFrame (same order as the select list)
columns = ['username', 'comments', 'comment_label', 'category']

# Create a DataFrame from the fetched rows
df = pd.DataFrame(rows, columns=columns)

# Print the DataFrame
print(df)

                username                                           comments  \
0            --solaris--  I think a lot of other people made some good p...   
1      -Mother_of_Doggos  No, and it doesn’t suit you based on what you’...   
2               -Tasear-  It's just the adjustment period to any new pla...   
3               -Zyonia-  I am living this currently and when a trainee ...   
4     -p-OodlesOfNoodles  Well that's disappointing that there is no cha...   
...                  ...                                                ...   
3271            zamozate  isnt the point to get paid for maintaining the...   
3272             ze_nite  I think its because it cant find the `server.c...   
3273        zebra_chaser  First, from the information you provided, it s...   
3274       zombievettech  Instrument bands.  Pick a color not used in pa...   
3275              zylinx  Yeah I realized this ages ago. Cool idea but n...   

         comment_label    category  
0           Ve

In [6]:
df['category'].value_counts()

category
Others        1992
Veterinary     842
Medicine       442
Name: count, dtype: int64

In [7]:
# Seeded so that the sample, and every result derived from it, is reproducible.
# Capped at len(df) so the cell still runs if the table has fewer than 2000 rows.
df2 = df.sample(n=min(2000, len(df)), random_state=42)
df2['comments'] = df2['comments'].apply(clean_text)

In [8]:
df2['category'].value_counts()

category
Others        1233
Veterinary     500
Medicine       267
Name: count, dtype: int64

In [9]:
# categories = categorize_series(df2['comments'].copy())
# df2['new_category']= categories

In [10]:
# Features are the cleaned comments; the target is the SQL-derived category
data = df2
X, y = data["comments"], data["category"]

# Hold out 30% for testing, keeping the class proportions equal in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Fit the classifier on the training split
model, vectorizer = build_classifier(X_train, y_train)

# Report accuracy and per-class metrics on the held-out split
evaluate_classifier(model, vectorizer, X_test, y_test)

Accuracy: 0.8500
              precision    recall  f1-score   support

    Medicine       0.79      0.53      0.63        80
      Others       0.88      0.97      0.93       370
  Veterinary       0.77      0.72      0.74       150

    accuracy                           0.85       600
   macro avg       0.82      0.74      0.77       600
weighted avg       0.84      0.85      0.84       600



In [11]:
df2.sample(5)

Unnamed: 0,username,comments,comment_label,category
2831,pkuba208,i watched the video and im a regular to his ch...,MysteriumNetwork,Others
2189,crumbledmoon,hi everyone\n\ni’m currently a senior in high ...,Veterinary,Veterinary
2789,onethunder26,can we have a lightweight app some people hav...,MysteriumNetwork,Others
2548,karriebean,as a tech i’m disappointed you don’t have a te...,Veterinary,Veterinary
1705,ThatGuy2956,i would recommend shotvet,Veterinary,Veterinary


In [12]:
# Save the Model
build_and_save_pipeline(X_train, y_train)

Pipeline saved to: /content/my_text_classifier.pkl


In [14]:
# Load the model from a pickle file
def load_model(filename):
  """
  Read a pickled object (such as the saved pipeline) back from disk.

  Args:
      filename: Path of the pickle file to read.

  Returns:
      The object stored in the file.
  """
  # NOTE: unpickling can execute arbitrary code; only load files you created or trust.
  with open(filename, mode='rb') as pickle_file:
    model = pickle.load(pickle_file)
  return model

# Load the Model
loaded_model = load_model("my_text_classifier.pkl")

In [15]:
# Use the loaded model on held-out data
# Sample only rows from the test split: the pipeline was fitted on X_train, so
# predictions on training rows would overstate how well it generalizes.
# Seeded so the displayed rows are the same on every run.
new_data = df2.loc[X_test.index].sample(5, random_state=42)
prediction = loaded_model.predict(new_data['comments'])
new_data['prediction'] = prediction
new_data

Unnamed: 0,username,comments,comment_label,category,prediction
1127,New_Paramedic_9351,edited,Veterinary,Others,Others
3,-Zyonia-,i am living this currently and when a trainee ...,Veterinary,Others,Others
2789,onethunder26,can we have a lightweight app some people hav...,MysteriumNetwork,Others,Others
2375,fuzzypyro,have you tried a different isp maybe a mobile ...,MysteriumNetwork,Others,Others
2021,athanzzz,as most vpn user nowaday is using vpn for serv...,MysteriumNetwork,Others,Others
