In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

In [2]:
# Import libraries for training and testing model
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.svm import SVC
import spacy
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
# Library to save / load models
import joblib

In [4]:
# Create local db connection
#
# Connection settings are read from environment variables so that credentials
# do not have to be edited into (and saved with) the notebook. The fallbacks
# are the local-development values this notebook was originally written with;
# set the WINTHROP_DB_* variables for any other database.
username = os.environ.get("WINTHROP_DB_USER", "postgres")
password = os.environ.get("WINTHROP_DB_PASSWORD", "test_passord_^1234")
db_host = os.environ.get("WINTHROP_DB_HOST", "localhost")
port = os.environ.get("WINTHROP_DB_PORT", '5432')
database = os.environ.get("WINTHROP_DB_NAME", "winthrop_db")

# Percent-encode the user name and password before placing them in the URL:
# characters such as '@', ':' or '/' would otherwise be read as URL delimiters
# and produce a wrong host or password. safe="" makes quote() encode '/' too.
engine = create_engine(
    f'postgresql+psycopg2://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{db_host}:{port}/{database}'
)

In [5]:
def getNonTargetData(target_category):
    """Load the jobs that are not predicted to belong to ``target_category``.

    Selects ``job_id`` and the concatenated title/description text from
    ``job_categorization_vw`` for rows whose ``predicted_category`` differs
    from ``target_category`` and labels every row as a negative example.

    Args:
        target_category: Category name to exclude.

    Returns:
        pandas.DataFrame with columns ``job_id``, ``description`` and ``tag``,
        where ``tag`` is 0 on every row.
    """
    # The category is sent as a bound parameter instead of being formatted
    # into the SQL text, so a name containing a quote can neither break the
    # statement nor inject SQL.
    # NOTE(review): in SQL, `!=` is never true for NULL, so rows with a NULL
    # predicted_category are not returned - confirm that is intended.
    non_target_query = text('''
    select job_id, concat(title, '. ', description ) as description from job_categorization_vw
        where predicted_category != :target_category
        ''')

    with engine.connect() as con:
        rs = con.execute(non_target_query, {"target_category": target_category})
        rows = rs.fetchall()

    non_target_data = pd.DataFrame(rows,columns=['job_id','description'])

    # Tag non-target data
    non_target_data['tag'] = 0

    return non_target_data

In [6]:
def getTargetData(target_category):
    """Load the confirmed examples of ``target_category``.

    Selects ``job_id`` and the concatenated title/description text from
    ``corrected_categorization_tb`` for rows with ``tag = 1`` whose
    ``predicted_category`` equals ``target_category`` and labels every row as
    a positive example.

    Args:
        target_category: Category name to select.

    Returns:
        pandas.DataFrame with columns ``job_id``, ``description`` and ``tag``,
        where ``tag`` is 1 on every row.
    """
    # The category is sent as a bound parameter instead of being formatted
    # into the SQL text, so a name containing a quote can neither break the
    # statement nor inject SQL.
    target_query = text('''
    select job_id, concat(title, '. ', description ) as description from corrected_categorization_tb
        where tag = 1 and predicted_category = :target_category
    ''')

    with engine.connect() as con:
        rs = con.execute(target_query, {"target_category": target_category})
        rows = rs.fetchall()

    target_data = pd.DataFrame(rows,columns=['job_id','description'])

    # Tag target data
    target_data['tag'] = 1

    return target_data

In [7]:
def combineData(target_data,non_target_data):
    """Stack the target and non-target frames into one data set.

    Args:
        target_data: DataFrame of positive examples.
        non_target_data: DataFrame of negative examples.

    Returns:
        pandas.DataFrame containing the rows of ``target_data`` followed by
        the rows of ``non_target_data``, with a fresh 0..n-1 index.
    """
    # ignore_index=True: both inputs carry their own 0..k index, so a plain
    # concat would produce duplicate labels and any later lookup of a test row
    # by its index label would hit the wrong (or more than one) row.
    full_data = pd.concat([target_data,non_target_data],axis=0,ignore_index=True)

    return full_data

In [8]:
def validation_metrics(y_test,y_pred):
    """Print accuracy, macro recall, macro F1 and the confusion matrix.

    Recall and F1 use ``average='macro'``, i.e. the unweighted mean of the
    per-class scores, so they are not the scores of the positive class alone.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # All scores are computed before anything is printed.
    overall_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    macro_recall = recall_score(y_test, y_pred, average='macro')
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    summary_lines = (
        f"Accuracy: {overall_accuracy}",
        f"Recall: {macro_recall}",
        f"F1 Score: {macro_f1}",
    )
    for summary_line in summary_lines:
        print(summary_line)
    print(conf_matrix)

In [9]:
def outputIncorrectrResults(y_test,y_pred,full_data):
    """Print every test row whose predicted tag differs from its actual tag.

    Args:
        y_test: pandas Series of actual tags. Its index labels are used to
            look the rows up in ``full_data``.
        y_pred: Predicted tags, positionally aligned with ``y_test``.
        full_data: DataFrame with ``job_id``, ``description`` and ``tag``
            columns, indexed by the same labels as ``y_test``.

    Raises:
        ValueError: If ``y_pred`` and ``y_test`` differ in length, or if the
            index of ``full_data`` contains duplicate labels (a label could
            then not be mapped back to a single row).
    """
    y_pred = np.asarray(y_pred)
    if len(y_pred) != len(y_test):
        raise ValueError(
            f"y_pred has {len(y_pred)} entries but y_test has {len(y_test)}"
        )
    if not full_data.index.is_unique:
        raise ValueError(
            "full_data has duplicate index labels; rebuild it with a unique "
            "index (e.g. pd.concat(..., ignore_index=True)) before calling"
        )

    # Compare predictions with actual labels (positional comparison)
    mismatch = y_pred != y_test.to_numpy()

    # Output incorrect predictions with job IDs.
    # y_test.index holds index *labels*, so the lookup must use .loc; .iloc
    # would treat them as positions and return unrelated rows. The copy keeps
    # the new column from being written into a slice of full_data.
    incorrect_predictions = full_data.loc[y_test.index[mismatch]].copy()
    incorrect_predictions['predicted_tag'] = y_pred[mismatch]  # Add predicted tags to the dataframe

    # Print or use incorrect predictions with job IDs
    print("Incorrect Predictions with Job IDs:")
    for _, row in incorrect_predictions.iterrows():
        print(f"Job ID: {row['job_id']}")
        print(f"Description: {row['description']}")
        print(f"Actual Tag: {row['tag']}, Predicted Tag: {row['predicted_tag']}")
    print()

In [10]:
def saveModel(model,target_category):
    """Write ``model`` to ``../models`` with joblib.

    The file is named ``<target_category>_classification_model_v2.pkl``; the
    path is relative to the current working directory.

    Args:
        model: Object to serialise.
        target_category: Category name inserted into the file name.
    """
    destination = '../models/{}_classification_model_v2.pkl'.format(target_category)
    joblib.dump(model, destination)

In [11]:
def saveCountVec(vec,target_category):
    """Write the vectorizer ``vec`` to ``../models`` with joblib.

    The file is named ``<target_category>_classification_vec_v2.pkl``; the
    path is relative to the current working directory.

    Args:
        vec: Object to serialise.
        target_category: Category name inserted into the file name.
    """
    destination = '../models/{}_classification_vec_v2.pkl'.format(target_category)
    joblib.dump(vec, destination)

In [12]:
def checkDistribution(y_train,y_test):
    """Print the class balance of the training and testing labels.

    Percentages per class are printed first (training, then testing),
    followed by the absolute counts per class in the same order.

    Args:
        y_train: pandas Series of training labels.
        y_test: pandas Series of testing labels.
    """
    # Share of each class within its split, expressed in percent
    train_pct, test_pct = (
        labels.value_counts(normalize=True) * 100 for labels in (y_train, y_test)
    )
    percentage_sections = (
        ("Training Set Distribution (%):", train_pct),
        ("\nTesting Set Distribution (%):", test_pct),
    )
    for heading, table in percentage_sections:
        print(heading)
        print(table)

    # Absolute number of rows per class within each split
    train_n, test_n = (labels.value_counts() for labels in (y_train, y_test))
    count_sections = (
        ("Training Set Counts:", train_n),
        ("\nTesting Set Counts:", test_n),
    )
    for heading, table in count_sections:
        print(heading)
        print(table)

In [13]:
# Target Category
# The category the binary classifier is trained to recognise (positive class, tag = 1).
target_category = 'Laboratory and Research'

# Positive examples (tagged 1) and negative examples (tagged 0) for that category
target_data = getTargetData(target_category)
non_target_data = getNonTargetData(target_category)

# Single frame holding both classes; this is what gets split into train/test below
full_data = combineData(target_data,non_target_data)

In [14]:
# Split data into training and testing sets
# 80/20 split with a fixed seed; stratify keeps the tag 0 / tag 1 ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    full_data['description'], full_data['tag'], test_size=0.2, random_state=42,stratify=full_data['tag'])

# Transform text data to Count features
# The vocabulary is learned from the training text only; the test text is
# transformed with that same vocabulary so no test information leaks into it.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)

# Train a Logistic Regression model count vectorizer
# (scikit-learn defaults are used for every hyper-parameter)
model = LogisticRegression()
model.fit(X_train_count, y_train)

# Make predictions
y_pred = model.predict(X_test_count)

# Baseline scores for the default predictions
validation_metrics(y_test, y_pred)

Accuracy: 0.9552238805970149
Recall: 0.7686622320768661
F1 Score: 0.8213333333333332
[[122   1]
 [  5   6]]


In [19]:
# Class probabilities for each test row; columns follow model.classes_ (the tags 0 and 1)
y_pred_prob = model.predict_proba(X_test_count)

In [23]:
# Adjust threshold
# Label a job as the target category when its predicted probability for tag 1
# (column 1 of y_pred_prob) reaches the threshold. This overwrites the y_pred
# produced by model.predict in the training cell.
# NOTE(review): the effect of this threshold is measured on the same test split
# as the baseline scores - confirm the chosen value on a separate validation split.
threshold = 0.3  # You can change this value
y_pred = (y_pred_prob[:,1] >= threshold).astype(int)

In [25]:
# Display the thresholded predictions (one 0/1 tag per test row)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0])

In [24]:
# Re-score the test set with the thresholded predictions, for comparison with the baseline scores
validation_metrics(y_test, y_pred)

Accuracy: 0.9701492537313433
Recall: 0.8595713229859572
F1 Score: 0.8919354838709678
[[122   1]
 [  3   8]]


In [15]:
# Save Model
#saveModel(model,target_category)

In [16]:
# Save Vec
#saveCountVec(count_vectorizer,target_category)