In [1]:
import argparse
import datetime
import json
import os

import boto3
import joblib
import pandas as pd
from botocore.exceptions import ClientError
from imblearn.over_sampling import SMOTE
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import LinearSVC

# Run timestamp, captured once when this code is executed; it is embedded in
# the names of the model and evaluation files written further down.
current_datetime = f"{datetime.datetime.now():%Y-%m-%d_%H-%M-%S}"


class DataLoader:
    """Loads the labelled corpus from a folder of CSV files."""

    @staticmethod
    def load_data(folder_path):
        """Load every ``*.csv`` file in *folder_path* into one DataFrame.

        Files are read in sorted name order so that the integer ``category_id``
        assigned to each class is reproducible between runs. A file that cannot
        be parsed is reported and skipped. Rows whose ``text`` is null are
        removed, and a ``category_id`` column is derived from ``class``.

        Args:
            folder_path: Directory containing the CSV files. Each file must
                provide ``text`` and ``class`` columns.

        Returns:
            The concatenated DataFrame with an added ``category_id`` column.
            The per-file row index is kept as-is (it is not reset).

        Raises:
            ValueError: If no CSV file in the folder could be read.
        """
        csv_names = sorted(f for f in os.listdir(folder_path) if f.endswith(".csv"))

        frames = []
        for name in csv_names:
            file_path = os.path.join(folder_path, name)
            try:
                frames.append(pd.read_csv(file_path, sep=","))
            except Exception as exc:
                # Best effort: one broken file must not abort the whole load,
                # but it must not be silently replaced by stale data either.
                print(f"Skipping unreadable CSV file {file_path}: {exc}")

        if not frames:
            raise ValueError(f"No readable CSV files found in {folder_path!r}")

        combined_df = pd.concat(frames)
        # dropna works row by row. Dropping by index label would also remove
        # valid rows from other files that share the same label after concat.
        combined_df = combined_df.dropna(subset=["text"])
        combined_df["category_id"] = combined_df["class"].factorize()[0]
        return combined_df


class DataPreprocessor:
    """Builds TF-IDF features from the ``text`` column of a DataFrame."""

    def __init__(self, min_df=5, ngram_range=(1, 2), stop_words="english"):
        """Create the vectorizer; the three arguments are forwarded to it."""
        vectorizer_options = {
            "sublinear_tf": True,
            "min_df": min_df,
            "norm": "l2",
            "encoding": "latin-1",
            "ngram_range": ngram_range,
            "stop_words": stop_words,
        }
        self.tfidf = TfidfVectorizer(**vectorizer_options)

    def preprocess_data(self, df):
        """Fit the vectorizer on ``df.text`` and return features and labels.

        Returns:
            A ``(features, labels, tfidf)`` tuple: the fitted-and-transformed
            text matrix, the ``category_id`` column, and the fitted vectorizer.
        """
        text_matrix = self.tfidf.fit_transform(df.text)
        targets = df.category_id
        return text_matrix, targets, self.tfidf


class ModelTrainer:
    """Tunes, trains and calibrates a LinearSVC text classifier."""

    def __init__(self, param_grid=None, scoring="f1_macro", random_state=42):
        """Store the search configuration.

        Args:
            param_grid: Grid passed to ``GridSearchCV``; a default LinearSVC
                grid is used when this is omitted (or falsy).
            scoring: Scoring name passed to ``GridSearchCV``.
            random_state: Seed used for both the train/test split and SMOTE so
                that a run can be reproduced.
        """
        self.param_grid = param_grid or {
            "C": [0.5, 1, 5, 10, 15, 20],
            "loss": ["squared_hinge"],
            "penalty": ["l2"],
            "multi_class": ["ovr"],
            "max_iter": [100000],
            "class_weight": ["balanced"],
        }
        self.scoring = scoring
        self.random_state = random_state

    def train_model(self, features, labels, test_size):
        """Train the model and return the trained model and test data.

        Steps: stratified train/test split, SMOTE oversampling of the training
        part only, grid search over ``self.param_grid``, then a LinearSVC and a
        sigmoid-calibrated wrapper are both fitted with the best parameters.

        Args:
            features: Feature matrix.
            labels: Target labels aligned with *features*.
            test_size: Value forwarded to ``train_test_split``.

        Returns:
            ``(best_model, calibrated_model, x_test, y_test, best_params)``.
        """
        x_train, x_test, y_train, y_test = train_test_split(
            features,
            labels,
            test_size=test_size,
            random_state=self.random_state,
            stratify=labels,
        )

        print("Running sampling using SMOTE")

        # Seeded so the synthetic samples match between runs, in line with the
        # seeded split above.
        smote = SMOTE(random_state=self.random_state)
        x_train, y_train = smote.fit_resample(x_train, y_train)

        print("Running hyperparameter tunning")

        grid_search = GridSearchCV(LinearSVC(), self.param_grid, cv=5, scoring=self.scoring, n_jobs=-1)
        grid_search.fit(x_train, y_train)
        best_params = grid_search.best_params_
        print(best_params)
        print("Best metrics founded training on best metrics")

        best_model = LinearSVC(**best_params, verbose=0)
        print("Calibrating the model for probability estimation")

        calibrated_model = CalibratedClassifierCV(estimator=best_model, cv=5, method="sigmoid", n_jobs=-1)

        # Both objects are fitted: the plain model is returned for predictions
        # and the calibrated wrapper for probability estimates.
        best_model.fit(x_train, y_train)
        calibrated_model.fit(x_train, y_train)

        return best_model, calibrated_model, x_test, y_test, best_params


class ModelEvaluator:
    """Computes hold-out and cross-validation metrics for a classifier."""

    @staticmethod
    def evaluate_model(model, x_test, y_test, class_labels, features, labels, best_params):
        """Score *model* and return the metrics as a JSON-friendly dict.

        Hold-out metrics are computed from ``model.predict(x_test)`` against
        *y_test* (precision, recall and F1 are macro-averaged). Five-fold
        cross-validated accuracy is computed on *features* / *labels*.
        *best_params* is copied into the result unchanged.
        """
        print("Running model evaluation")

        predictions = model.predict(x_test)
        macro = {"average": "macro"}

        acc = accuracy_score(y_test, predictions)
        prec = precision_score(y_test, predictions, **macro)
        rec = recall_score(y_test, predictions, **macro)
        f1_macro = f1_score(y_test, predictions, **macro)
        conf_mat = confusion_matrix(y_test, predictions)

        report = classification_report(
            y_test.tolist(),
            predictions,
            target_names=class_labels,
            output_dict=True,
        )
        cv_scores = cross_val_score(model, features, labels, scoring="accuracy", cv=5)

        # Arrays are converted to lists so the result can be dumped as JSON.
        return {
            "accuracy_score": acc,
            "precision_score": prec,
            "recall_score": rec,
            "f1_score": f1_macro,
            "classification_report": report,
            "cross_validation_scores": cv_scores.tolist(),
            "confusion_matrix": conf_mat.tolist(),
            "best_parameter": best_params,
        }


class ModelSaver:
    """Writes the trained artifacts and evaluation results to disk."""

    @staticmethod
    def save_model(best_model, calibrated_model, tfidf, id_to_category, model_dir):
        """Save the trained model and related objects to the model directory.

        The four objects are bundled into one dict and dumped with joblib to
        ``model_<current_datetime>.joblib`` inside *model_dir*, which is created
        if it does not exist.
        """
        print("Dumping into a joblib file")

        model_dict = {
            "tfidf": tfidf,
            "prediction_model": best_model,
            "id_to_category": id_to_category,
            "probability_model": calibrated_model,
        }
        os.makedirs(model_dir, exist_ok=True)
        model_file_path = os.path.join(model_dir, f"model_{current_datetime}.joblib")
        joblib.dump(model_dict, model_file_path)

    @staticmethod
    def save_evaluation_results(evaluation_dict, model_dir):
        """Save the evaluation results to a JSON file in the model directory.

        Writes ``evaluation_<current_datetime>.json`` inside *model_dir*. The
        directory is created if missing, so this method can be called on its
        own, without a prior ``save_model`` call.
        """
        print("Saving evaluation results")
        os.makedirs(model_dir, exist_ok=True)
        evaluation_file_path = os.path.join(model_dir, f"evaluation_{current_datetime}.json")
        with open(evaluation_file_path, "w", encoding="utf-8") as f:
            json.dump(evaluation_dict, f)


def fetch_secret_list(secret_name, region_name, *args, **kwargs) -> dict:
    """Fetch a secret from AWS Secrets Manager and parse it as JSON.

    Args:
        secret_name: Value passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used to create the Secrets Manager client.
        *args, **kwargs: Accepted for call compatibility; not used here.

    Returns:
        The ``SecretString`` of the response, decoded with ``json.loads``.

    Raises:
        ClientError: Re-raised after being reported when the lookup fails.
    """
    print("Fetching secrets from secret manager")

    session = boto3.session.Session()
    client = session.client(service_name="secretsmanager", region_name=region_name)
    try:
        get_secret_value_response = client.get_secret_value(SecretId=secret_name)
    except ClientError as e:
        # print() does not apply %-style arguments, so format the text here.
        print(f"Error in fetching keys from aws secret manager: {e}")
        raise

    return json.loads(get_secret_value_response["SecretString"])


def download_file_from_s3(bucket_name, object_key):
    """Download one S3 object into the current working directory.

    Args:
        bucket_name: Name of the bucket holding the object.
        object_key: Key of the object; its base name becomes the local file name.

    Returns:
        The local path the object was written to.

    Raises:
        Exception: Any error raised while downloading is reported and re-raised.
    """
    print("Downloading files from s3")

    s3 = boto3.client("s3")
    file_name = os.path.basename(object_key)
    local_file_path = os.path.join(os.getcwd(), file_name)
    try:
        s3.download_file(bucket_name, object_key, local_file_path)
    except Exception as e:
        # print() does not apply %-style arguments, so format the text here.
        print(f"Error downloading file: {e}")
        raise

    print(f"File downloaded successfully : {local_file_path} ")
    return local_file_path

In [2]:

# Fraction of the data held out for testing; passed to ModelTrainer.train_model.
test_size = 0.2

# Folder of CSV files to load. The alternative folder is kept commented out.
# train_folder_path = "./training_data"
train_folder_path = "./smallerSet"
data_loader = DataLoader()
df = data_loader.load_data(train_folder_path)

# Fit the TF-IDF vectorizer on the loaded text and keep the fitted vectorizer.
data_preprocessor = DataPreprocessor()
features, labels, tfidf = data_preprocessor.preprocess_data(df)

# model_trainer = ModelTrainer()
# best_model, calibrated_model, x_test, y_test, best_params = model_trainer.train_model(features, labels, test_size)

# category_id_df = df[["class", "category_id"]].drop_duplicates().sort_values("category_id")
# id_to_category = dict(category_id_df[["category_id", "class"]].values)

# model_saver = ModelSaver()
# model_saver.save_model(best_model, calibrated_model, tfidf, id_to_category, "trained_model")

# class_labels = []
# max_index = max(id_to_category.keys())
# class_labels = [id_to_category.get(i) for i in range(max_index + 1)]

# model_evaluator = ModelEvaluator()
# eval_results = model_evaluator.evaluate_model(
#         best_model, x_test, y_test, class_labels, features, labels, best_params)


Running sampling using SMOTE
Running hyperparameter tunning




{'C': 10, 'class_weight': 'balanced', 'loss': 'squared_hinge', 'max_iter': 100000, 'multi_class': 'ovr', 'penalty': 'l2'}
Best metrics founded training on best metrics
Calibrating the model for probability estimation




Dumping into a joblib file
Running model evaluation




In [3]:
# Display the evaluation metrics dictionary.
eval_results

{'accuracy_score': 0.9921517331589274,
 'precision_score': 0.9859286296583468,
 'recall_score': 0.986754778689647,
 'f1_score': 0.9861973642773599,
 'classification_report': {'Appraisals': {'precision': 0.9883154819863681,
   'recall': 0.9950980392156863,
   'f1-score': 0.9916951636541279,
   'support': 1020.0},
  'Credit_Report': {'precision': 1.0,
   'recall': 1.0,
   'f1-score': 1.0,
   'support': 298.0},
  'Financial_Statement': {'precision': 0.9949494949494949,
   'recall': 0.9704433497536946,
   'f1-score': 0.9825436408977556,
   'support': 406.0},
  'PFS': {'precision': 0.993421052631579,
   'recall': 0.967948717948718,
   'f1-score': 0.9805194805194807,
   'support': 156.0},
  'Rent_Roll_or_Operating_Statement': {'precision': 0.9425287356321839,
   'recall': 0.9879518072289156,
   'f1-score': 0.9647058823529412,
   'support': 83.0},
  'Tax_Return': {'precision': 0.9963570127504554,
   'recall': 0.9990867579908675,
   'f1-score': 0.9977200182398541,
   'support': 1095.0},
  'acc

In [None]:
# data["class"].unique()

# interrupt training in next cell

In [None]:
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
import joblib
import os

# Train on the TF-IDF features and labels produced earlier (no sample data is
# generated here; make_classification is imported but not used).
X, y = features,labels
# Flag read by the resume cell: stays False until the final model is saved.
isComplete=False
# Initialize LinearSVC model, limited to a single solver iteration per fit call
model = LinearSVC(max_iter=1)
# Number of passes of the outer loop below
max_iter=100
# Checkpoint directory
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Training loop with checkpointing
# NOTE(review): every pass calls model.fit(X, y) on the same estimator. LinearSVC
# exposes no partial_fit, so each call presumably retrains from scratch with
# max_iter=1 instead of continuing the previous pass. Confirm; an estimator
# with partial_fit (e.g. SGDClassifier) may be what was intended.
checkpoint_interval = 10
for epoch in range(max_iter):
    # Train for one epoch
    model.fit(X, y)
    
    # Checkpointing: dump the model every `checkpoint_interval` passes
    if epoch % checkpoint_interval == 0:
        checkpoint_filename = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pkl")
        joblib.dump(model, checkpoint_filename)
        print(f"Checkpoint saved at epoch {epoch}")

# Save final model
final_model_filename = "final_model.pkl"
isComplete=True
joblib.dump(model, final_model_filename)
print("Final model saved")


In [None]:
import os
import re


def checkpoint_epoch(file_name):
    """Return the epoch encoded in ``checkpoint_epoch_<N>.pkl``, else ``None``."""
    match = re.fullmatch(r"checkpoint_epoch_(\d+)\.pkl", file_name)
    return int(match.group(1)) if match else None


def sort_checkpoints(file_names):
    """Return the checkpoint file names in *file_names*, ordered by epoch.

    Names that do not follow the ``checkpoint_epoch_<N>.pkl`` pattern are
    ignored. Ordering is numeric: a plain alphabetical sort would place
    ``checkpoint_epoch_100.pkl`` before ``checkpoint_epoch_20.pkl``.
    """
    checkpoints = [name for name in file_names if checkpoint_epoch(name) is not None]
    return sorted(checkpoints, key=checkpoint_epoch)


# Specify the directory path
folder_path = './checkpoints'

# List all files in the directory (a missing directory is treated as empty)
files = os.listdir(folder_path) if os.path.isdir(folder_path) else []

# Sort the checkpoint files by epoch number
sorted_files = sort_checkpoints(files)
last_file=""
# Select the last file, i.e. the checkpoint with the highest epoch
if sorted_files:
    last_file = sorted_files[-1]
    print("Last file in the folder:", last_file)
else:
    print("The folder is empty.")

In [None]:
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
import joblib
import os
import re



# Load the last checkpointed model and continue the loop, but only when a
# checkpoint exists and the training cell did not run to completion.
checkpoint_dir = "checkpoints"
if last_file and not isComplete:
    latest_checkpoint = last_file
    # Parse the epoch from the file name. A fixed-width slice breaks for
    # one-digit ("_0") and three-digit epoch numbers.
    epoch_match = re.search(r"(\d+)\.pkl$", last_file)
    if epoch_match is None:
        raise ValueError(f"Cannot parse an epoch number from checkpoint name {last_file!r}")
    last = int(epoch_match.group(1))
    print(last)
    model = joblib.load(os.path.join(checkpoint_dir, latest_checkpoint))

    # Load the dataset
    X, y = features,labels

    # Resume training
    # NOTE(review): model.fit presumably retrains from scratch on every call
    # (LinearSVC exposes no partial_fit), so the loaded state may not be
    # carried forward. Confirm before relying on these checkpoints.
    for epoch in range(last+1,max_iter):  #Start from the next epoch after the last checkpoint
        # Train for one epoch
        model.fit(X, y)

        # Checkpointing (if desired)
        if epoch % checkpoint_interval == 0:
            checkpoint_filename = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}.pkl")
            joblib.dump(model, checkpoint_filename)
            print(f"Checkpoint saved at epoch {epoch}")

    # Save final model
    final_model_filename = "final_model.pkl"
    isComplete=True
    joblib.dump(model, final_model_filename)
    print("Final model saved")

In [None]:
# Display the TF-IDF feature matrix returned by preprocess_data.
features

In [None]:
import pickle

# Read the final model back from the working directory. joblib.load accepts the
# file name directly and opens and closes the file itself.
model = joblib.load("final_model.pkl")

In [None]:

# Build the class-name list ordered by category id; ids without a mapping
# contribute None.
class_labels = []
max_index = max(id_to_category.keys())
class_labels.extend(id_to_category.get(i) for i in range(max_index + 1))

# Evaluate the loaded model against the held-out split.
model_evaluator = ModelEvaluator()
eval_results = model_evaluator.evaluate_model(
    model=model,
    x_test=x_test,
    y_test=y_test,
    class_labels=class_labels,
    features=features,
    labels=labels,
    best_params=best_params,
)


In [None]:
# Display the metrics produced by the evaluation cell above.
eval_results

In [None]:
# Show the hyperparameters selected by the grid search.
print(best_params)

In [None]:
class DataPointHitTracker:
    """Records which data points have been passed to :meth:`fit`.

    This is a plain helper class: ``sklearn.base`` provides no ``Callback``
    base class to derive from, so the tracker must be called explicitly.
    """

    def __init__(self):
        # One tuple per distinct row seen so far.
        self.data_hit_flags = set()

    def fit(self, X, y):
        """Add every row of *X* to ``data_hit_flags`` and return ``self``.

        Args:
            X: Iterable of rows. Rows exposing ``toarray`` (e.g. rows of a
                SciPy sparse matrix) are densified first, because a sparse
                row cannot be turned into a tuple of values directly.
            y: Unused; accepted so the signature matches an estimator's fit.
        """
        for row in X:
            if hasattr(row, "toarray"):
                row = row.toarray().ravel().tolist()
            self.data_hit_flags.add(tuple(row))
        return self

    def on_epoch_end(self, estimator, X, y):
        """Placeholder hook; intentionally does nothing."""
        pass


In [None]:
from sklearn.svm import LinearSVC

# Create the tracker instance
tracker = DataPointHitTracker()

# LinearSVC.fit takes no `callback` argument, so the tracker is run explicitly
# over the same data before the model is trained.
tracker.fit(X, y)
model = LinearSVC().fit(X, y)

# Access the hit flags after training
hit_flags = tracker.data_hit_flags

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

class IterationCallback(BaseEstimator, TransformerMixin):
    """Counts ``partial_fit`` calls and records the row positions seen."""

    def __init__(self):
        self.iteration_count = 0
        self.iterated_records = []

    def fit(self, X, y=None):
        """Reset the counters and return ``self``."""
        self.iteration_count = 0
        self.iterated_records = []
        return self

    def partial_fit(self, X, y=None):
        """Register one pass over *X* and return ``self``."""
        self.iteration_count += 1
        # Sparse matrices do not support len(); use the row count from .shape
        # when it is available.
        n_records = X.shape[0] if hasattr(X, "shape") else len(X)
        # Keep track of the indices of records that have been iterated over
        self.iterated_records.extend(range(n_records))
        return self

    def transform(self, X):
        """Return *X* unchanged."""
        return X


# Example usage
X_train = features  # Your training data
y_train = labels  # Your training labels

# Initialize SGDClassifier and the tracking helper
sgd_clf = SGDClassifier(loss='log_loss', random_state=42)
callback = IterationCallback()

# Train the classifier. SGDClassifier.partial_fit has no `callback` argument,
# so the tracker is updated explicitly for the same batch.
sgd_clf.partial_fit(X_train, y_train, classes=np.unique(y_train))
callback.partial_fit(X_train, y_train)

# Access the iteration count and iterated records
print("Total iterations:", callback.iteration_count)
print("Records iterated over:", callback.iterated_records)

In [None]:
from sklearn.linear_model import SGDClassifier
import numpy as np

# Initialize SGDClassifier
sgd_clf = SGDClassifier(loss='log_loss', random_state=42)


X_train=features
y_train=labels
# Set parameters for training
epochs = 10
batch_size = 32
total_records = X_train.shape[0]
total_iterations = 0
iterated_records = []

# Computed once: the full set of classes is the same for every batch.
all_classes = np.unique(y_train)
# Plain array of labels so batches are always sliced by position, whatever
# index the label Series carries.
y_values = np.asarray(y_train)

# Training loop: `epochs` passes over the data in fixed-order mini-batches
for epoch in range(epochs):
    for batch_start in range(0, total_records, batch_size):
        batch_end = min(batch_start + batch_size, total_records)
        X_batch = X_train[batch_start:batch_end]
        y_batch = y_values[batch_start:batch_end]
        # Train on the current batch
        sgd_clf.partial_fit(X_batch, y_batch, classes=all_classes)

        # Update iteration tracking
        total_iterations += 1
        iterated_records.extend(range(batch_start, batch_end))

# Access iteration count and iterated records
print("Total iterations:", total_iterations)
print("Records iterated over:", iterated_records)

In [None]:
# View the TF-IDF matrix as a DataFrame. It is stored under its own name so the
# corpus DataFrame `df` (whose "class" and "category_id" columns are needed to
# build the id-to-category mapping) is not overwritten. from_spmatrix is the
# pandas constructor for SciPy sparse input.
features_df = pd.DataFrame.sparse.from_spmatrix(features)
features_df

In [None]:
# Display the DataFrame currently bound to `df`.
df

In [None]:
import scipy.sparse as sp
# Check whether the feature matrix is a SciPy sparse matrix.
sp.issparse(features)

In [None]:

# Tune, train and calibrate the classifier.
model_trainer = ModelTrainer()
best_model, calibrated_model, x_test, y_test, best_params = model_trainer.train_model(
    features=features, labels=labels, test_size=test_size
)

# One row per class, ordered by its integer id, then an id -> class-name map.
category_id_df = (
    df[["class", "category_id"]]
    .drop_duplicates()
    .sort_values("category_id")
)
id_to_category = dict(category_id_df[["category_id", "class"]].values)

# Persist both models together with the vectorizer and the label mapping.
model_saver = ModelSaver()
model_saver.save_model(
    best_model=best_model,
    calibrated_model=calibrated_model,
    tfidf=tfidf,
    id_to_category=id_to_category,
    model_dir="trained_model",
)

# Class names ordered by category id; ids without a mapping contribute None.
class_labels = []
max_index = max(id_to_category.keys())
class_labels.extend(id_to_category.get(i) for i in range(max_index + 1))

# Score the uncalibrated model on the held-out split.
model_evaluator = ModelEvaluator()
eval_results2 = model_evaluator.evaluate_model(
    model=best_model,
    x_test=x_test,
    y_test=y_test,
    class_labels=class_labels,
    features=features,
    labels=labels,
    best_params=best_params,
)