In [4]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import mlflow

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

# Module-level logger; INFO and above are emitted through the root handler.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# NOTE(review): this silences *every* warning raised in the kernel, including
# deprecation and convergence warnings — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")


* 'schema_extra' has been renamed to 'json_schema_extra'


In [9]:
class TicketProcessor:
    """Load the processed ticket data and turn it into TF-IDF features.

    The methods are meant to be called in order: ``read_data`` ->
    ``feature_extraction`` -> ``split_data``. Each step stores its result on
    the instance and also returns it.
    """

    def __init__(self, data_path: str):
        """Remember the data directory and create an unfitted vectorizer.

        Args:
            data_path: Directory containing the files later passed to
                ``read_data`` and ``read_labels_dict``.
        """
        self.data_path = data_path
        self.vectorizer = TfidfVectorizer()

    def read_data(self, file_name: str) -> pd.DataFrame:
        """Read ``file_name`` (CSV) from ``data_path``.

        Args:
            file_name: Name of the CSV file, relative to ``data_path``.

        Returns:
            The loaded frame, also stored as ``self.data_tickets``.
        """
        self.data_tickets = pd.read_csv(os.path.join(self.data_path, file_name))
        return self.data_tickets

    def feature_extraction(
        self,
        text_column: str,
        target_column: str = "categoria_producto_servicio_encoded",
    ) -> tuple:
        """Fit the TF-IDF vectorizer on ``text_column`` and build ``X`` / ``y``.

        Requires ``read_data`` to have been called first.

        Args:
            text_column: Column of ``self.data_tickets`` holding the documents.
            target_column: Column of ``self.data_tickets`` holding the labels.

        Returns:
            ``(X, y)`` where ``X`` is the TF-IDF document-term matrix and
            ``y`` is the target column; both are also stored on the instance.
        """
        # An empty text field in a CSV is read back by pandas as NaN, and
        # TfidfVectorizer refuses NaN documents; treat those as empty strings.
        documents = self.data_tickets[text_column].fillna("")
        # Single pass; equivalent to fit() followed by transform().
        self.X = self.vectorizer.fit_transform(documents)
        self.y = self.data_tickets[target_column]
        return self.X, self.y

    def split_data(
        self,
        test_size: float = 0.2,
        random_state: int = 42,
        stratify: bool = False,
    ) -> tuple:
        """Split ``self.X`` / ``self.y`` into train and test partitions.

        Requires ``feature_extraction`` to have been called first.

        Args:
            test_size: Forwarded to ``train_test_split``.
            random_state: Seed forwarded to ``train_test_split``.
            stratify: When True, split so that both partitions keep the label
                proportions of ``self.y``. Defaults to False, which is the
                plain shuffled split.

        Returns:
            ``(X_train, X_test, y_train, y_test)``, also stored on the instance.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=random_state,
            stratify=self.y if stratify else None,
        )
        return self.X_train, self.X_test, self.y_train, self.y_test

    def read_labels_dict(self, file_name: str) -> dict:
        """Load a JSON file from ``data_path``.

        Args:
            file_name: Name of the JSON file, relative to ``data_path``.

        Returns:
            The parsed JSON content, also stored as ``self.labels_dict``.
        """
        file_path = os.path.join(self.data_path, file_name)
        # JSON is UTF-8 by specification; without an explicit encoding open()
        # falls back to the locale codec and can garble non-ASCII labels.
        with open(file_path, "r", encoding="utf-8") as f:
            self.labels_dict = json.load(f)
        return self.labels_dict
       

In [14]:
class TicketClassification:
    """Set up the MLflow experiment and load the vectorized ticket data.

    Construction does all the work: it points MLflow at the tracking store,
    selects the experiment, reads the processed tickets and extracts TF-IDF
    features through ``TicketProcessor``.
    """

    def __init__(
        self,
        set_experiment_name: str = "tickets_classification",
        data_path: str = "data/data_processed",
        file_name: str = "tickets_servicio_processed.csv",
        text_column: str = "processed_text",
        tracking_uri: str = "sqlite:///mlflow.db",
    ):
        """Configure MLflow, then load and vectorize the tickets.

        Args:
            set_experiment_name: Name passed to ``mlflow.set_experiment``.
            data_path: Directory handed to ``TicketProcessor``.
            file_name: CSV file read from ``data_path``.
            text_column: Column of the CSV that is vectorized.
            tracking_uri: URI passed to ``mlflow.set_tracking_uri``.
        """
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(set_experiment_name)

        self.ticket_processor = TicketProcessor(data_path=data_path)
        self.data_ticket = self.ticket_processor.read_data(file_name=file_name)
        self.X, self.y = self.ticket_processor.feature_extraction(text_column=text_column)

        # Vocabulary size learned by the fitted vectorizer.
        print(len(self.ticket_processor.vectorizer.vocabulary_))




# Entry point: constructing TicketClassification runs its whole __init__
# pipeline (MLflow setup, data load, feature extraction, vocabulary-size print).
if __name__ == "__main__":
    TicketClassification()



  (0, 60)	0.3478490927580123
  (0, 53)	0.3478490927580123
  (0, 48)	0.3478490927580123
  (0, 39)	0.3478490927580123
  (0, 38)	0.3478490927580123
  (0, 23)	0.3478490927580123
  (0, 20)	0.3478490927580123
  (0, 14)	0.17890799126763965
  (0, 2)	0.3478490927580123
65
