In [1]:
from modules.create_csv import crear_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import numpy as np
import string
import spacy

Para instalar spaCy
```
python -m venv .env
.env\Scripts\activate
pip install -U pip setuptools wheel
pip install -U spacy
python -m spacy download en_core_web_sm
python -m spacy download es_core_news_sm
```

In [2]:
datos = crear_csv("./data/frases.json")
datos.head()

Unnamed: 0,reclamo,etiqueta
0,Falta agua en el dispenser del módulo 1.,maestranza
1,la compu 23 del laboratorio 3 se tilda.,soporte informático
2,La ventana del Departamento informática quedó ...,maestranza
3,la cerradura del laboratorio de física se rompió,secretaría técnica
4,la pileta del baño del módulo 1 pierde agua,secretaría técnica


In [3]:
X = datos['reclamo']
y = datos['etiqueta']

In [4]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)

In [18]:
frase = X_train[2]

Comparo tokenización nltk con spacy

In [58]:
word_tokenize(frase)

['La',
 'ventana',
 'del',
 'Departamento',
 'informática',
 'quedó',
 'abierta',
 'y',
 'está',
 'todo',
 'sucio']

In [66]:
nlp = spacy.load('es_core_news_sm', exclude=["ner"])
doc = nlp(frase)
tokens = [token.text for token in doc]
tokens

['La',
 'ventana',
 'del',
 'Departamento',
 'informática',
 'quedó',
 'abierta',
 'y',
 'está',
 'todo',
 'sucio']

In [67]:
nlp.pipe_names

['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer']

Comparo stemming con lemmatization

In [45]:
#stemming
stop_words = set(stopwords.words('spanish'))
normalizer = SnowballStemmer('spanish')
[normalizer.stem(token) for token in tokens if token not in stop_words and token not in string.punctuation]

['la', 'ventan', 'departament', 'informat', 'qued', 'abiert', 'suci']

In [68]:
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode) 

rule


In [69]:
#Lemmatization
[token.lemma_ for token in doc]

['el',
 'ventana',
 'del',
 'Departamento',
 'informática',
 'quedar',
 'abierto',
 'y',
 'estar',
 'todo',
 'sucio']

In [71]:
class TextVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.__word2idx = {}
        self.__nlp = spacy.load("es_core_news_sm", exclude=["ner"])

    def __get_tokens(self, texto):        
        texto = texto.lower()
        doc = self.__nlp(texto)
        word_tokens = [token.lemma_ for token in doc]
        # tokens = word_tokenize(texto)    
        # word_tokens = [self.spanish_normalizer.stem(token) for token in tokens\
        #                     if token not in self.stop_words and token not in string.punctuation]
        return ' '.join(word_tokens)

    # Text to Vector
    def __text_to_vector(self, texto):
        word_vector = np.zeros(len(self.vocabulario_))
        texto = self.__get_tokens(texto) #agrego esta linea
        for word in texto.split(" "):
            if self.__word2idx.get(word) is None:
                continue
            else:
                word_vector[self.__word2idx.get(word)] += 1
        return np.array(word_vector)

    def fit(self, X, y=None):
        X_procesado = []
        for reclamo in X:
            X_procesado.append(self.__get_tokens(reclamo))

        total_counts = Counter()
        for reclamo in X_procesado:
            for word in reclamo.split(" "):
                total_counts[word] += 1
        self.vocabulario_ = [elem[0] for elem in total_counts.most_common()]
        for i, word in enumerate(self.vocabulario_):
            self.__word2idx[word] = i 

        return self

    def transform(self, X, y=None):        
        word_vectors = np.zeros((len(X), len(self.vocabulario_)), dtype=np.int_)
        for i, texto in enumerate(X):
            word_vectors[i] = self.__text_to_vector(texto)

        return word_vectors

In [74]:
pipe = Pipeline([
            ('vectorizer', TextVectorizer()),
            ('scaler', StandardScaler()),
            ('classifier', RandomForestClassifier(max_depth=20, max_features='log2', n_estimators=10))
        ])
clf = pipe.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8947368421052632

In [75]:
grid = [
    {
        # 'classifier': [SVC()],
        'classifier__C': [0.01, 0.1, 0.5, 1, 5, 10, 15, 100], 
        'classifier__kernel': ['rbf', 'sigmoid', 'linear', 'poly'], 
        'classifier__gamma': [0.001, 0.01, 0.05, 0.1, 0.5, 1],
        'classifier__degree': [2, 3]
    },
    # {
    #     'classifier': [MLPClassifier()],
    #     'classifier__hidden_layer_sizes': [(5,), (10,), (5, 5), (10, 5), (10, 10)],
    #     'classifier__activation': ['tanh', 'relu'],
    #     'classifier__solver': ['adam'],
    #     'classifier__alpha': [0.0001, 0.001, 0.01],
    #     'classifier__learning_rate': ['constant', 'adaptive'],
    #     'classifier__max_iter': [200, 400, 600]
    # },
    # {
    #     'classifier': [RandomForestClassifier(random_state=42)],
    #     'classifier__random_state': [0,10, 100, 42],
    #     'classifier__n_estimators': [5, 10, 15, 20],
    #     'classifier__max_features': ['sqrt', 'log2'],
    #     'classifier__max_depth': [None, 5, 10, 20],
    #     'classifier__min_samples_split': [2, 5, 10],
    #     'classifier__min_samples_leaf': [1, 2, 4]
    # }
]

pipe = Pipeline([
    ("vectorizer", TextVectorizer()),
    ("scaler", StandardScaler()),
    ("classifier", SVC())
])

grid_search = GridSearchCV(pipe, grid, cv=3, n_jobs=-1, verbose=1)

In [76]:
best_clf = grid_search.fit(X_train, y_train)
best_clf

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


KeyboardInterrupt: 

In [None]:
best_clf.score(X_test, y_test)