In [None]:
# https://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb

from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.preprocessing import RobustScaler, MinMaxScaler, MaxAbsScaler
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from src.models.train_model import fit_model, get_metrics, get_preds

In [None]:
# Filesystem layout, relative to the notebook's working directory.
DATA_DIR = '../data'
RAW_DIR = os.path.join(DATA_DIR, 'raw/')
EXTERNAL_DIR = os.path.join(DATA_DIR, 'external/')

# Input files read by the cells below.
TRAIN_DATA = os.path.join(RAW_DIR, 'train.csv')
# TEST_DATA is read by the prediction cell; without this definition that cell
# raises NameError on a fresh kernel.
# NOTE(review): file name assumed by analogy with train.csv — confirm against data/raw.
TEST_DATA = os.path.join(RAW_DIR, 'test.csv')
# Despite the _DIR suffix this is a file path; it is passed to get_stopwords.
STOPWORDS_DIR = os.path.join(EXTERNAL_DIR, 'stopwords.txt')

In [None]:
from src.features.build_features import get_stopwords

In [None]:
# Load the training set; the file uses '|' as its field separator.
df = pd.read_csv(TRAIN_DATA, sep='|')
df

In [None]:
# Stop-word collection that is later handed to TfidfVectorizer(stop_words=...).
# NOTE(review): presumably get_stopwords reads the words from the given file —
# confirm in src.features.build_features.
STOPWORDS = get_stopwords(STOPWORDS_DIR)

In [None]:
# Horizontal bar chart of the 20 most frequent values of `Intencion`.
top_intents = df["Intencion"].value_counts().nlargest(20)
top_intents.plot.barh()

In [None]:
# Horizontal bar chart of the 20 least frequent values of `Intencion`.
rare_intents = df["Intencion"].value_counts().nsmallest(20)
rare_intents.plot.barh()

In [None]:
# Keep only the categories with more than 2 samples: those with just 1 or 2
# labelled rows are dropped so that training on 2 splits is possible.
intent_counts = df["Intencion"].value_counts()
categories = intent_counts[intent_counts > 2].index

In [None]:
# Restrict the frame to the retained categories, then separate the question
# text (features) from the intent label (target).
keep_mask = df["Intencion"].isin(categories)
df = df[keep_mask]
X = df["Pregunta"]
y = df["Intencion"]

In [None]:
# Hold out 30% of the rows, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Text featurisation: TF-IDF over unigrams and bigrams with sublinear tf scaling.
# max_features and min_df stay at their defaults (commented-out candidates in
# an earlier revision: max_features=10000, min_df=0.005).
preprocess = Pipeline(steps=[
    (
        'tfidf',
        TfidfVectorizer(stop_words=STOPWORDS, ngram_range=(1, 2), sublinear_tf=True),
    ),
])

In [None]:
# Fit the preprocessing pipeline on the training split only.
preprocess.fit(X_train, y_train)

In [None]:
# Shape of the TF-IDF matrix for the training split.
# The expression is left bare (no trailing comma) so the cell displays the
# shape itself rather than a one-element tuple wrapping it.
# To inspect the vocabulary: preprocess.steps[0][1].get_feature_names()
preprocess.transform(X_train).shape

In [None]:
# # Create regularization penalty space
# penalty = ['l2']

# # Create a fixed grid of candidate values for the regularization strength C (0.0 to 4.5 in steps of 0.5)
# C = [i/10 for i in range(0, 50, 5)]

# # Create hyperparameter options
# hyperparameters = dict(C=C, penalty=penalty)

# logistic = LogisticRegression(C=3.73, n_jobs=4, class_weight='balanced')

# clf = GridSearchCV(logistic, hyperparameters, cv=2, verbose=3, n_jobs=2, scoring='balanced_accuracy')

# fit_model(clf, preprocess, X_train, y_train)

# print('Best Penalty:', clf.best_estimator_.get_params()['penalty'])
# print('Best C:', clf.best_estimator_.get_params()['C'])

In [None]:
# get_metrics(clf.best_estimator_, preprocess, X_train, y_train, X_test, y_test)

In [None]:
# Hyper-parameters passed to the final LogisticRegression. They are hard-coded
# here instead of being read from clf.best_estimator_.get_params().
best_params = dict(C=3.73, n_jobs=4, class_weight='balanced')

In [None]:
# Final model: the preprocessing pipeline followed by logistic regression,
# fitted on every row of the filtered frame (not only the training split).
model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(**best_params)),
])

X = df["Pregunta"]
y = df["Intencion"]

model.fit(X, y)


In [None]:
# Predict an intent for every test question and pair it with the row id.
test = pd.read_csv(TEST_DATA, sep=',')
predicted_intents = model.predict(test["Pregunta"])
preds = pd.DataFrame(list(zip(test["id"], predicted_intents)))
# Strip the 'Cat_' prefix from the predicted labels (second column).
preds[1] = preds[1].str.replace('Cat_', '')

In [None]:
from datetime import datetime

# Write the predictions as a CSV without header or index; the file name
# carries a minute-resolution timestamp.
results_dir = os.path.join(DATA_DIR, 'results')
# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(results_dir, exist_ok=True)

now = datetime.now().strftime("%Y%m%d_%H%M")
preds.to_csv(os.path.join(results_dir, f'answers_{now}.csv'), header=False, index=False)