In [3]:
import pandas as pd 
import numpy as np 
import time
import tqdm
import re

import bs4 
import glob
import json

# Use the full option key. An abbreviated key is matched as a regular
# expression against all option names and raises OptionError when more than
# one option matches (e.g. when both display.max_columns and
# styler.render.max_columns exist).
pd.set_option("display.max_columns", 131)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Read the data

In [4]:
# Load the labelled articles, reading author_url as str, then discard rows
# that lack either the label or the author URL.
df = pd.read_csv(
    'artigos_medium_classificados.csv',
    sep=',',
    dtype={'author_url': str},
)
df = df.dropna(subset=['classificacao', 'author_url'])
df.shape

(1155, 8)

In [5]:
# Sum of the label column.
df['classificacao'].sum()

155.0

# Pre-processing

In [6]:
# Parse the day/month/year strings in `date` into datetimes.
df['date'] = pd.to_datetime(
    df['date'],
    format='%d/%m/%Y',
)

In [7]:
# Rewrite any 'K'/'k' in the clap values as 'e3' so they parse as floats
# (presumably counts written like "1.2K" -- confirm against the CSV).
# Casting to str first keeps this cell re-runnable: after the first run the
# column is already float and no longer has a `.str` accessor. Missing values
# become the string 'nan', which float() parses back to NaN.
df['claps'] = (
    df['claps']
    .astype(str)
    .str.replace(r'[Kk]', 'e3', regex=True)
    .astype(float)
)

In [8]:
# Delete non-breaking spaces (\xa0) from the titles.
df['title'] = df['title'].str.replace(r'[\xa0]', '', regex=True)

In [9]:
# Replace every non-word character in the titles with a single space.
df['title'] = df['title'].str.replace(r'\W', ' ', regex=True)

In [10]:
# Show the cleaned title stored under index label 0.
df['title'][0]

'Implementing an Autoencoder in PyTorch'

In [11]:
# The author handle is the segment that follows the first '/@' in the URL.
df['author_name'] = df['author_url'].apply(
    lambda url: url.split('/@')[1]
)

In [12]:
# Model inputs: every column except the label and the raw author URL.
features = df.drop(['classificacao', 'author_url'], axis=1)
# Target: an independent copy of the label column.
y = df.classificacao.copy()


def tokenize(text):
    """Clean a Portuguese text and return its stemmed tokens.

    The text is passed through ``unidecode``, every character that is not an
    ASCII letter or digit is replaced by a space, the result is split with
    ``word_tokenize(..., language='portuguese')``, tokens found in
    ``stopwords.words("portuguese")`` are dropped, and each remaining token is
    stemmed with ``RSLPStemmer`` and then lower-cased and stripped.

    NOTE(review): ``unidecode``, ``word_tokenize``, ``stopwords`` and
    ``RSLPStemmer`` are not imported in the visible part of this notebook
    (presumably from ``unidecode`` and ``nltk`` -- confirm); they must be in
    scope before this function is called.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    # `A-Z`, not `A-z`: the latter range also covers the punctuation that sits
    # between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), which would survive cleanup.
    text = re.sub(r"[^a-zA-Z0-9]", " ", unidecode(text))

    # Build the stopword set once per call instead of reloading the list for
    # every token; membership semantics are unchanged.
    stop_words = set(stopwords.words("portuguese"))
    stemmer = RSLPStemmer()

    return [
        stemmer.stem(tok).lower().strip()
        for tok in word_tokenize(text, language='portuguese')
        if tok not in stop_words
    ]

In [13]:

from sklearn.model_selection import train_test_split

# Hold out half of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    y,
    test_size=0.5,
    random_state=42,
)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

title_train = X_train['title']
title_test = X_test['title']

# Unigrams and bigrams that occur in at least two training titles. The
# vocabulary is learned on the training titles only and reused for the test
# titles.
title_vec = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_test)

In [15]:
from scipy.sparse import hstack, vstack

# Put the two numeric columns in front of the TF-IDF matrix of each split.
X_train_title = hstack([
    X_train[['claps', 'responses']],
    title_bow_train,
])
X_test_title = hstack([
    X_test[['claps', 'responses']],
    title_bow_val,
])
X_train_title.shape, X_test_title.shape, y_train.shape, y_test.shape

((577, 728), (578, 728), (577,), (578,))

## Random Forest

In [16]:
# Random forest with 1000 trees and balanced class weights, fitted on the
# stacked numeric + TF-IDF training matrix.
mdl_rf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=1,
    class_weight="balanced",
    random_state=0,
    n_jobs=6,
)
mdl_rf.fit(X_train_title, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=6, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
# Keep column 1 of predict_proba as the random-forest score for the test rows.
rf_proba = mdl_rf.predict_proba(X_test_title)
p_rf = rf_proba[:, 1]
del rf_proba

In [18]:
# Random forest on the held-out half: (average precision, ROC AUC).
(
    average_precision_score(y_test, p_rf),
    roc_auc_score(y_test, p_rf),
)

(0.2493791837273932, 0.6974339484168589)

## LGBM

In [19]:
# Hyper-parameter vector, in order: learning rate, max depth, min child
# samples, subsample, colsample_bytree, n_estimators, TF-IDF min_df and the
# upper n-gram bound.
params = [0.08265121231498246, 7, 1, 0.7251351011494334, 0.07547006552546137, 839, 2, 2]

# The first six entries configure the LightGBM model.
lr, max_depth, min_child_samples, subsample, colsample_bytree, n_estimators = params[:6]

# The last two entries configure the title vectorizer.
min_df = params[6]
ngram_range = (1, params[7])


In [20]:
# Refit the title vectorizer with the tuned min_df / ngram_range; this rebinds
# title_vec and both TF-IDF matrices.
title_vec = TfidfVectorizer(
    ngram_range=ngram_range,
    min_df=min_df,
)
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_test)


In [21]:
# Rebuild the stacked matrices from the refitted TF-IDF features.
X_train_title = hstack([
    X_train[['claps', 'responses']],
    title_bow_train,
])
X_test_title = hstack([
    X_test[['claps', 'responses']],
    title_bow_val,
])
X_train_title.shape, X_test_title.shape, y_train.shape, y_test.shape

((577, 728), (578, 728), (577,), (578,))

In [22]:

# LightGBM classifier built from the tuned hyper-parameters; num_leaves is
# set to 2 ** max_depth.
mdl_lgbm = LGBMClassifier(
    learning_rate=lr,
    num_leaves=2 ** max_depth,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    bagging_freq=1,
    n_estimators=n_estimators,
    random_state=0,
    class_weight="balanced",
    n_jobs=6,
)
mdl_lgbm.fit(X_train_title, y_train)

# Column 1 of predict_proba is kept as the LightGBM score for the test rows.
p_lgbm = mdl_lgbm.predict_proba(X_test_title)[:, 1]



In [23]:
# LightGBM on the held-out half: (average precision, ROC AUC).
(
    average_precision_score(y_test, p_lgbm),
    roc_auc_score(y_test, p_lgbm),
)

(0.18187582043339956, 0.5788163136925981)

## Logistic regression

In [24]:
from sklearn.pipeline import make_pipeline

In [25]:
# Work on CSR copies of the stacked feature matrices.
Xtrain_wtitle2 = csr_matrix(X_train_title.copy())
Xval_wtitle2 = csr_matrix(X_test_title.copy())

# Scaling is done inside the pipeline with MaxAbsScaler. This replaces the
# earlier manual experiments that applied StandardScaler / MaxAbsScaler to the
# first two columns or to the whole matrix outside the model.
lr_pipeline = make_pipeline(
    MaxAbsScaler(),
    LogisticRegression(C=0.5, penalty='l2', n_jobs=6, random_state=0),
)
lr_pipeline.fit(Xtrain_wtitle2, y_train)

Pipeline(memory=None,
         steps=[('maxabsscaler', MaxAbsScaler(copy=True)),
                ('logisticregression',
                 LogisticRegression(C=0.5, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=6, penalty='l2',
                                    random_state=0, solver='lbfgs', tol=0.0001,
                                    verbose=0, warm_start=False))],
         verbose=False)

In [26]:
# Keep column 1 of predict_proba as the logistic-regression score.
lr_proba = lr_pipeline.predict_proba(Xval_wtitle2)
p_lr = lr_proba[:, 1]
del lr_proba

In [27]:
# Logistic regression on the held-out half: (average precision, ROC AUC).
(
    average_precision_score(y_test, p_lr),
    roc_auc_score(y_test, p_lr),
)

(0.2621203517936057, 0.6937513105472846)

## Ensemble

In [28]:
# Unweighted average of the three models' scores, then the same two metrics.
p = (p_lr + p_rf + p_lgbm) / 3
(
    average_precision_score(y_test, p),
    roc_auc_score(y_test, p),
)

(0.23829788441295843, 0.68536380792619)

In [29]:
# Pairwise correlation between the three models' test-set scores.
pd.DataFrame(
    {
        "LR": p_lr,
        "RF": p_rf,
        "LGBM": p_lgbm,
    }
).corr()

Unnamed: 0,LR,RF,LGBM
LR,1.0,0.803908,0.482546
RF,0.803908,1.0,0.575971
LGBM,0.482546,0.575971,1.0


In [30]:
# Equal-weight blend of the random-forest and LightGBM scores only.
p = 0.5 * p_rf + 0.5 * p_lgbm
(
    average_precision_score(y_test, p),
    roc_auc_score(y_test, p),
)

(0.22637550181919064, 0.6743552107360034)

## Save models

In [31]:
import joblib as jb

In [34]:
# Persist the three fitted models and the title vectorizer under ../prod.
# title_vec here is the vectorizer refitted in the LGBM section, which rebinds
# the name first assigned in the TF-IDF cell.
# The last call is the cell's final expression, so its return value (the list
# of written file names) is what the notebook displays.
jb.dump(mdl_lgbm, "../prod/lgbm_20210302.pkl.z")
jb.dump(mdl_rf, "../prod/random_forest_20210302.pkl.z")
jb.dump(lr_pipeline, "../prod/logistic_reg_20210302.pkl.z")
jb.dump(title_vec, "../prod/title_vectorizer_20210302.pkl.z")

['../prod/title_vectorizer_20210302.pkl.z']