In [1]:
##Libs
import pandas as pd

#Transformers e Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelBinarizer, StandardScaler

#Train/test split
from sklearn.model_selection import train_test_split 

#Modelo
from sklearn.ensemble import RandomForestClassifier

#Métricas
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
#Carregando datasets
df_train = pd.read_csv('../input/projeto4_telecom_treino.csv', index_col=0)
df_test = pd.read_csv('../input/projeto4_telecom_teste.csv', index_col=0)

In [3]:
#Computar custo total e minuto total
class ComputeTotal(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        return pd.DataFrame(data = X.total_day_charge +
                            X.total_eve_charge +
                            X.total_night_charge +
                            X.total_intl_charge, columns = ['total_charge'])
    
    def fit_transform(self, X, y = None):
        return self.fit(X, y).transform(X, y)
    
#Codificar campos Yes/No para 0 e 1
class YesNoEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.encoder = LabelBinarizer(neg_label=0, pos_label=1)
    
    def fit(self, X, y = None):
        self.encoder.fit(X)
        return self
    
    def transform(self, X, y = None):
        return self.encoder.transform(X)
    
    def fit_transform(self, X, y = None):
        return self.fit(X, y).transform(X, y)

In [4]:
#Transformers
prep_ct = ColumnTransformer(transformers=[('Sum charges', ComputeTotal(), ['total_day_charge', 'total_eve_charge', 'total_night_charge', 'total_intl_charge']),
                                          ('Encode intl plan', YesNoEncoder(), ['international_plan']),
                                          ('Encode vm plan', YesNoEncoder(), ['voice_mail_plan']),
                                          ('Scale num', StandardScaler(), ['account_length']),
                                          ('Service calls', 'passthrough', ['number_customer_service_calls'])],
                           remainder='drop')

#Binarize Y
def binarize_churn(y):
    return LabelBinarizer(neg_label = 0, pos_label = 1).fit_transform(y).ravel()

#Scores
def score_model(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    print('Accuracy: %.5f\nF1 Score: %.5f' % (acc, f1))
    print(cm)

#Pipeline
def fit_pipe_model(model, dataFrame):
    pipe = Pipeline(steps=[['Prep_X', prep_ct],
                           ['Model', model]])
    
    #Data split
    X_train, X_valid, y_train, y_valid = train_test_split(dataFrame.drop('churn', axis = 1),
                                                          binarize_churn(dataFrame.churn), train_size = 0.7)
    
    #Train
    pipe.fit(X_train, y_train)
    
    #Validation
    print('** Train validation scores **')
    pipe_preds = pipe.predict(X_valid)
    score_model(y_valid, pipe_preds)
    
    return(pipe)

In [6]:
#Model
rf_model = fit_pipe_model(RandomForestClassifier(n_estimators=100, random_state = 10), df_train)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


** Train validation scores **
Accuracy: 0.94700
F1 Score: 0.79377
[[845   9]
 [ 44 102]]


  res = transformer.transform(X)


In [7]:
#Test scores
rf_preds = rf_model.predict(df_test)
score_model(binarize_churn(df_test.churn), rf_preds)

Accuracy: 0.94961
F1 Score: 0.78351
[[1431   12]
 [  72  152]]


  res = transformer.transform(X)
