### Montando um dataframe de flow_recipients com balanceamento de classe

**Targets**  
1 - connected + client  
2 - connected  
3 - converting  
4 - lost + finished e parou no meio do fluxo  
5 - lost + finished e passou por todo o fluxo


**Baseline Acurácia**  
rf: 0.49  
xgb: 0.41

### Recuperando Tasks Normais
**Features:**  
 c_email	
 c_contact_id	
 t_company_id	
 t_done_at	
 t_service	
 fr_status	
 fr_contact_id	
 fr_flow_id	
 fa_steps	
 fr_last_flow_action_taken	
 target

In [None]:
import os
import utils
import pandas as pd

def get_dataframe(limit=50000):
    """Build the class-balanced dataframe of regular (non-ozzy) flow tasks.

    One query per target class is run against the database addressed by the
    ``REEVAPI_URL`` environment variable and the results are stacked. Every
    query carries its own ``limit`` clause, so each class contributes at most
    ``limit`` rows.

    Args:
        limit: Maximum number of rows fetched per target class. Must be
            convertible to ``int``.

    Returns:
        DataFrame with the selected task / flow-recipient columns plus a
        ``target`` column holding the class label. The row index is whatever
        ``pd.concat`` produces from the per-class frames (not reset).

    Raises:
        ValueError, TypeError: If ``limit`` cannot be converted to an integer.
        KeyError: If ``REEVAPI_URL`` is not set.
    """
    # int() validates the value and keeps arbitrary text out of the SQL string.
    limit = int(limit)

    # (target label, flow-recipient filter, optional HAVING clause)
    target_specs = [
        # connected + client
        ('connected_client',
         "(fr.status = 'finished' or fr.status = 'connected') "
         "and fr.contact_final_stage = 'client'",
         ''),
        # connected
        ('connected',
         "fr.status = 'connected' and fr.contact_final_stage <> 'client'",
         ''),
        # converting
        ('converting',
         "fr.status = 'converting'",
         ''),
        # lost + finished, stopped in the middle of the flow
        ('lost',
         "fr.status = 'finished' and fr.contact_final_stage = 'lost'",
         'having count(fa.flow_id) <> (fr.last_flow_action_taken + 1)'),
        # lost + finished, went through the whole flow
        ('lost_whole_flow',
         "fr.status = 'finished' and fr.contact_final_stage = 'lost'",
         'having count(fa.flow_id) = (fr.last_flow_action_taken + 1)'),
    ]

    # Shared query; only the label, the status filter and the HAVING clause
    # change between target classes.
    template = '''
            select
                c.email as c_email,
                c.id as c_contact_id,
                t.company_id as t_company_id,
                t.done_at as t_done_at,
                t.service as t_service,
                fr.status as fr_status,
                fr.contact_id as fr_contact_id,
                fr.flow_id as fr_flow_id,
                count(fa.flow_id) as fa_steps,
                (fr.last_flow_action_taken + 1) as fr_last_flow_action_taken,
                '{target}' as target
            from
                flow_recipients fr join tasks t on fr.contact_id = t.referenceable_id
                join flow_actions fa on fr.flow_id = fa.flow_id
                join contacts c on fr.contact_id = c.id
            where
                fr.last_flow_action_taken is not null
                and ({status_filter})
                and t.referenceable_type = 'Contact'
                and t.done_at is not null
                and t.type = 'ManualTask'
                and t.created_by_type = 'FlowAction'
            group by
                c.id,
                t.company_id,
                fr.contact_id,
                c.email,
                fa.flow_id,
                t.type,
                t.done_at,
                t.service,
                fr.status,
                fr.flow_id,
                fr.last_flow_action_taken
            {having}
            limit
                {limit}
        '''

    connection = os.environ['REEVAPI_URL']

    dfs = []
    for target, status_filter, having in target_specs:
        query_to_execute = template.format(
            target=target,
            status_filter=status_filter,
            having=having,
            limit=limit,
        )
        dfs.append(pd.read_sql(query_to_execute, connection))

    return pd.concat(dfs)

In [None]:
df_tasks = get_dataframe()

In [None]:
# Flag these rows as regular (non-ozzy) tasks
df_tasks['ozzy'] = 0
# Put the columns in alphabetical order
df_tasks = df_tasks.reindex(sorted(df_tasks.columns), axis=1)
df_tasks.head(3)

### Recuperando Ozzy Tasks Tipo 1 (fluxo associado)   
**Features:**  
 c_email	
 c_contact_id	
 t_company_id	
 t_done_at	
 t_service	
 fr_status	
 fr_contact_id	
 fr_flow_id	
 fa_steps	
 fr_last_flow_action_taken	
 target

In [None]:
# Fetch every ozzy task already executed (ManualTask) together with its associated events
import os
import utils
import pandas as pd

query = '''select 
            e.eventable_id as contact_id,
            e.task_id,
            e.eventable_type,
            t.done_at
           from events e join tasks t on e.task_id = t.id 
           where (t.metadata->'ozzy')::boolean is True
            and t.type = 'ManualTask'
           '''
df_tmp = pd.read_sql(query, os.environ['REEVAPI_URL'])
# errors='coerce' turns unparseable done_at values into NaT instead of raising
df_tmp['done_at'] = pd.to_datetime(df_tmp['done_at'], errors='coerce')


In [None]:
# Unique ids of the contacts that had an ozzy task associated with them.
# Iterating the columns directly avoids label-based df_tmp.col[i] lookups,
# which only work while df_tmp keeps its default 0..n-1 index.
contacts = {int(contact_id) for contact_id in df_tmp.contact_id}

# One (contact id, ozzy task execution time, task id) tuple per ozzy task event
contacts_dates_tasks = [
    (int(contact_id), done_at, task_id)
    for contact_id, done_at, task_id in zip(df_tmp.contact_id, df_tmp.done_at, df_tmp.task_id)
]


In [None]:
# Comma-separated id list for the IN clause. NULL keeps the SQL valid (and
# matching no rows) when there are no contacts at all.
contact_ids_sql = ', '.join(str(contact_id) for contact_id in sorted(contacts)) or 'NULL'

query = '''
        select
            c.email as c_email,
            c.id as c_contact_id,
            fr.status as fr_status,
            fr.contact_id as fr_contact_id,
            fr.created_at as fr_created_at,
            fr.status_updated_at as fr_status_updated_at,
            fr.flow_id as fr_flow_id,
            fr.contact_final_stage as fr_contact_final_stage,
            count(fa.flow_id) as fa_steps,
            (fr.last_flow_action_taken + 1) as fr_last_flow_action_taken
        from
            flow_recipients fr
        join
            flow_actions fa on fr.flow_id = fa.flow_id
        join
            contacts c on fr.contact_id = c.id
        where
            fr.contact_id in ({})
        group by
            c.email,
            c.id,
            fr.flow_id,
            fr.contact_id,
            fr.status,
            fr.created_at,
            fr.status_updated_at,
            fr.contact_final_stage,
            fr.last_flow_action_taken
           '''.format(contact_ids_sql)

df_tmp = pd.read_sql(query, os.environ['REEVAPI_URL'])
df_tmp['fr_created_at'] = pd.to_datetime(df_tmp['fr_created_at'], errors='coerce')
df_tmp['fr_status_updated_at'] = pd.to_datetime(df_tmp['fr_status_updated_at'], errors='coerce')

# Replace NaNs (recipients with no action taken yet) by 0 and cast the float column to int
df_tmp.fr_last_flow_action_taken = df_tmp.fr_last_flow_action_taken.fillna(0)
df_tmp.fr_last_flow_action_taken = df_tmp.fr_last_flow_action_taken.astype(int)

# Placeholder column; filled with the ozzy task id when rows are matched to tasks
df_tmp['t_id'] = ''

In [None]:
# Keep, for every ozzy task, the flow recipients of its contact that were
# created before and last updated after the task was executed, tagging each
# matched row with the id of that task
import datetime

dfs = []

for contact_id, done_at, task_id in contacts_dates_tasks:
    in_window = (
        (df_tmp['c_contact_id'] == contact_id)
        & (df_tmp['fr_created_at'] < done_at)
        & (df_tmp['fr_status_updated_at'] > done_at)
    )
    # assign() returns a new frame, so t_id is never written onto a slice of df_tmp
    dfs.append(df_tmp[in_window].assign(t_id=task_id))

# pd.concat raises on an empty list; fall back to an empty frame with the same columns
df_ozzies = pd.concat(dfs) if dfs else df_tmp.iloc[:0].copy()
df_ozzies = df_ozzies.reset_index(drop=True)


In [None]:
df_ozzies['ozzy'] = 1

In [None]:
# Adicionando a feature Target
def get_target(fr_status, fr_contact_final_stage, fa_steps, fr_last_flow_action_taken):
    """Return the target label of one flow recipient, or None when no target applies.

    Args:
        fr_status: Status of the flow recipient.
        fr_contact_final_stage: Final stage reached by the contact.
        fa_steps: Number of actions in the flow.
        fr_last_flow_action_taken: Number of flow actions the recipient went through.

    Returns:
        'connected_client', 'connected', 'converting', 'lost',
        'lost_whole_flow', or None.
    """
    # NOTE(review): get_dataframe also labels status 'finished' + stage 'client'
    # as connected_client; here only 'connected' qualifies - confirm which is intended.
    if fr_status == 'connected':
        # connected + client / connected
        if fr_contact_final_stage == 'client':
            return 'connected_client'
        return 'connected'

    # converting
    if fr_status == 'converting':
        return 'converting'

    # Both "lost" targets are defined as lost + finished, so the status is
    # required here as well (it is in the SQL used for the regular tasks).
    if fr_status == 'finished' and fr_contact_final_stage == 'lost':
        if fa_steps == fr_last_flow_action_taken:
            # went through the whole flow
            return 'lost_whole_flow'
        # stopped in the middle of the flow
        return 'lost'

    return None

    
df_ozzies['target']= df_ozzies.apply(lambda x: get_target(x['fr_status'],x['fr_contact_final_stage'],x['fa_steps'],x['fr_last_flow_action_taken']),axis=1)

In [None]:
df_ozzies.head()

In [None]:
# Note that many flow_recipients do not fit any of the targets;
# keep only the tasks that could be associated with a target
has_target = df_ozzies.target.notnull()
df_ozzies = df_ozzies[has_target].reset_index(drop=True)

In [None]:
# Ids of the ozzy tasks that are still present in df_ozzies
set_task_ids = {int(task_id) for task_id in df_ozzies.t_id}

def get_ozzy_extra_features(df):
    """Attach company, execution time and service of each ozzy task to ``df``.

    The task ids are taken from ``df['t_id']`` itself, so the function no
    longer depends on a notebook-level variable being up to date.

    Args:
        df: DataFrame with a ``t_id`` column holding task ids.

    Returns:
        ``df`` inner-merged on ``t_id`` with the columns ``t_company_id``,
        ``t_done_at`` and ``t_service`` read from the ``tasks`` table.
    """
    task_ids = sorted({int(task_id) for task_id in df['t_id']})
    # NULL keeps the IN clause valid (and matching nothing) when df has no rows
    task_ids_sql = ', '.join(str(task_id) for task_id in task_ids) or 'NULL'

    query = '''
        select
            t.id as t_id,
            t.company_id as t_company_id,
            t.done_at as t_done_at,
            t.service as t_service
        from
            tasks t
        where
            t.id in ({})
           '''.format(task_ids_sql)

    df_tmp = pd.read_sql(query, os.environ['REEVAPI_URL'])

    return pd.merge(df, df_tmp, on='t_id')

df_ozzies = get_ozzy_extra_features(df_ozzies)

In [None]:
# Drop the helper columns that df_tasks does not have, then sort the columns
# alphabetically (the same ordering applied to df_tasks)
df_ozzies.drop(['fr_contact_final_stage', 'fr_created_at', 'fr_status_updated_at', 't_id'], axis=1, inplace=True)
df_ozzies = df_ozzies.reindex(sorted(df_ozzies.columns), axis=1)

### Recuperando Ozzy Tasks Tipo 2 (sem fluxo associado)   
**Features:**  
 c_email	
 c_contact_id	
 t_company_id	
 t_done_at	
 t_service	
 fr_status	
 fr_contact_id	
 fr_flow_id	
 fa_steps	
 fr_last_flow_action_taken	
 target

In [None]:
# Recupera toda as ozzy tasks já executadas (ManualTask) e seus eventos associados
import os
import utils
import pandas as pd

# NOTE: the read_sql call for this query is commented out below, and `query`
# is reassigned further down in this cell, so this statement is never executed.
query = '''select 
            e.eventable_id as contact_id,
            e.task_id,
            e.eventable_type,
            t.done_at
           from events e join tasks t on e.task_id = t.id 
           where (t.metadata->'ozzy')::boolean is True
            and t.type = 'ManualTask'
           '''
#df_tmp = pd.read_sql(query, os.environ['REEVAPI_URL_FORK'])
#df_tmp['done_at'] = pd.to_datetime(df_tmp['done_at'], errors='coerce')

# Criando um conjunto apenas com os ids de contatos que tiveram uma ozzy task associada
#contacts = set()

#for i in range(0, df_tmp.shape[0]):
#    contacts.add(int(df_tmp.contact_id[i]))

# Criando uma lista de contatos e datas de execução da ozzy task associada
#contacts_dates_tasks = []

#for i in range(0, df_tmp.shape[0]):
#    contacts_dates_tasks.append((int(df_tmp.contact_id[i]), df_tmp.done_at[i], df_tmp.task_id[i]))

    
# Flow recipients (with flow size and last action taken) of the contacts in `contacts`.
# NOTE: the '**{' / '}**' clean-up and the read_sql call that follow this
# statement are commented out, so the string is built but not executed in this cell.
query = '''
        select 
            c.email as c_email,
            c.id as c_contact_id,
            fr.status as fr_status,
            fr.contact_id as fr_contact_id,
            fr.created_at as fr_created_at,
            fr.status_updated_at as fr_status_updated_at,
            fr.flow_id as fr_flow_id,
            fr.contact_final_stage as fr_contact_final_stage,
            count(fa.flow_id) as fa_steps,
            (fr.last_flow_action_taken + 1) as fr_last_flow_action_taken
        from 
            flow_recipients fr
        join 
            flow_actions fa on fr.flow_id = fa.flow_id
        join
            contacts c on fr.contact_id = c.id
        where
            fr.contact_id in (**{}**)
        group by
            c.email,
            c.id,
            fr.flow_id,
            fr.contact_id,
            fr.status,  
            fr.created_at,
            fr.status_updated_at,
            fr.contact_final_stage,
            fr.last_flow_action_taken
           '''.format(str(contacts))

#query = query.replace('**{', '').replace('}**', '')
#df_tmp = pd.read_sql(query, os.environ['REEVAPI_URL_FORK'])
#df_tmp['fr_created_at'] = pd.to_datetime(df_tmp['fr_created_at'], errors='coerce')
#df_tmp['fr_status_updated_at'] = pd.to_datetime(df_tmp['fr_status_updated_at'], errors='coerce')

# removendo NaNs e tratando valores float
#df_tmp.fr_last_flow_action_taken = df_tmp.fr_last_flow_action_taken.fillna(0)
#df_tmp.fr_last_flow_action_taken = df_tmp.fr_last_flow_action_taken.astype(int)

# recupera o id de todas as tasks que não possuem um flow_recipient associado
#set_task_ids2 = set()

#for i in contacts_dates_tasks:
#    temp = df_tmp[(df_tmp['c_contact_id'] == i[0]) & (df_tmp['fr_created_at'] < i[1]) & (df_tmp['fr_status_updated_at'] > i[1])]
    
#    if temp.shape[0] == 0:
#        set_task_ids2.add(i[2])

In [None]:
# Same flow-recipient query as in the previous cells, rebuilt from `contacts`.
# NOTE(review): no visible statement after this one executes `query` - confirm
# whether this cell is still needed.
query = '''
        select 
            c.email as c_email,
            c.id as c_contact_id,
            fr.status as fr_status,
            fr.contact_id as fr_contact_id,
            fr.created_at as fr_created_at,
            fr.status_updated_at as fr_status_updated_at,
            fr.flow_id as fr_flow_id,
            fr.contact_final_stage as fr_contact_final_stage,
            count(fa.flow_id) as fa_steps,
            (fr.last_flow_action_taken + 1) as fr_last_flow_action_taken
        from 
            flow_recipients fr
        join 
            flow_actions fa on fr.flow_id = fa.flow_id
        join
            contacts c on fr.contact_id = c.id
        where
            fr.contact_id in (**{}**)
        group by
            c.email,
            c.id,
            fr.flow_id,
            fr.contact_id,
            fr.status,  
            fr.created_at,
            fr.status_updated_at,
            fr.contact_final_stage,
            fr.last_flow_action_taken
           '''.format(str(contacts))

### Juntando os dataframes em um único df

In [None]:
# Stack regular and ozzy tasks into a single frame with a fresh 0..n-1 index
df = pd.concat([df_tasks, df_ozzies], ignore_index=True)
df.head(3)

### Adicionando Features Extras

In [None]:
def tx_conversion(data):
    """Add the conversion rate of each row's flow as the ``tx_conversion`` column.

    Args:
        data: DataFrame with an ``fr_flow_id`` column.

    Returns:
        ``data`` inner-merged with the ``flows`` table on ``fr_flow_id``; the
        result also carries the raw ``recipients_statistics`` column.
    """
    def extract_value(x):
        # recipients_statistics is read as a nested mapping:
        # by_stage -> conversion -> value
        return x['by_stage']['conversion']['value']

    # Query that fetches the recipient statistics of every flow
    query = '''
            select
                id as fr_flow_id, recipients_statistics
            from
                flows
        '''

    df2 = pd.read_sql(query, os.environ['REEVAPI_URL'])

    df = pd.merge(data, df2, on='fr_flow_id')

    # Only one column is needed, so map over it instead of applying row by row;
    # this also yields a Series when the merged frame is empty.
    df['tx_conversion'] = df['recipients_statistics'].map(extract_value)

    return df
    
df = tx_conversion(df)

In [None]:
def previously_contact_any_company(data):
    """Add, per row, how many connected/converting flow recipients share its e-mail.

    The count is taken over all flow recipients returned by the query,
    regardless of company, and stored in ``previously_contact_any_company``.

    Args:
        data: DataFrame with a ``c_email`` column. It is modified in place.

    Returns:
        The same ``data`` object with the new column added.
    """
    from collections import Counter

    query = '''
            select
                email
            from
                contacts c join flow_recipients fr on fr.contact_id = c.id
            where
                fr.last_flow_action_taken is not null
                and (fr.status = 'connected' or fr.status = 'converting')
            '''
    df2 = pd.read_sql(query, os.environ['REEVAPI_URL'])

    # Occurrences of each e-mail; a Counter returns 0 for e-mails it never saw
    emails = Counter(df2['email'])

    data['previously_contact_any_company'] = data['c_email'].map(lambda email: emails[email])

    return data
    
df = previously_contact_any_company(df)

In [None]:
def previously_contact_my_company(data):
    """Add, per row, how many connected/converting flow recipients share its e-mail and company.

    The count is stored in ``previously_contact_my_company``.

    Args:
        data: DataFrame with ``c_email`` and ``t_company_id`` columns. It is
            modified in place.

    Returns:
        The same ``data`` object with the new column added.
    """
    from collections import Counter

    query = '''
            select
                email, company_id
            from
                contacts c join flow_recipients fr on fr.contact_id = c.id
            where
                fr.last_flow_action_taken is not null
                and (fr.status = 'connected' or fr.status = 'converting')
            '''
    df2 = pd.read_sql(query, os.environ['REEVAPI_URL'])

    # (email, company_id) tuples instead of "email_company" strings: company ids
    # compare numerically (1 == 1.0, whereas str() gave '1' vs '1.0') and a
    # NULL e-mail no longer breaks the key construction.
    # NOTE: rows whose company id is NaN never match each other with tuple keys.
    opportunities = Counter(zip(df2['email'], df2['company_id']))

    data['previously_contact_my_company'] = [
        opportunities[opportunity]
        for opportunity in zip(data['c_email'], data['t_company_id'])
    ]

    return data
    
df = previously_contact_my_company(df)

In [None]:
def is_inbound(data):
    """Add the ``is_inbound`` column naming the inbound source of each contact.

    The source is detected by looking for a known integration name in the
    text form of the contact's ``metadata``.

    Args:
        data: DataFrame with a ``c_contact_id`` column.

    Returns:
        ``data`` inner-merged with the per-contact ``is_inbound`` value, which
        is 'rd_station', 'sharp_spring', 'hubspot' or 'not_inbound'.
    """
    query = '''
            select
                 id as c_contact_id,
                 metadata
            from
                contacts
            '''
    df2 = pd.read_sql(query, os.environ['REEVAPI_URL'])

    def process_inbound(x):
        # Convert once; the first source found wins, in this order
        text = str(x)
        for source in ('rd_station', 'sharp_spring', 'hubspot'):
            if source in text:
                return source
        return 'not_inbound'

    # Only the metadata column is needed, so map over it instead of applying row by row
    df2['is_inbound'] = df2['metadata'].map(process_inbound)

    df2 = df2.drop(['metadata'], axis=1)

    return pd.merge(data, df2, on='c_contact_id')
    
df = is_inbound(df)

def is_inbound2(data):
    """Add the boolean ``is_inbound2`` column: does the contact come from an inbound source?

    A contact counts as inbound when the text form of its ``metadata``
    mentions 'rd_station', 'sharp_spring' or 'hubspot'.

    Args:
        data: DataFrame with a ``c_contact_id`` column.

    Returns:
        ``data`` inner-merged with the per-contact ``is_inbound2`` flag.
    """
    query = '''
            select
                 id as c_contact_id,
                 metadata
            from
                contacts
            '''
    df2 = pd.read_sql(query, os.environ['REEVAPI_URL'])

    def process_inbound(x):
        # Convert once, then look for any of the known integration names
        text = str(x)
        return any(source in text for source in ('rd_station', 'sharp_spring', 'hubspot'))

    # Only the metadata column is needed, so map over it instead of applying row by row
    df2['is_inbound2'] = df2['metadata'].map(process_inbound)

    df2 = df2.drop(['metadata'], axis=1)

    return pd.merge(data, df2, on='c_contact_id')
    
df = is_inbound2(df)

### Pré-processamento

In [None]:
# drop columns
df = df.drop(['c_email', 'c_contact_id', 't_company_id', 'fr_contact_id', 'fr_status', 'fr_flow_id', 't_done_at', 'recipients_statistics'], axis=1)

In [None]:
from sklearn import preprocessing

# Label encoder: replace each categorical column by integer codes.
# NOTE: the same encoder instance is refitted for every column, so after this
# cell `le` only holds the classes of the last column fitted (is_inbound2).
le = preprocessing.LabelEncoder()
df['t_service'] = le.fit_transform(df['t_service'])
df['is_inbound'] = le.fit_transform(df['is_inbound'])
df['is_inbound2'] = le.fit_transform(df['is_inbound2'])

df.head()

In [None]:
def transform_target(target):
    """Translate a target label into its numeric class.

    The classes run from 0 ('lost_whole_flow') up to 4 ('connected_client').
    Any other value yields None.
    """
    # Position in this tuple is the numeric class of the label
    labels_by_class = ('lost_whole_flow', 'lost', 'converting', 'connected', 'connected_client')

    for numeric_class, label in enumerate(labels_by_class):
        if target == label:
            return numeric_class

    return None
    
df['target'] = df.apply(lambda x: transform_target(x['target']), axis=1)

### Salva o status atual do dataframe

In [None]:
# Save the dataframe
import pickle

# The with-block closes the file even if pickling fails
with open('df_tasks_priorization_classfication_model', 'wb') as file:
    pickle.dump(df, file)

In [None]:
# Load the dataframe
import pickle

# NOTE: unpickling executes code stored in the file - only load files written by this notebook.
# The with-block closes the file even if unpickling fails.
with open('df_tasks_priorization_classfication_model', 'rb') as file:
    df = pickle.load(file)

In [None]:
df

### Análise de correlação

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlation between all columns (target included), drawn as an annotated heatmap
plt.figure(figsize=(12,10))
cor = df.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

### Correlação maior que 20% com o target

In [None]:
# Absolute correlation of every column with the target
cor_target = cor["target"].abs()

# Keep the columns whose absolute correlation with the target is above 0.2
is_relevant = cor_target > 0.2
relevant_features = cor_target[is_relevant]
relevant_features

### Particiona em treino, teste e validação

In [None]:
import xgboost as xgb

from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# Separate the label column from the feature matrix
y = df['target']
X = df.drop(['target'], axis=1)

# 60/40 train/test split with a fixed seed; the extra validation split below is disabled
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=1)

In [None]:
X_train

In [None]:
# Baseline random forest, trained on the training split.
# No random_state is set, so results vary between runs.
rf = RandomForestClassifier(n_estimators=400, max_features=3, criterion='gini', n_jobs=8)
rf.fit(X_train, y_train)

# Hard class predictions -> accuracy on the test split
pred_rf = rf.predict(X_test)
print("Accuracy: " + str(accuracy_score(y_test, pred_rf)))

# Per-class probabilities -> log loss on the test split
prob_rf = rf.predict_proba(X_test)
print("RandomForest\nLogloss: " + str(log_loss(y_test, prob_rf)))

# 5-fold cross-validation over the full X, y (train and test rows together)
scores = cross_val_score(rf, X, y, cv=5)
print("Accuracy CrossValidation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Baseline XGBoost classifier. The model gets its own name: assigning it to
# `xgb` would overwrite the imported xgboost module and break any later
# xgb.XGBClassifier call (including a re-run of this cell).
# NOTE(review): `silent` was replaced by `verbosity` in newer xgboost releases - confirm against the installed version.
xgb_model = xgb.XGBClassifier(max_depth=15, n_estimators=250, learning_rate=0.05, subsample=0.7, colsample_bytree=0.5,silent=1, objective='multi:softprob')
xgb_model.fit(X_train, y_train)

# Hard class predictions -> accuracy on the test split
pred_xgb = xgb_model.predict(X_test)
print("Accuracy: " + str(accuracy_score(y_test, pred_xgb)))

# Per-class probabilities -> log loss on the test split
prob_xgb = xgb_model.predict_proba(X_test)
print("XGBoost\nLogloss: " + str(log_loss(y_test, prob_xgb)))

# 5-fold cross-validation over the full X, y (train and test rows together)
scores = cross_val_score(xgb_model, X, y, cv=5)
print("Accuracy CrossValidation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### Criando um ensemble com parâmetros variados para Random Forest e XGBoost

In [None]:
# random forest
from sklearn.ensemble import RandomForestClassifier

# (n_estimators, max_features) of rf1 .. rf6
rf_settings = [(400, 4), (400, 6), (500, 4), (500, 6), (600, 4), (600, 6)]

rf1, rf2, rf3, rf4, rf5, rf6 = [
    RandomForestClassifier(n_estimators=n_trees, max_features=n_features, criterion='gini', n_jobs=8)
    for n_trees, n_features in rf_settings
]

# Train every forest on the same training split
for forest in (rf1, rf2, rf3, rf4, rf5, rf6):
    forest.fit(X_train, y_train)

In [None]:
rf_models = (rf1, rf2, rf3, rf4, rf5, rf6)

# Hard class predictions of every forest on the test split
pred_rf1, pred_rf2, pred_rf3, pred_rf4, pred_rf5, pred_rf6 = [
    forest.predict(X_test) for forest in rf_models
]

print("rf1 accuracy: " + str(accuracy_score(y_test, pred_rf1)))
print("rf2 accuracy: " + str(accuracy_score(y_test, pred_rf2)))
print("rf3 accuracy: " + str(accuracy_score(y_test, pred_rf3)))
print("rf4 accuracy: " + str(accuracy_score(y_test, pred_rf4)))
print("rf5 accuracy: " + str(accuracy_score(y_test, pred_rf5)))
print("rf6 accuracy: " + str(accuracy_score(y_test, pred_rf6)))

print()

# Per-class probabilities of every forest on the test split
prob_rf1, prob_rf2, prob_rf3, prob_rf4, prob_rf5, prob_rf6 = [
    forest.predict_proba(X_test) for forest in rf_models
]

# Ensemble = plain average of the six probability matrices
prob_mean_rf = sum((prob_rf1, prob_rf2, prob_rf3, prob_rf4, prob_rf5, prob_rf6)) / 6

print("rf1 logloss: " + str(log_loss(y_test, prob_rf1)))
print("rf2 logloss: " + str(log_loss(y_test, prob_rf2)))
print("rf3 logloss: " + str(log_loss(y_test, prob_rf3)))
print("rf4 logloss: " + str(log_loss(y_test, prob_rf4)))
print("rf5 logloss: " + str(log_loss(y_test, prob_rf5)))
print("rf6 logloss: " + str(log_loss(y_test, prob_rf6)))
print("rf mix logloss: " + str(log_loss(y_test, prob_mean_rf)))

In [None]:
# xgboost
import xgboost as xgb

# (learning_rate, subsample, colsample_bytree) of xgb1 .. xgb6
xgb_settings = [
    (0.05, 0.6, 0.5),
    (0.05, 0.7, 0.6),
    (0.1, 0.6, 0.5),
    (0.1, 0.7, 0.6),
    (0.15, 0.6, 0.5),
    (0.15, 0.8, 0.7),
]

xgb1, xgb2, xgb3, xgb4, xgb5, xgb6 = [
    xgb.XGBClassifier(max_depth=6, n_estimators=250, learning_rate=rate, subsample=row_share, colsample_bytree=col_share, silent=1, objective='multi:softprob')
    for rate, row_share, col_share in xgb_settings
]

# Train every booster on the same training split
for booster in (xgb1, xgb2, xgb3, xgb4, xgb5, xgb6):
    booster.fit(X_train, y_train)

In [None]:
xgb_models = (xgb1, xgb2, xgb3, xgb4, xgb5, xgb6)

# Hard class predictions of every booster on the test split
pred_xgb1, pred_xgb2, pred_xgb3, pred_xgb4, pred_xgb5, pred_xgb6 = [
    booster.predict(X_test) for booster in xgb_models
]

print("xgb1 accuracy: " + str(accuracy_score(y_test, pred_xgb1)))
print("xgb2 accuracy: " + str(accuracy_score(y_test, pred_xgb2)))
print("xgb3 accuracy: " + str(accuracy_score(y_test, pred_xgb3)))
print("xgb4 accuracy: " + str(accuracy_score(y_test, pred_xgb4)))
print("xgb5 accuracy: " + str(accuracy_score(y_test, pred_xgb5)))
print("xgb6 accuracy: " + str(accuracy_score(y_test, pred_xgb6)))

print()

# Per-class probabilities of every booster on the test split
prob_xgb1, prob_xgb2, prob_xgb3, prob_xgb4, prob_xgb5, prob_xgb6 = [
    booster.predict_proba(X_test) for booster in xgb_models
]

# Ensemble = plain average of the six probability matrices
prob_mean_xgb = sum((prob_xgb1, prob_xgb2, prob_xgb3, prob_xgb4, prob_xgb5, prob_xgb6)) / 6

print("xgb1 logloss: " + str(log_loss(y_test, prob_xgb1)))
print("xgb2 logloss: " + str(log_loss(y_test, prob_xgb2)))
print("xgb3 logloss: " + str(log_loss(y_test, prob_xgb3)))
print("xgb4 logloss: " + str(log_loss(y_test, prob_xgb4)))
print("xgb5 logloss: " + str(log_loss(y_test, prob_xgb5)))
print("xgb6 logloss: " + str(log_loss(y_test, prob_xgb6)))
print("xgb mix logloss: " + str(log_loss(y_test, prob_mean_xgb)))

In [None]:
# Average of all twelve probability matrices (six forests followed by six boosters)
all_probs = (
    prob_rf1, prob_rf2, prob_rf3, prob_rf4, prob_rf5, prob_rf6,
    prob_xgb1, prob_xgb2, prob_xgb3, prob_xgb4, prob_xgb5, prob_xgb6,
)
prob_mean = sum(all_probs) / 12
print("mix logloss: " + str(log_loss(y_test, prob_mean)))

### Definindo um score para uma task   

Como definir a melhor estratégia?  
Por regra de negócio, ou existe uma forma precisa de fazer isso?
 
***

**Estratégias:**   
1. ordenando por classes e posteriormente pela probabilidade dentro da classe
2. utilizando o somatório do peso das duas classes mais positivas contra as duas mais negativas
3. verificar a diferença entre as classes, quando a diferença for baixa, ser otimista (considerar as classes positivas)

In [None]:
def teste_scores():
    # teste scores
    _4 = []
    _3 = []
    _2 = []
    _1 = []
    _0 = []

    for i in range(0, pred_rf.shape[0]):
        score = get_score(pred_rf[i])

        if (int(score)) == 0:
            _0.append(score)
        elif (int(score)) == 1:
            _1.append(score)
        elif (int(score)) == 2:
            _2.append(score)
        elif (int(score)) == 3:
            _3.append(score)
        elif (int(score)) == 4:
            _4.append(score)

    print('MIN')
    print('classe 4', ('0.' + str(min(_4)).split('.')[1]))
    print('classe 3', ('0.' + str(min(_3)).split('.')[1]))
    print('classe 2', ('0.' + str(min(_2)).split('.')[1]))
    print('classe 1', ('0.' + str(min(_1)).split('.')[1]))
    print('classe 0', ('0.' + str(min(_0)).split('.')[1]))

    print('QTD')
    print('classe 4', len(_4))
    print('classe 3', len(_3))
    print('classe 2', len(_2))
    print('classe 1', len(_1))
    print('classe 0', len(_0))
    
    return ([len(_4),len(_3),len(_2),len(_1),len(_0)])

### Estratégia 1

In [None]:
# 1. ordenando por classes e posteriormente pela probabilidade dentro da classe
import numpy as np

def get_score(pred):
    """Score = predicted class + its probability (minus a small epsilon).

    The integer part of the result is the index of the most probable class
    and the fractional part is that probability; subtracting 0.0001 keeps a
    probability of 1.0 from spilling into the next class.
    """
    best = max(pred)
    # First position holding the highest probability
    position = np.flatnonzero(pred == best)[0]

    return position + best - 0.0001

strategy_1 = teste_scores()

### Estratégia 2

In [None]:
# 2. utilizando o somatório do peso das duas classes mais positivas contra as duas mais negativas
import numpy as np

def get_score(pred):
    """Score a prediction by comparing the positive, neutral and negative groups.

    Class indexes: 4 connected_client, 3 connected, 2 converting, 1 lost,
    0 lost_whole_flow. The positive group is the mean of classes 4 and 3, the
    negative group the mean of classes 1 and 0, and class 2 is neutral. The
    winning group decides the class (the likelier member inside a pair), and
    the score is that class plus its own probability minus 0.0001.
    """
    positive = (pred[4] + pred[3])/2
    neutral = pred[2]
    negative = (pred[1] + pred[0])/2

    if positive > neutral and positive > negative:
        # positive case: pick between classes 4 and 3
        winner = 4 if pred[4] > pred[3] else 3
    elif neutral > negative:
        # neutral case: class 2
        winner = 2
    else:
        # negative case: pick between classes 1 and 0
        winner = 1 if pred[1] > pred[0] else 0

    return winner + (pred[winner] - 0.0001)

strategy_2 = teste_scores()

In [None]:
0 30 30
1 5 0
2
3 27 29
4 10 29

### Estratégia 3

In [None]:
# 3. verificar a diferença entre as classes, quando a diferença for baixa, ser otimista (considerar as classes positivas)
import numpy as np

def get_score(pred, threshold=0.05):
    """Optimistic score: prefer a positive class when it is close to the top one.

    Class indexes: 4 connected_client, 3 connected, 2 converting, 1 lost,
    0 lost_whole_flow.

    Args:
        pred: 1-D numpy array with the probability of each of the five classes.
        threshold: Largest gap to the top probability at which class 4 (then
            class 3) is chosen instead of the most probable class. The default
            is the 0.05 this strategy always used; a value passed by the
            caller is now honoured instead of being overwritten.

    Returns:
        The chosen class plus its probability minus 0.0001.
    """
    value = max(pred)
    index = np.where(pred == value)[0][0]

    # Be optimistic: class 4 first, then class 3, whenever they trail the
    # most probable class by less than the threshold
    if index != 4:
        if (value - pred[4]) < threshold:
            return 4 + (pred[4] - 0.0001)

    if index != 3:
        if (value - pred[3]) < threshold:
            return 3 + (pred[3] - 0.0001)

    return index + value - 0.0001
strategy_3 = teste_scores()

In [None]:
import matplotlib.pyplot as plt

barWidth = 0.25

plt.figure(figsize=(10,5))

# x positions of the three bar groups, each shifted by one bar width
r1 = np.arange(len(strategy_1))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

# One bar series per scoring strategy
bar_series = (
    (r1, strategy_1, 'Classificação Normal'),
    (r2, strategy_2, 'Classes Pos X Neg'),
    (r3, strategy_3, 'Otimista'),
)
for positions, counts, series_label in bar_series:
    plt.bar(positions, counts, width=barWidth, label=series_label)

# Centre each tick under its group of three bars
tick_positions = [r + barWidth for r in range(len(strategy_1))]
tick_labels = ['connected_client', 'connected', 'converting', 'lost', 'lost_whole_flow']

plt.xlabel('Classes')
plt.xticks(tick_positions, tick_labels)
plt.ylabel('Número de tasks')
plt.title('Comparação Estratégias')

plt.legend()
plt.show()

### procurando pelo melhor threshold para a estratégia 3

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_score(pred, threshold=0.5):
    """Optimistic score with a configurable threshold.

    Class indexes: 4 connected_client, 3 connected, 2 converting, 1 lost,
    0 lost_whole_flow. Class 4, then class 3, replaces the most probable
    class when its probability trails the top one by less than ``threshold``.
    The result is the chosen class plus its probability minus 0.0001.
    """
    top_prob = max(pred)
    top_class = np.flatnonzero(pred == top_prob)[0]

    # Positive classes are tried from best to second best
    for favored in (4, 3):
        if top_class != favored and (top_prob - pred[favored]) < threshold:
            return favored + (pred[favored] - 0.0001)

    return top_class + top_prob - 0.0001

def get_thresholds(pred):
    """Plot, for each candidate threshold, how many tasks fall into each class 0-4.

    ``pred`` is a 2-D array with one row of class probabilities per task; each
    threshold gets its own line in the chart.
    """
    thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.4, 0.5, 0.7, 0.9, 1.0]

    for threshold in thresholds:
        # Class assigned to every task under this threshold
        labels = [int(get_score(pred[i], threshold)) for i in range(0, pred.shape[0])]
        per_class = [labels.count(cls) for cls in range(5)]

        plt.plot(per_class, label=threshold)

    plt.title('Comparação Thresholds ')
    plt.legend()
    plt.xlabel('Classes')
    plt.ylabel('Número de tasks')
    plt.show()

get_thresholds(prob_mean_rf)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_score(pred, threshold=0.5):
    """Optimistic score with a configurable threshold.

    Class indexes: 4 connected_client, 3 connected, 2 converting, 1 lost,
    0 lost_whole_flow. Class 4, then class 3, replaces the most probable
    class when its probability trails the top one by less than ``threshold``.
    The result is the chosen class plus its probability minus 0.0001.
    """
    top_prob = max(pred)
    top_class = np.flatnonzero(pred == top_prob)[0]

    # Positive classes are tried from best to second best
    for favored in (4, 3):
        if top_class != favored and (top_prob - pred[favored]) < threshold:
            return favored + (pred[favored] - 0.0001)

    return top_class + top_prob - 0.0001

def get_thresholds(pred):
    """Plot, for each candidate threshold, how many tasks fall into classes 3 and 4.

    ``pred`` is a 2-D array with one row of class probabilities per task; each
    threshold gets its own line in the chart.
    """
    thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0]

    for threshold in thresholds:
        # Class assigned to every task under this threshold
        labels = [int(get_score(pred[i], threshold)) for i in range(0, pred.shape[0])]

        plt.plot([labels.count(3), labels.count(4)], label=threshold)

    plt.title('Comparação Thresholds ')
    plt.legend()
    plt.xlabel('Classes 3 e 4')
    plt.ylabel('Número de tasks')
    plt.show()

get_thresholds(prob_mean_rf)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def get_score(pred, threshold=0.5):
    """Optimistic score with a configurable threshold.

    Class indexes: 4 connected_client, 3 connected, 2 converting, 1 lost,
    0 lost_whole_flow. Class 4, then class 3, replaces the most probable
    class when its probability trails the top one by less than ``threshold``.
    The result is the chosen class plus its probability minus 0.0001.
    """
    top_prob = max(pred)
    top_class = np.flatnonzero(pred == top_prob)[0]

    # Positive classes are tried from best to second best
    for favored in (4, 3):
        if top_class != favored and (top_prob - pred[favored]) < threshold:
            return favored + (pred[favored] - 0.0001)

    return top_class + top_prob - 0.0001

def get_thresholds(pred):
    """Plot how many tasks are scored into class 4 for each candidate threshold.

    Args:
        pred: 2-D array-like with one row of class probabilities per task.
    """
    thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.0]

    # Number of tasks landing in class 4 under each threshold
    result = [
        sum(1 for row in pred if int(get_score(row, threshold)) == 4)
        for threshold in thresholds
    ]

    plt.title('Comparação Thresholds ')
    # Thresholds go on the x axis (they used to be passed as the axis label);
    # there is a single unlabelled line, so no legend is drawn.
    plt.plot(thresholds, result)
    plt.xlabel('Threshold')
    plt.ylabel('Número de tasks')
    plt.show()

get_thresholds(prob_mean_rf)

### Dicas de efeitos para outputs no notebook
https://stackoverflow.com/questions/23271575/printing-bold-colored-etc-text-in-ipython-qtconsole

**Features possíveis**   
taxa conversão  
taxa de abertura (só para email)  
taxa de resposta (só para email)  
tipos de tarefa do fluxo (tipos de tarefa)  
informações gerais sobre o fluxo  
contatos com interação prévia   
origem do contato   
quantas tarefas foram executadas anteriormente para esse contato   
características do template associado à tarefa   
empresa do contato   
quantidade de contatos da mesma empresa na base   