In [1]:
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pnd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [None]:

# Monthly reading files, one CSV per month.
listas = ['month_2.csv', 'month_3.csv', 'month_4.csv', 'month_5.csv', 'month_6.csv']
# Read every month and stack them into a single dataframe with one concat call.
leituras = pnd.concat([pnd.read_csv(arquivo) for arquivo in listas])

# Registry data; keep only the client codes whose status is 'CONSUMINDO GÁS'.
dadosCadastrais = pnd.read_csv('informacao_cadastral.csv')
usuariosUnicos = dadosCadastrais[dadosCadastrais.situacao == 'CONSUMINDO GÁS']['clientCode'].unique()

# Keep the readings of those clients, ordered by datetime so the per-meter
# diff below follows reading order.
# NOTE(review): 'datetime' is sorted as read from the CSV (not parsed here);
# confirm its format sorts chronologically.
mesFiltrado = leituras[leituras['clientCode'].isin(usuariosUnicos)].sort_values(by='datetime')

# Drop rows whose meterSN equals '>N<A'. The explicit .copy() makes `df` an
# independent frame, so the column assignments below modify it for sure
# instead of acting on a temporary slice (chained in-place fillna is a no-op
# under pandas Copy-on-Write).
df = mesFiltrado[mesFiltrado['meterSN'] != '>N<A'].copy()
# A missing gain is treated as a multiplier of 1.
df['gain'] = df['gain'].fillna(1)
df['pulseCount'] = df['pulseCount'] * df['gain']

# Difference between consecutive readings of the same client/meter pair;
# the first reading of each pair has no predecessor, so its diff is set to 0.
df['diffPulseCount'] = df.groupby(['clientCode', 'meterSN'])['pulseCount'].diff().fillna(0)
df = df.reset_index(drop=True)
df = df[['clientCode', 'meterSN', "pulseCount", 'diffPulseCount', 'datetime']].copy()

# Per client/meter mean and standard deviation of the diffs, broadcast back
# to every row. The std is NaN for pairs that have a single row.
diffsPorMedidor = df.groupby(['clientCode', 'meterSN'])['diffPulseCount']
mediaPorMedidor = diffsPorMedidor.transform('mean')
desvioPorMedidor = diffsPorMedidor.transform('std')
df['mediaCliente'] = mediaPorMedidor
df['desvioPadraoCliente'] = desvioPorMedidor
df

In [None]:
# Number of standard deviations above the per-meter mean diff beyond which a
# reading is labelled as tipo 3.
LIMITE_DESVIOS = 3

# Label every row with a 'tipo' class. Assignments are applied in order, so a
# later rule overrides an earlier one when both match the same row:
#   0 - default (no rule matched)
#   3 - diff above mean + LIMITE_DESVIOS * std of its client/meter pair
#   1 - negative diff
#   2 - negative diff and pulseCount equal to 0
# Rows whose std is NaN never match the tipo 3 rule (comparison with NaN is False).
df['tipo'] = 0
limiteSuperior = df['mediaCliente'] + LIMITE_DESVIOS * df['desvioPadraoCliente']
df.loc[df['diffPulseCount'] > limiteSuperior, 'tipo'] = 3
df.loc[df['diffPulseCount'] < 0, 'tipo'] = 1
df.loc[(df['pulseCount'] == 0) & (df['diffPulseCount'] < 0), 'tipo'] = 2
# Class distribution of the labels.
df.tipo.value_counts()

In [None]:
# Baseline: decision tree with default hyperparameters on the two pulse features.
features = ['pulseCount', 'diffPulseCount']
x = df[features]
y = df['tipo']
# 80/20 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Fixed random_state so the fitted tree (and the metrics below) are the same
# on every Restart & Run All.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
# Class probabilities are required for the one-vs-rest ROC AUC below.
y_prob = classifier.predict_proba(x_test)
print("**************************************************************************************")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, y_prob, multi_class='ovr'))
print("**************************************************************************************")

# Agora estou testando parâmetros diferentes, visando melhorar a acurácia

In [None]:
# Show the decision tree's default hyperparameter values. Printed explicitly:
# a bare expression in the middle of a cell produces no output.
print(DecisionTreeClassifier().get_params())
# Build the parameter grid. References:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
grid_parametros = {
    'max_depth': [3, 5, 7, 9, 11, 13, None],
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
}
# The seed is fixed on the estimator instead of being searched over: it is not
# a hyperparameter, and an unseeded candidate makes the result non-reproducible.
# A fresh estimator is used so the search does not depend on whatever
# `classifier` currently holds in the kernel.
estimador_base = DecisionTreeClassifier(random_state=42)
# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
busca_grid = GridSearchCV(estimador_base, grid_parametros, cv=5, scoring='accuracy', n_jobs=-1)
busca_grid.fit(x_train, y_train)
melhor_classificador = busca_grid.best_estimator_
print('************************************************************************************************')
print(melhor_classificador)

In [None]:
# Decision tree with hand-picked hyperparameters, evaluated on the same
# 80/20 split (same seed) as the baseline.
features = ['pulseCount', 'diffPulseCount']
x = df[features]
y = df['tipo']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Fixed random_state so repeated runs give the same tree and the same report.
classifier = DecisionTreeClassifier(criterion='entropy', max_depth=9, min_samples_split=10, random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print("**************************************************************************************")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("**************************************************************************************")

In [None]:
# Baseline random forest with default hyperparameters, trained on all cores.
# Uses the train/test split already present in the notebook state.
# The forest is stochastic, so the seed is fixed to keep the reported
# metrics reproducible between runs.
classifier = RandomForestClassifier(n_jobs=-1, random_state=42)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Fixed (non-searched) settings live on the estimator rather than in the grid:
# a single-valued grid entry is not something being tuned.
# A fresh estimator is used so the search does not depend on whatever
# `classifier` currently holds in the kernel.
estimador_base = RandomForestClassifier(random_state=42, n_jobs=-1)
# Show the estimator's parameter values. Printed explicitly: a bare expression
# in the middle of a cell produces no output.
print(estimador_base.get_params())
grid_parametros = {
    'max_depth': [5, 11, 13, None],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
}
# 5-fold cross-validated search over the grid, scored by accuracy.
busca_grid = GridSearchCV(estimador_base, grid_parametros, cv=5, scoring='accuracy', n_jobs=-1)
busca_grid.fit(x_train, y_train)
melhor_classificador = busca_grid.best_estimator_
print('************************************************************************************************')
print(melhor_classificador)

In [None]:
# Random forest with hand-picked hyperparameters, evaluated on an 80/20
# hold-out split of the two pulse features.
features = ['pulseCount', 'diffPulseCount']
y = df['tipo']
x = df[features]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Hyperparameters gathered in one place; the seed keeps the forest reproducible.
parametros_floresta = dict(
    n_estimators=100,
    criterion='entropy',
    max_depth=11,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
)
# fit() returns the estimator itself, so construction and training are chained.
classifier = RandomForestClassifier(**parametros_floresta).fit(x_train, y_train)
y_pred = classifier.predict(x_test)

# Report: per-class metrics followed by the confusion matrix, framed by separators.
separador = '***************************************************************************************'
relatorio = classification_report(y_test, y_pred)
matriz_confusao = confusion_matrix(y_test, y_pred)
for trecho in (separador, relatorio, matriz_confusao, separador):
    print(trecho)