In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [57]:
data = pd.read_csv("../datasets/audit_data/audit_risk.csv")
data.head()

Unnamed: 0,Sector_score,LOCATION_ID,PARA_A,Score_A,Risk_A,PARA_B,Score_B,Risk_B,TOTAL,numbers,...,RiSk_E,History,Prob,Risk_F,Score,Inherent_Risk,CONTROL_RISK,Detection_Risk,Audit_Risk,Risk
0,3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,...,0.4,0,0.2,0.0,2.4,8.574,0.4,0.5,1.7148,1
1,3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,...,0.4,0,0.2,0.0,2.0,2.554,0.4,0.5,0.5108,0
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,...,0.4,0,0.2,0.0,2.0,1.548,0.4,0.5,0.3096,0
3,3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,...,0.4,0,0.2,0.0,4.4,17.53,0.4,0.5,3.506,1
4,3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,...,0.4,0,0.2,0.0,2.0,1.416,0.4,0.5,0.2832,0


In [58]:
data.shape

(776, 27)

In [59]:
data.columns.values

array(['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'Risk_A',
       'PARA_B', 'Score_B', 'Risk_B', 'TOTAL', 'numbers', 'Score_B.1',
       'Risk_C', 'Money_Value', 'Score_MV', 'Risk_D', 'District_Loss',
       'PROB', 'RiSk_E', 'History', 'Prob', 'Risk_F', 'Score',
       'Inherent_Risk', 'CONTROL_RISK', 'Detection_Risk', 'Audit_Risk',
       'Risk'], dtype=object)

### *hay 3 cadenas enves de valores numericos y un valor con NaN, hacemos limpieza de datos, podemos colocar 0 o el promedio de la columna

In [60]:
data["Risk"].value_counts()#hay 471 empresas que no tienen rasgos fraudulentos y 305 que si los tienen

0    471
1    305
Name: Risk, dtype: int64

# Dividimos las variables predictoras y la prediccion. (Pre Procesamiento)

In [61]:
Y = ['Risk']
X = ['Sector_score', 'LOCATION_ID', 'PARA_A', 'Score_A', 'Risk_A',
       'PARA_B', 'Score_B', 'Risk_B', 'TOTAL', 'numbers', 'Score_B.1',
       'Risk_C', 'Money_Value', 'Score_MV', 'Risk_D', 'District_Loss',
       'PROB', 'RiSk_E', 'History', 'Prob', 'Risk_F', 'Score',
       'Inherent_Risk', 'CONTROL_RISK', 'Detection_Risk', 'Audit_Risk']


In [62]:
lr = LogisticRegression(solver='lbfgs',class_weight='balanced', max_iter=10000)

## Escojemos las columnas mas importantes para la prediccion, yo escojo solo 8 enves de las 27 columnas en total

In [63]:
rfe = RFE(lr, n_features_to_select=8)
rfe = rfe.fit(data[X],data[Y].values.ravel())

In [64]:
print(rfe.support_)

[False False False False False False False False  True False False False
  True False  True  True False  True False False False False  True  True
 False  True]


In [65]:
print(rfe.ranking_)#podemos ver que variables son importantes(1) para tomar en cuenta en el modelo

[11 10  3  6 12  4  7  5  1 14 15 13  1  9  1  1  8  1 16 19 18  2  1  1
 17  1]


In [66]:
z=zip(X,rfe.support_, rfe.ranking_)
list(z)

[('Sector_score', False, 11),
 ('LOCATION_ID', False, 10),
 ('PARA_A', False, 3),
 ('Score_A', False, 6),
 ('Risk_A', False, 12),
 ('PARA_B', False, 4),
 ('Score_B', False, 7),
 ('Risk_B', False, 5),
 ('TOTAL', True, 1),
 ('numbers', False, 14),
 ('Score_B.1', False, 15),
 ('Risk_C', False, 13),
 ('Money_Value', True, 1),
 ('Score_MV', False, 9),
 ('Risk_D', True, 1),
 ('District_Loss', True, 1),
 ('PROB', False, 8),
 ('RiSk_E', True, 1),
 ('History', False, 16),
 ('Prob', False, 19),
 ('Risk_F', False, 18),
 ('Score', False, 2),
 ('Inherent_Risk', True, 1),
 ('CONTROL_RISK', True, 1),
 ('Detection_Risk', False, 17),
 ('Audit_Risk', True, 1)]

In [50]:
cols=['TOTAL','Money_Value','Risk_D','District_Loss','RiSk_E','Inherent_Risk','CONTROL_RISK','Audit_Risk']#escojemos las variables nesesarias para tomar la mejor PREDICCION

In [69]:
X = data[cols]
Y =data["Risk"]

### Implementación del modelo en Python con scikit-learn

In [70]:
logit_model = linear_model.LogisticRegression()
logit_model.fit(X,Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
logit_model.score(X,Y)#la r**2 es muy bueno se acerca a 1

0.9948453608247423

In [74]:
pd.DataFrame(list(zip(X.columns, np.transpose(logit_model.coef_))))
#nos conviene tomar en cuenta la columna Risk_D y Audit_Risk para saber si es una empresa fraudulenta
#la columna Inherent_Risk no aporta nada al modelo, de lo contrario si tomamos en cuenta Inherent_Risk nos hara variar mas en el momento de tomar la desicion de que si la empresa es o no fraudulenta.

Unnamed: 0,0,1
0,TOTAL,[0.6568779666843574]
1,Money_Value,[0.03381724179395011]
2,Risk_D,[0.9725578269667436]
3,District_Loss,[0.6203595711227479]
4,RiSk_E,[0.14225976365385265]
5,Inherent_Risk,[-0.6049914607276792]
6,CONTROL_RISK,[0.11314381099715838]
7,Audit_Risk,[3.52850580313835]


## Validación del modelo logístico

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state=0)#tomamos el 20% para test y el 80% para ENTRENAR

In [81]:
lm = linear_model.LogisticRegression()
lm.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [83]:
probs = lm.predict_proba(X_test)
probs

array([[1.49947610e-10, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [9.89874673e-01, 1.01253274e-02],
       [9.84852109e-01, 1.51478907e-02],
       [1.58482453e-03, 9.98415175e-01],
       [9.50604032e-01, 4.93959680e-02],
       [9.82647557e-01, 1.73524429e-02],
       [9.05226101e-01, 9.47738989e-02],
       [9.78238992e-01, 2.17610082e-02],
       [9.90011019e-01, 9.98898084e-03],
       [0.00000000e+00, 1.00000000e+00],
       [0.00000000e+00, 1.00000000e+00],
       [9.86365089e-01, 1.36349107e-02],
       [9.90284015e-01, 9.71598501e-03],
       [0.00000000e+00, 1.00000000e+00],
       [9.85334659e-01, 1.46653408e-02],
       [9.79898098e-01, 2.01019025e-02],
       [1.11093779e-04, 9.99888906e-01],
       [3.00632474e-04, 9.99699368e-01],
       [1.37678793e-01, 8.62321207e-01],
       [9.91541998e-01, 8.45800151e-03],
       [2.94143513e-01, 7.05856487e-01],
       [9.90934603e-01, 9.06539745e-03],
       [9.90216467e-01, 9.78353341e-03],
       [1.906808

In [84]:
prediction = lm.predict(X_test)

In [93]:
prediction# Por DEFECTO si el valor es menor a 0.5 se coloca 0 y si el mayor a 0.5 se coloca 1

array([1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0], dtype=int64)

In [100]:
prob = probs[:,1]
prob_df = pd.DataFrame(prob)
threshold = 0.1#colocamos el rango a 0.1 enves de 0.5 como arriba, pero vemos que el resultado no cambia, no importa si es 0.1 o 0.5
prob_df["prediction"] = np.where(prob_df[0]>threshold, 1, 0)
prob_df.head()

Unnamed: 0,0,prediction
0,1.0,1
1,1.0,1
2,0.010125,0
3,0.015148,0
4,0.998415,1


In [101]:
pd.crosstab(prob_df.prediction, columns="count")

col_0,count
prediction,Unnamed: 1_level_1
0,93
1,63


In [102]:

print(confusion_matrix(Y_test,prediction))

[[100   1]
 [  1  54]]


In [103]:
#solo 1 de cada prediccion esta erroneo, el modelo esta correctamente validado