In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sn
import pickle as pkl
import io

# Read recipe inputs: open the Dataiku dataset named "preproces_data" and
# fetch its contents as a DataFrame for the cells below.
preproces_data = dataiku.Dataset("preproces_data")
preproces_data_df = preproces_data.get_dataframe()

In [0]:
# Class distribution of the 'target' column.
preproces_data_df['target'].value_counts()

In [0]:
# Same count, taken positionally on the last column, which the split cell
# below uses as the label. Presumably this is the 'target' column — compare
# with the previous cell's output to confirm.
preproces_data_df.iloc[:,-1].value_counts()

In [0]:
# Everything except the last column is treated as model input; categorical
# columns among them are expanded into indicator columns.
feature_columns = preproces_data_df.iloc[:, :-1]
df_dummie = pd.get_dummies(feature_columns)

In [0]:
# Features: every dummy-encoded column. `df_dummie` is built from the frame
# with the label column already removed, so slicing off another trailing
# column here (as the previous `df_dummie.iloc[:,0:-1]` did) silently
# discarded a real feature.
# NOTE(review): this changes the model's input width by one column — confirm
# any downstream scoring code builds its feature matrix the same way.
X = df_dummie
# Label: the last column of the raw input frame.
y = preproces_data_df.iloc[:,-1]
# Global NumPy seed, kept for any later code that draws from np.random.
np.random.seed(123)
# 70/30 train/test split; random_state pins the shuffle so reruns match.
xTrain, xTest, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [0]:
# Class distribution of the training labels after the split.
y_train.value_counts()

In [0]:
# Hyper-parameters for the classifier: regularisation setting C=0.9 and
# 'balanced' class weighting.
logreg_params = {"C": 0.9, "class_weight": "balanced"}

# fit() returns the estimator itself, so construction and training can be chained.
logreg = LogisticRegression(**logreg_params).fit(xTrain, y_train)
print(logreg)

In [0]:
# Hard class predictions for the held-out rows.
y_pred = logreg.predict(xTest)
# Disabled alternative kept from the original: threshold the class-1 probability manually.
# y_pred = np.where(logreg.predict_proba(X.iloc[:,1:])[:, 1] > 0.50362, 1, 0)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [0]:
# Confusion matrix of test labels vs. predictions, printed as a plain table.
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
cf_table = pd.DataFrame(cf)
print(cf_table)

In [0]:
# Text summary of the per-class metrics on the test set.
test_report_text = classification_report(y_test, y_pred)
print(test_report_text)

In [0]:
# Scores from the second probability column, computed once and reused for
# both the AUC and the ROC curve (previously predict_proba ran twice on the
# same data).
test_scores = logreg.predict_proba(xTest)[:, 1]
logit_roc_auc = roc_auc_score(y_test, test_scores)
fpr, tpr, thresholds = roc_curve(y_test, test_scores)

# ROC curve with the chance diagonal for reference.
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Saved relative to the current working directory.
plt.savefig('Log_ROC')
plt.show()

In [0]:
# Draw the confusion matrix on its own figure. Relying on the implicit
# "current axes" lets the heatmap land on the ROC figure whenever that figure
# is still open (for example with a non-interactive backend, where plt.show()
# returns without closing anything).
cm_fig, cm_ax = plt.subplots()
sn.heatmap(cf, annot=True,  fmt='.2f', xticklabels = ["0", "1"] , yticklabels = ["0", "1"],  cmap = "GnBu_r", ax=cm_ax)
cm_ax.set_ylabel('True Detractor',fontsize=12)
cm_ax.set_xlabel('Predicted Detractor',fontsize=12)

In [0]:
# Same report as printed earlier, this time as a nested dict so it can be tabulated.
clf_rp = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)

In [0]:
# The dict keys become columns first; flipping the frame puts one report entry per row.
clf_rp_by_column = pd.DataFrame(clf_rp)
clf_rp_df = clf_rp_by_column.T

In [0]:
# Display the metrics table built from the classification report.
clf_rp_df

In [0]:
# Write recipe outputs: get a handle on the Dataiku folder with id "dDyDX8Zk"
# and fetch its metadata (used below to look up the folder's root).
model_results = dataiku.Folder("dDyDX8Zk")
model_results_info = model_results.get_info()

In [0]:
# Root entry of the output folder's access metadata. Nothing is written in
# this cell; it only resolves where the folder lives.
access_info = model_results_info['accessInfo']
path = access_info['root']

In [0]:
# Show the resolved folder root.
path

In [0]:
#save plot
# Render the confusion matrix on a dedicated figure and save *that* figure.
# A bare plt.savefig() writes whatever figure happens to be current: in a
# notebook the heatmap figure from the earlier cell has already been closed
# (so a blank image was uploaded), and in a plain script the current figure
# may still be the ROC plot.
plot_fig, plot_ax = plt.subplots()
sn.heatmap(cf, annot=True, fmt='.2f', xticklabels=["0", "1"], yticklabels=["0", "1"], cmap="GnBu_r", ax=plot_ax)
plot_ax.set_ylabel('True Detractor', fontsize=12)
plot_ax.set_xlabel('Predicted Detractor', fontsize=12)

# Serialize to an in-memory PNG and push the bytes to the output folder.
bs = io.BytesIO()
plot_fig.savefig(bs, format="png")
plt.close(plot_fig)  # already serialized; avoids a duplicate inline render
model_results.upload_stream("confusion_matrix_plot.png", bs.getvalue())

In [0]:
# Persist the metrics table to the output folder. The row index holds the
# classification-report keys (one per row after the transpose), so it must be
# written out: with index=False the CSV contained unlabeled rows of numbers.
# The labels go into a leading "label" column.
with model_results.get_writer("Model_metrics.csv") as writer:
    writer.write(clf_rp_df.to_csv(index=True, index_label="label").encode("utf-8"))

In [0]:
# Serialize the fitted model into the output folder, next to the metrics CSV
# and the plot. A bare open("model.pkl") writes to the process working
# directory instead, so the model never reached the folder that holds the
# other recipe outputs.
with model_results.get_writer("model.pkl") as m:
    m.write(pkl.dumps(logreg))