In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

import scipy
from scipy.stats import iqr
from scipy.stats import chi2_contingency
from scipy.stats import chi2
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from underthesea import word_tokenize, pos_tag, sent_tokenize
import regex
import demoji
from pyvi import ViPosTagger, ViTokenizer
import string

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# for report:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# vẽ đường cong ROC
from sklearn.metrics import roc_auc_score, roc_curve

import warnings
warnings.filterwarnings("ignore")

In [2]:
# data = pd.read_excel('Data/Online Retail.xlsx', sheet_name = 'Online Retail', engine='openpyxl')
# df = pd.read_csv('Data/poverty.txt', sep = '\t')

In [3]:
## Hàm kiểm tra và tính số lượng, tỷ trọng outliers
def check_outlier(col):
    """Print the quartiles, the IQR and the Tukey-fence outlier counts of a column.

    A value counts as an outlier only when it lies strictly above
    ``Q3 + 1.5*IQR`` or strictly below ``Q1 - 1.5*IQR``.  The comparison is
    strict on purpose: when the IQR is 0 (for example a constant column) both
    fences collapse onto Q1 == Q3, and an inclusive test would report every
    value sitting on the fence as both an upper and a lower outlier.

    Parameters
    ----------
    col : array-like (typically a pandas Series)
        Numeric values.  Must support element-wise comparison and ``.shape``.

    Returns
    -------
    None
        Everything is reported through ``print``.
    """
    Q1 = np.percentile(col, 25)
    print('Q1:       ', Q1)
    Q3 = np.percentile(col, 75)
    print('Q3:       ', Q3)
    IQR = scipy.stats.iqr(col)
    print('IQR:      ', IQR)

    # Tukey fences: 1.5 * IQR beyond the quartiles.
    upper_fence = Q3 + 1.5*IQR
    lower_fence = Q1 - 1.5*IQR
    highOutliers = (col > upper_fence).sum()
    lowOutliers  = (col < lower_fence).sum()

    print('# Number of upper outliers: ', highOutliers)
    print('# Number of lower outliers: ', lowOutliers)
    print('# Percentage of ouliers:    ', (highOutliers + lowOutliers)/col.shape[0])

In [4]:
## Hàm remove outliers
def remove_outlier(variable, data_param):
    """Drop, in place, the rows whose ``variable`` value is a Tukey-fence outlier.

    Rows are removed when the value is strictly above ``Q3 + 1.5*IQR`` or
    strictly below ``Q1 - 1.5*IQR``.  The comparison is strict so that a
    column with IQR == 0 does not lose every row sitting on Q1 == Q3.

    Parameters
    ----------
    variable : str
        Name of the numeric column used for outlier detection.
    data_param : pandas.DataFrame
        Frame to clean.  It is MODIFIED IN PLACE and its index is replaced by
        a fresh 0..n-1 range.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after the rows were dropped.
    """
    # Start from a clean 0..n-1 index so the rows selected below are
    # identified unambiguously, whatever index the caller's frame carried
    # (filtered frames, date indexes, duplicated labels, ...).  The index is
    # reset on exit anyway, so this does not change the result's shape.
    data_param.reset_index(drop=True, inplace=True)

    values = data_param[variable]
    Q1 = np.percentile(values, 25)
    Q3 = np.percentile(values, 75)
    IQR = scipy.stats.iqr(values)

    # Boolean mask of rows lying strictly outside the fences.
    is_outlier = np.asarray((values > Q3 + 1.5*IQR) | (values < Q1 - 1.5*IQR))

    # Removing the outliers (labels taken from the index, not raw positions).
    data_param.drop(index=data_param.index[is_outlier], inplace=True)
    data_param.reset_index(drop=True, inplace=True)
    return data_param

In [None]:
def process_text(text, emoji_dict, teen_dict, wrong_lst):
    """Lower-case a raw text and clean it sentence by sentence.

    Steps applied, in order:
      1. lower-case, remove the right single quote character, collapse runs
         of dots into one dot;
      2. split into sentences with ``sent_tokenize``;
      3. per sentence: replace every single character found in ``emoji_dict``
         by its mapped text plus a space, replace whitespace-separated tokens
         found in ``teen_dict``, keep only runs of letters matched by the
         word pattern, blank out tokens listed in ``wrong_lst``;
      4. re-join the sentences with '. ' and collapse whitespace.

    Parameters
    ----------
    text : str
    emoji_dict : mapping of single character -> replacement text
    teen_dict : mapping of token -> replacement text
    wrong_lst : container of tokens to discard

    Returns
    -------
    str
        The cleaned text.
    """
    # Runs of Latin/Vietnamese letters only: digits and punctuation are dropped.
    word_pattern = r'(?i)\b[a-záàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ]+\b'

    cleaned = text.lower().replace("’", '')
    cleaned = regex.sub(r'\.+', ".", cleaned)

    # NOTE: further regex clean-up steps (punctuation, digits, leading/trailing
    # spaces, line breaks, HTTP links) were commented out in the original
    # notebook and remain disabled here.

    pieces = []
    for raw_sentence in sent_tokenize(cleaned):
        # Emoji conversion works character by character.
        converted_chars = []
        for ch in list(raw_sentence):
            converted_chars.append(emoji_dict[ch] + ' ' if ch in emoji_dict else ch)
        current = ''.join(converted_chars)

        # Teencode conversion works on whitespace-separated tokens.
        tokens = [teen_dict[tok] if tok in teen_dict else tok for tok in current.split()]
        current = ' '.join(tokens)

        # Drop punctuation and numbers by keeping letter runs only.
        current = ' '.join(regex.findall(word_pattern, current))

        # Blank out known-wrong words (extra spaces are collapsed at the end).
        current = ' '.join('' if tok in wrong_lst else tok for tok in current.split())

        pieces.append(current + '. ')

    # Collapse excess blank space.
    return regex.sub(r'\s+', ' ', ''.join(pieces)).strip()

In [None]:
# Chuẩn hóa unicode tiếng việt
def loaddicchar():
    """Build the lookup table used to normalise Vietnamese accented letters.

    Returns
    -------
    dict
        Maps each entry of the ``char1252`` list to the entry at the same
        position in the ``charutf8`` list (lower-case then upper-case vowels
        carrying a tone mark).

    NOTE(review): the two literals render identically.  For the table to have
    any effect the ``char1252`` entries presumably have to be the decomposed
    (base letter + combining mark) spellings and the ``charutf8`` entries the
    precomposed ones -- confirm the byte-level encoding of this file survived
    copy/paste, otherwise every key maps to itself.
    """
    char1252 = 'à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ'.split(
        '|')
    charutf8 = "à|á|ả|ã|ạ|ầ|ấ|ẩ|ẫ|ậ|ằ|ắ|ẳ|ẵ|ặ|è|é|ẻ|ẽ|ẹ|ề|ế|ể|ễ|ệ|ì|í|ỉ|ĩ|ị|ò|ó|ỏ|õ|ọ|ồ|ố|ổ|ỗ|ộ|ờ|ớ|ở|ỡ|ợ|ù|ú|ủ|ũ|ụ|ừ|ứ|ử|ữ|ự|ỳ|ý|ỷ|ỹ|ỵ|À|Á|Ả|Ã|Ạ|Ầ|Ấ|Ẩ|Ẫ|Ậ|Ằ|Ắ|Ẳ|Ẵ|Ặ|È|É|Ẻ|Ẽ|Ẹ|Ề|Ế|Ể|Ễ|Ệ|Ì|Í|Ỉ|Ĩ|Ị|Ò|Ó|Ỏ|Õ|Ọ|Ồ|Ố|Ổ|Ỗ|Ộ|Ờ|Ớ|Ở|Ỡ|Ợ|Ù|Ú|Ủ|Ũ|Ụ|Ừ|Ứ|Ử|Ữ|Ự|Ỳ|Ý|Ỷ|Ỹ|Ỵ".split(
        '|')
    # Pair the two lists position by position.
    return dict(zip(char1252, charutf8))
 
# Đưa toàn bộ dữ liệu qua hàm này để chuẩn hóa lại
def covert_unicode(txt):
    """Replace every key of the ``loaddicchar()`` table found in ``txt`` by its value.

    The search pattern is built from the table's own keys (in table order)
    instead of a separately maintained copy of the same list, so a matched
    sequence is always present in the table.

    Parameters
    ----------
    txt : str
        Text to normalise.

    Returns
    -------
    str
        ``txt`` with each matched sequence substituted.
    """
    dicchar = loaddicchar()
    # Alternation of every key; dict order is the order of the original list.
    pattern = '|'.join(regex.escape(sequence) for sequence in dicchar)
    return regex.sub(pattern, lambda match: dicchar[match.group()], txt)

In [None]:
def process_special_word(text):
    """Glue the negation word 'không' to the word that follows it.

    Each 'không' token is joined with its successor by an underscore
    ('không thích' -> 'không_thích'); the successor is consumed, and a
    trailing 'không' with nothing after it is kept as is.

    Parameters
    ----------
    text : str
        Whitespace-separated text.

    Returns
    -------
    str
        When 'không' occurs: the tokens re-joined with single spaces.
        Otherwise: ``text`` stripped at both ends only (inner spacing kept).
    """
    tokens = text.split()
    if 'không' not in tokens:
        return text.strip()

    merged = []
    total = len(tokens)
    pos = 0
    while pos < total:
        current = tokens[pos]
        if current != 'không':
            merged.append(current)
            pos += 1
            continue
        # Attach the following token, if there is one, and skip over it.
        if pos + 1 < total:
            current = current + '_' + tokens[pos + 1]
        merged.append(current)
        pos += 2
    return ' '.join(merged)

In [None]:
def process_postag_thesea(text):
    """Keep only the words whose POS tag belongs to a fixed whitelist.

    Each sentence from ``sent_tokenize`` has its dots removed, is segmented
    with ``word_tokenize(..., format="text")``, passed through
    ``process_special_word`` and tagged with ``pos_tag``.  Words whose tag is
    not whitelisted are dropped.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The kept words separated by single spaces.
    """
    lst_word_type = ['N','Np','A','AB','V','VB','VY','R']
    # lst_word_type = ['A','AB','V','VB','VY','R']

    # Tags are compared upper-cased, so the whitelist has to be upper-cased as
    # well; otherwise the mixed-case entry 'Np' could never match.
    allowed_tags = {tag.upper() for tag in lst_word_type}

    new_document = ''
    for sentence in sent_tokenize(text):
        sentence = sentence.replace('.','')
        # POS tag: each item is indexed as (word, tag).
        segmented = word_tokenize(sentence, format="text")
        tagged = pos_tag(process_special_word(segmented))
        sentence = ' '.join(word[0] if word[1].upper() in allowed_tags else '' for word in tagged)
        new_document = new_document + sentence + ' '
    # Collapse excess blank space.
    new_document = regex.sub(r'\s+', ' ', new_document).strip()
    return new_document

In [None]:
def remove_stopword(text, stopwords):
    """Remove every whitespace-separated token of ``text`` found in ``stopwords``.

    Parameters
    ----------
    text : str
    stopwords : container supporting ``in`` (list, set, ...)

    Returns
    -------
    str
        Remaining tokens separated by single spaces, without leading or
        trailing blanks.
    """
    kept = []
    for token in text.split():
        # A removed token leaves an empty slot; the gap is collapsed below.
        kept.append('' if token in stopwords else token)
    joined = ' '.join(kept)
    return regex.sub(r'\s+', ' ', joined).strip()

In [5]:
# Function to analyze one continuous variable
def oneContAnalysis(col):
    """Print descriptive statistics and draw distribution plots for one numeric column.

    Printed: describe(), median, mode, range, Q1/Q3, IQR, variance, standard
    deviation, skewness and kurtosis (each with a one-line reading), then the
    number and share of values strictly outside the 1.5*IQR fences.
    Drawn: a distplot next to a histogram, then a boxplot.

    Parameters
    ----------
    col : pandas.Series
        Numeric column (Series methods such as ``describe`` and ``skew`` are used).

    Returns
    -------
    None
    """
    print('\n===================== Các chỉ số thống kê =========================')
    print('\nStats:    ', col.describe(include='all'))
    print('\nMedian:   ', col.median())
    print('Mode:     ', col.mode())
    print('Range:    ', np.ptp(col))

    lower_quartile = np.percentile(col, 25)
    upper_quartile = np.percentile(col, 75)
    print('Q1 = ', lower_quartile)
    print('Q3 = ', upper_quartile)
    spread = scipy.stats.iqr(col)
    print('IQR:      ', spread)
    print('Variance: ', col.var())
    print('Std:      ', col.std())

    skewness = round(col.skew(), 2)
    print('Skewness: ', skewness)
    # NOTE(review): a skewness of exactly 0 falls into the "left-skewed" message.
    print('\t\tPhân phối lệch phải' if skewness > 0 else '\t\tPhân phối lệch trái')

    kurt = round(col.kurtosis(), 2)
    print('Kurtosis: ', kurt)
    # NOTE(review): a kurtosis of exactly 0 falls into the "less peaked" message.
    print('\t\t phân phối nhọn hơn phân phối chuẩn' if kurt > 0
          else '\t\t phân phối ít nhọn hơn phân phối chuẩn')

    print('\n======= Visualization========\n### Histogram')
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    sns.distplot(col)
    plt.subplot(1, 2, 2)
    plt.hist(col)
    plt.show()

    print('### Boxplot')
    plt.figure(figsize=(4,8))
    plt.boxplot(col)
    plt.show()

    # Tukey fences; values exactly on a fence are not counted.
    upper_fence = upper_quartile + 1.5*spread
    lower_fence = lower_quartile - 1.5*spread
    n_high = (col > upper_fence).sum()
    n_low = (col < lower_fence).sum()
    print('# Number of upper outliers: ', n_high)
    print('# Number of lower outliers: ', n_low)
    print('# Percentage of ouliers:    ', (n_high + n_low)/col.shape[0])

    
# Function to analyze one categorical variable
def oneCategoryAnalysis(col):
    """Print the frequency of each category of ``col`` and draw it as a bar chart.

    Parameters
    ----------
    col : pandas.Series
        Categorical column; it is grouped by its own values.

    Returns
    -------
    None
    """
    frequencies = col.groupby(col).size()
    print(frequencies)
    plt.figure(figsize=(10, 5))
    frequencies.plot.bar()

In [6]:
# Function to analyze relationship between categorical-continuous variables.
def contCatAnalysis(cat, cont):
    """Compare a continuous variable across the levels of a categorical one.

    Draws a boxplot per category, prints a type-2 one-way ANOVA table fitted
    with the formula ``cont ~ C(cat)`` and prints the Tukey HSD pairwise
    comparison at alpha = 0.05.

    Parameters
    ----------
    cat : array-like (typically a pandas Series)
        Category of each observation.
    cont : array-like (typically a pandas Series)
        Continuous value of each observation, in the same order as ``cat``.

    Returns
    -------
    None
    """
    # Build the model frame with the exact column names used in the formula
    # and with the original dtypes.  (Stacking the two Series as rows and
    # transposing gives object-dtype columns named after the Series, so the
    # formula terms would not refer to this frame at all.)
    # Indexes are dropped so the two inputs are paired by position.
    data = pd.DataFrame({
        'cat': pd.Series(cat).reset_index(drop=True),
        'cont': pd.Series(cont).reset_index(drop=True),
    })

    plt.figure(figsize=(12,10))
    sns.boxplot(x=cat, y = cont, palette="Set3")
    plt.show()

    model = ols('cont ~ C(cat)', data=data).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print(anova_table)

    # Multiple pairwise comparison (Tukey HSD).
    m_comp = pairwise_tukeyhsd(endog=cont,
                               groups=cat,
                               alpha=0.05)
    print('\n', m_comp)


In [7]:
# Function to analyze relationship between two categorical variables.

def catCatAnalysis(cat1, cat2):                      # takes 2 columns
    """Cross-tabulate two categorical columns and run a chi-square independence test.

    Prints the contingency table, draws it as a stacked bar chart, then prints
    the degrees of freedom, the p-value, the critical value at 95 % and the
    decision at alpha = 0.05 (H0: the two variables are independent).

    Parameters
    ----------
    cat1, cat2 : array-like (typically pandas Series)
        The two categorical columns, aligned observation by observation.

    Returns
    -------
    None
    """
    # Contingency table. H0: the two variables are independent.
    table_FB = pd.crosstab(cat1, cat2)
    print(table_FB)

    # Draw on an explicit axes: DataFrame.plot opens its own figure when no
    # ``ax`` is given, which would leave a blank 12x10 figure behind.
    fig, ax = plt.subplots(figsize=(12,10))
    table_FB.plot(kind='bar', stacked=True, ax=ax)
    plt.show()

    # Chi-square test
    stat, p, dof, expected = chi2_contingency(table_FB)
    print('dof=%d' % dof)
    print('p=', p)

    # Interpret the test statistic against the critical value.
    prob = 0.95
    critical = chi2.ppf(prob, dof)
    print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

    # Interpret the p-value.
    alpha = 1.0 - prob
    print('significance=%.3f, p=%.3f' % (alpha, p))
    if p <= alpha:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')


In [8]:
############### Function phân tích mối quan hệ continuous-continuous

def contContAnalysis(v1, v2, data=None):             # names of 2 columns
    """Print the correlation matrix of two columns and draw their pair plot.

    Parameters
    ----------
    v1, v2 : str
        Column names.
    data : pandas.DataFrame, optional
        Frame holding both columns.  When omitted, the notebook-level global
        ``df`` is used, as before.

    Returns
    -------
    None

    Raises
    ------
    NameError
        If ``data`` is omitted and no global ``df`` exists.
    """
    print('\n#', v1, 'VS', v2, '\n')
    if data is None:
        try:
            data = df
        except NameError:
            raise NameError(
                "contContAnalysis: pass data=<DataFrame> or define a global 'df' first"
            ) from None
    pair = data[[v1, v2]]
    print(pair.corr())
    sns.pairplot(pair)


In [9]:
# # Apply SelectKBest class to extract all best features

# bestfeatures = SelectKBest(score_func=f_regression, k=2)
# fit = bestfeatures.fit(X,y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)
# # Concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# # Sorting in descending order 
# featureScores.sort_values("Score", ascending = False, inplace = True)
# print(featureScores)  

# # Correlation Matrix with Heatmap
# corrmat = df.corr()
# top_corr_features = corrmat.index
# print(corrmat)

# plt.figure(figsize=(15,7))
# # plot heat map
# g=sns.heatmap(df[top_corr_features].corr(),cmap="RdYlGn", annot=True) 
# # annot=True: nếu muốn in cả giá trị
# plt.show()


In [10]:
######## dummies, scaler, polynomials......

# X = pd.get_dummies(data = X, drop_first = True)
# y = df['loan_status'].apply(lambda x: 0 if (x == 'PAIDOFF') else 1)

# scaler = MinMaxScaler()
# X[['Principal','terms','age']] = scaler.fit_transform(X[['Principal','terms','age']])

# pf = PolynomialFeatures(degree=2)
# X = pf.fit_transform(X)

In [11]:
def linearModel(X, y):    # X DataFrame, y series
    """Fit a linear regression on a 75/25 split and report its quality.

    Prints R-squared, MSE and MAE on the train and test parts plus the fitted
    coefficients and intercept, then draws a predicted-vs-actual scatter for
    the test part and KDE plots of actual vs predicted values for both parts.

    Parameters
    ----------
    X : feature table (DataFrame per the inline note)
    y : target values (Series per the inline note)

    Returns
    -------
    LinearRegression
        The fitted model.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    # R^2 on both parts
    print('The train R-squared:', model.score(X_train, y_train))
    print('The test R-squared:', model.score(X_test, y_test))

    train_mse = mean_squared_error(y_true=y_train, y_pred=pred_train)
    test_mse = mean_squared_error(y_true=y_test, y_pred=pred_test)
    train_mae = mean_absolute_error(y_true=y_train, y_pred=pred_train)
    test_mae = mean_absolute_error(y_true=y_test, y_pred=pred_test)

    print('\nThe MSE of y and predicted in train:', train_mse)
    print('The MSE of y and predicted in test:', test_mse)
    print('The MAE of y and predicted in train:', train_mae)
    print('The MAE of y and predicted in test:', test_mae)

    print('\nSlope: ', model.coef_)
    print('Intercept: ', model.intercept_)

    # Predicted vs actual on the test part, with the y = x reference line
    # spanning the range of the full target.
    diagonal = [np.min(y), np.max(y)]
    plt.figure(figsize=(5,5))
    plt.scatter(pred_test, y_test)
    plt.xlabel('Model Predictions - y_predict')
    plt.ylabel('True values - y_test')
    plt.plot(diagonal, diagonal, color='r')
    plt.show()

    _, (train_ax, test_ax) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))

    # train
    sns.kdeplot(y_train, color='r', ax=train_ax)
    sns.kdeplot(pred_train, color='b', ax=train_ax)
    train_ax.set_title('Actual vs Predicted in Train values')

    # test
    sns.kdeplot(y_test, color='r', label='Actual Values', ax=test_ax)
    sns.kdeplot(pred_test, color='b', label='Predicted Values', ax=test_ax)
    test_ax.set_title('Actual vs Predicted in Test values')
    plt.legend()
    plt.show()

    return model
    

In [12]:
def visualize_model_reg(y,y_pred):
    """Plot predictions against true values and compare their distributions.

    Left panel: scatter of ``y_pred`` (x) vs ``y`` (y) with the y = x
    reference line drawn over the joint range of both inputs.
    Right panel: KDE of the true values (red) and of the predictions (blue).

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Model predictions, same length as ``y``.

    Returns
    -------
    None
    """
    # The reference line spans the data actually plotted.
    low = min(np.min(y), np.min(y_pred))
    high = max(np.max(y), np.max(y_pred))

    plt.figure(figsize=(12,6))
    plt.subplot(121)
    plt.scatter(y_pred,y)
    plt.xlabel('Model Predictions')
    plt.ylabel('True Value')
    plt.plot([low, high], [low, high], '-', color='r')

    # kdeplot replaces the deprecated distplot(hist=False).
    ax = plt.subplot(122)
    sns.kdeplot(y, color='r', label='True Value', ax=ax)
    sns.kdeplot(y_pred, color='b', label='Model Predictions', ax=ax)
    ax.set_xlabel('Distribution')
    plt.legend()
    plt.show()

In [13]:
def static_score_model_reg(y,y_pred):
    """Return the regression scores of ``y_pred`` against ``y``.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    tuple
        ``(r2, mse, mae)``: R-squared, mean squared error, mean absolute error.
    """
    return (
        r2_score(y,y_pred),
        mean_squared_error(y,y_pred),
        mean_absolute_error(y,y_pred),
    )

In [14]:
# visualize_model_reg(df['Actual Value'],df['Predict Value'])

In [15]:
# r2, mse, mae = static_score_model_reg(df['Actual Value'],df['Predict Value'])
# print('Model score:',r2)
# print('Model MSE:',mse)
# print('Model MAE:',mae)

In [16]:
# slope = model.coef_
# slope

In [17]:
# intercept = model.intercept_
# intercept

In [18]:
# x test data....list
# y_predict = model.predict(np.array(x).reshape(1, -1))

In [19]:
###############################################################################################################################

In [20]:
def logisticModel(X, y):
    """Fit a logistic regression on a 70/30 split and report classification diagnostics.

    Prints train/test accuracy, the confusion matrix, the classification
    report and the test ROC AUC, then draws the ROC curve and a heat map of
    the confusion matrix.

    The ROC/AUC part scores with column 1 of ``predict_proba``, i.e. it
    assumes a two-class target; ``logisticModelMulti_y`` is the variant
    without the ROC part.

    Parameters
    ----------
    X : feature table
    y : class labels

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

    model = LogisticRegression(solver = 'lbfgs', multi_class='multinomial')
    model.fit(X_train, y_train)

    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    # Probability of the class in column 1, used for ROC / AUC on the test part.
    positive_proba_test = model.predict_proba(X_test)[:, 1]

    print('\nTrain:', accuracy_score(y_train, yhat_train))
    print('Test:', accuracy_score(y_test, yhat_test))

    # model.score() of a classifier is the mean accuracy (same figures as above).
    print('\nThe train R-squared (accuracy_score):', model.score(X_train, y_train))
    print('The test R-squared (accuracy_score):', model.score(X_test, y_test))

    # confusion matrix
    cm = confusion_matrix(y_true=y_test, y_pred=yhat_test)
    print('\nConfusion Matrix:\n', cm)

    # target_names = [....how many classification..]
    print(classification_report(y_true=y_test, y_pred=yhat_test))  # target_names = target_names

    # Report the AUC instead of computing it and throwing the value away.
    auc_value = roc_auc_score(y_true=y_test, y_score=positive_proba_test)
    print('ROC AUC (test):', auc_value)

    fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=positive_proba_test)

    plt.figure(figsize=(6,6))
    plt.plot([0,1], [0,1], 'r--')
    plt.plot(fpr, tpr, marker='.')
    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    cm_df = pd.DataFrame(cm) # index = target_names, columns = target_names
    plt.figure(figsize = (8,6))
    sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues')
    plt.title('Logistic Regression\nAccuracy: {0:.3f}'.format(accuracy_score(y_test, yhat_test)))
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')
    plt.show()

    return model

In [21]:
def logisticModelMulti_y(X, y):
    """Fit a multinomial logistic regression on an 80/20 split and report it.

    Prints train/test accuracy (``model.score``), the confusion matrix and the
    classification report of the test part, then draws the confusion matrix as
    a heat map.  No ROC curve is produced.

    Parameters
    ----------
    X : feature table
    y : class labels

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    model = LogisticRegression(solver = 'lbfgs', multi_class='multinomial')
    model.fit(X_train, y_train)

    # Predictions for both parts (only the test ones are reported below).
    yhat_train = model.predict(X_train)
    yhat_test = model.predict(X_test)

    # model.score() of a classifier is the mean accuracy.
    train_accuracy = model.score(X_train, y_train)
    print('\nThe train R-squared (accuracy_score):', train_accuracy)
    test_accuracy = model.score(X_test, y_test)
    print('The test R-squared (accuracy_score):', test_accuracy)

    # confusion matrix
    cm = confusion_matrix(y_true=y_test, y_pred=yhat_test)
    print('\nConfusion Matrix:\n', cm)

    # target_names = [....how many classification..]
    print(classification_report(y_true=y_test, y_pred=yhat_test))  # target_names = target_names

    heatmap_title = 'Logistic Regression\nAccuracy: {0:.3f}'.format(accuracy_score(y_test, yhat_test))
    plt.figure(figsize = (8,6))
    sns.heatmap(pd.DataFrame(cm), annot=True, fmt='g', cmap='Blues')  # index/columns = target_names
    plt.title(heatmap_title)
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')
    plt.show()

    return model

In [22]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

def Static_score_model_class(y, y_pred, aver=None):
    """Return the classification scores of ``y_pred`` against ``y``.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    aver : optional
        Forwarded as ``average=`` to precision, recall and F1.

    Returns
    -------
    tuple
        ``(accuracy, recall, precision, f1)`` -- note that recall comes
        before precision.
    """
    accuracy = accuracy_score(y, y_pred)
    averaged = {}
    for label, scorer in (('precision', precision_score),
                          ('recall', recall_score),
                          ('f1', f1_score)):
        averaged[label] = scorer(y, y_pred, average = aver)
    return accuracy, averaged['recall'], averaged['precision'], averaged['f1']

# print('Accuracy: ', Res[0])
# print('Recall: ', Res[1])
# print('Precision: ', Res[2])
# print('f1-Score: ', Res[3])

# # target_names = [....how many classification..]
#     print(classification_report(y_true=y_test, y_pred=yhat_test))  # target_names = target_names

In [23]:
def Visualize_confusion_matrix(y, y_pred, model_name='Logistic Regression'):
    """Draw the confusion matrix of ``y_pred`` against ``y`` as a heat map.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    model_name : str, optional
        First line of the figure title.  Defaults to 'Logistic Regression',
        the title previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    None
    """
    cm = confusion_matrix(y, y_pred)
    cm_df = pd.DataFrame(cm)                 # index=target_names, columns=target_names

    plt.figure(figsize=(6,5))
    sns.heatmap(cm_df, annot=True, fmt='g', cmap='Blues')

    plt.title('{0}\nAccuracy: {1:.3f}'.format(model_name, accuracy_score(y, y_pred)))
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')
    plt.show()
    return

In [24]:
def Visualize_data(X1, X2, y, title):
    """Scatter two features against each other, coloured by class.

    Parameters
    ----------
    X1 : array-like
        Values for the x axis.
    X2 : array-like
        Values for the y axis.
    y : array-like
        Class of each point, used as the hue.
    title : str
        Figure title.

    Returns
    -------
    None
    """
    plt.figure(figsize = (6,5))
    # x/y are passed by keyword: recent seaborn releases reject them as
    # positional arguments.  The former cmap='Sequential' keyword is dropped
    # because it is not a matplotlib colormap name; colours come from ``hue``.
    sns.scatterplot(x=X1, y=X2, hue=y)

    plt.title(title)
    plt.show()
    return
# from mpl_toolkits.mplot3d import Axes3D                       classification: X 3 inputs
# fig = plt.figure(figsize=(6,6))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(df['x1'], df['x2'], df['class'], c=df['class'])
# plt.show()

In [25]:
# vẽ đường ROC
from sklearn.metrics import roc_auc_score, roc_curve

def ROC_AUC(y, y_prob):
    """Draw the ROC curve of ``y_prob`` against ``y`` with the AUC in the legend.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_prob : array-like
        Scores or probabilities of the positive class.

    Returns
    -------
    None
    """
    # Curve points and area under the curve
    false_pos_rate, true_pos_rate, _ = roc_curve(y, y_prob)
    area = roc_auc_score(y, y_prob)

    plt.figure(figsize=(6,6))
    # Diagonal = classifier with no skill.
    plt.plot([0,1], [0,1], linestyle='--', label='No Skill' )
    plt.plot(false_pos_rate, true_pos_rate, marker='.', label='Model - AUC%.3f' % (area))

    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show(block=False)

In [26]:
from sklearn.metrics import auc, precision_recall_curve   # when unbalanced 0 and 1....resample ???
def Precision_Recall_AUC(y, y_prob):
    """Draw the precision-recall curve of ``y_prob`` against ``y`` with its AUC.

    The dashed "No Skill" line is drawn at the share of positives (``y == 1``).

    Parameters
    ----------
    y : array-like
        True binary labels (list, array or Series).
    y_prob : array-like
        Scores or probabilities of the positive class.

    Returns
    -------
    None
    """
    precision, recall, _ = precision_recall_curve(y, y_prob)
    model_auc = auc(recall, precision)

    # Share of positives; going through an array lets plain lists work too.
    ns = np.mean(np.asarray(y) == 1)
    plt.plot([0,1], [ns,ns], linestyle='--', label='No Skill' )
    plt.plot(recall, precision, marker='.', label='Model - AUC%.3f' % (model_auc))

    plt.title('Precision_Recall Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.show()
    return

### Determine the optimal threshold (Xác định ngưỡng tối ưu)

In [27]:
def Threshold_ROC(y,y_prob):
    """Pick the ROC threshold that maximises ``tpr - fpr``.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_prob : array-like
        Scores or probabilities of the positive class.

    Returns
    -------
    tuple
        ``(threshold, score)``: the selected threshold and its ``tpr - fpr``.
    """
    false_pos_rate, true_pos_rate, cutoffs = roc_curve(y, y_prob)
    gain = true_pos_rate - false_pos_rate
    best = np.argmax(gain)
    return cutoffs[best], gain[best]

In [28]:
def Threshold_PrecisionRecall(y, y_prob):
    """Pick the precision-recall threshold that maximises the F1 score.

    Parameters
    ----------
    y : array-like
        True binary labels.
    y_prob : array-like
        Scores or probabilities of the positive class.

    Returns
    -------
    tuple
        ``(threshold, f1)``: the selected threshold and its F1 score.
    """
    precision, recall, threshold = precision_recall_curve(y, y_prob)

    # precision/recall carry one trailing point (precision 1, recall 0) that
    # has no threshold; keep only the points that can actually be returned.
    precision = precision[:len(threshold)]
    recall = recall[:len(threshold)]

    # F1 is defined as 0 where precision + recall == 0.  A plain division
    # yields NaN there, and argmax would then select that NaN entry.
    denominator = precision + recall
    score = np.zeros_like(denominator, dtype=float)
    np.divide(2 * precision * recall, denominator, out=score, where=denominator > 0)

    pos = np.argmax(score)
    return threshold[pos], score[pos]

In [29]:
import pickle

def Save_Object(obj,filename):
    """Pickle ``obj`` into ``filename`` (the file is created or overwritten).

    Parameters
    ----------
    obj : any picklable object
    filename : str or path-like
        Destination file, opened in binary write mode.

    Returns
    -------
    None
    """
    with open(filename, 'wb') as target:
        pickle.dump(obj, target)

In [30]:
# with open(filename, 'rb') as file:
#     pickle_model = pickle.load(file)