In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from sklearn.model_selection import train_test_split

In [2]:
# Univariate analysis helper for a continuous variable.
def PT_donbien_Continous(df, x):
    """Print descriptive statistics, plot the distribution and count IQR outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the column comes from. Retained for backward compatibility only:
        outliers are now counted on ``x`` directly, so ``df`` is not read.
    x : pandas.Series
        The column's values (the data itself, not the column name).

    Notes
    -----
    Missing values are dropped before every NumPy/SciPy computation and before
    plotting. pandas reductions already skip NaN, whereas ``np.percentile`` and
    the SciPy moment functions propagate it, which previously produced NaN
    quartiles and a wrong skew/kurtosis verdict.
    The outlier ratio is relative to the number of non-missing values.
    Nothing is returned; all results are printed or plotted.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version.
    from scipy import stats

    values = x.dropna()

    print('Thống kê chung:\n', x.describe())
    print('mode = %f' %(x.mode()[0]))
    print('median = %f' %(x.median()))
    print('variance = %.3f' %(x.var()))
    print('standard = %.3f' %(x.std()))
    print('range = %.3f' %(values.max() - values.min()))

    Q1 = np.percentile(values, 25)
    print('Q1 = %.3f' %Q1)

    Q3 = np.percentile(values, 75)
    print('Q3 = %.3f' %Q3)

    IQR = stats.iqr(values)
    print('IQR = %.3f' %IQR)

    skew = stats.skew(values)
    if np.isnan(skew):
        # SciPy may return NaN for degenerate input; without this guard the
        # final `else` would misreport it as left-skewed.
        print('Skew = nan => Không xác định')
    elif skew > 0:
        print('Skew = %.3f > 0 => Phân phối lệch phải' %skew)
    elif skew == 0:
        print('Skew = %.3f = 0 => Phân phối chuẩn' %skew)
    else:
        print('Skew = %.3f < 0 => Phân phối lệch trái' %skew)

    kurtosis = stats.kurtosis(values)
    if np.isnan(kurtosis):
        # Same NaN guard as for the skewness above.
        print('Kurtosis = nan => Không xác định')
    elif kurtosis > 0:
        print('Kurtosis = %.3f > 0 => Phân phối nhọn hơn phân phối chuẩn' %kurtosis)
    elif kurtosis == 0:
        print('Kurtosis = %.3f = 0 => Phân phối chuẩn' %kurtosis)
    else:
        print('Kurtosis = %.3f < 0 => Phân phối phẳng hơn phân phối chuẩn' %kurtosis)

    # Distribution plots: log-scaled histogram (left) and density + KDE (right)
    plt.figure(figsize = (10,5))
    plt.subplot(1,2,1)
    plt.hist(values, log=True)
    plt.subplot(1,2,2)
    if hasattr(sns, 'histplot'):
        # `distplot` is deprecated; this is its documented replacement.
        sns.histplot(values, kde=True, stat='density')
    else:
        # Older seaborn releases that predate `histplot`.
        sns.distplot(values)
    plt.show()

    # Box plot
    plt.figure(figsize = (4,8))
    plt.boxplot(values)
    plt.show()

    # Tukey fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
    upper_fence = Q3 + 1.5*IQR
    lower_fence = Q1 - 1.5*IQR

    # Number of outliers above the upper fence
    n_Outlier_upper = int((values > upper_fence).sum())
    print('Số lượng Outlier trên IQR: %d' %(n_Outlier_upper))

    # Number of outliers below the lower fence
    n_Outlier_lower = int((values < lower_fence).sum())
    print('Số lượng Outlier dưới IQR: %d' %(n_Outlier_lower))

    # Share of outliers among the non-missing values
    Outlier_percent = (n_Outlier_upper + n_Outlier_lower)/len(values)
    print('Tỷ lệ của Outlier trong biến: %.3f' %(Outlier_percent))

In [3]:
# Univariate analysis helper for a categorical variable.
def PT_donbien_Categorical(df,cols):
    """Print how often each value occurs in ``cols`` and draw the counts as bars.

    ``df`` is accepted but not read here. ``cols`` is the column's data and
    must provide ``value_counts()`` (e.g. a pandas Series). Nothing is
    returned; the counts are printed and the chart is shown.
    """
    category_frequencies = cols.value_counts()
    print('\nCác giá trị duy nhất của biến: \n',category_frequencies)

    # Apply seaborn's default theme before drawing the bar chart.
    sns.set()
    category_frequencies.plot(kind='bar')
    plt.show()

In [4]:
def Tim_K_Classification(X_train, y_train, X_test, y_test):
    """Scan K for a k-nearest-neighbours classifier and report the best one.

    A ``KNeighborsClassifier`` is fitted for every K in
    ``range(2, int(sqrt(n_train)))``; its accuracy on the test data is printed
    and finally plotted against K.

    Parameters
    ----------
    X_train, y_train : training features / labels (``y_train`` needs ``.shape``).
    X_test, y_test : data on which each K is scored.

    Returns
    -------
    int
        The K with the highest test accuracy (the smallest such K on ties).

    Raises
    ------
    ValueError
        If the training set has fewer than 9 rows, so no candidate K exists.

    Notes
    -----
    NOTE(review): K is selected on the same test data whose accuracy is
    reported, so the reported score is optimistic; consider a validation split.
    """
    from sklearn.neighbors import KNeighborsClassifier

    list_k = list(range(2, int(y_train.shape[0]**0.5)))
    if not list_k:
        raise ValueError(
            "Need at least 9 training samples to scan K in range(2, int(sqrt(n)))"
        )

    list_score = []
    for K_value in list_k:
        neigh = KNeighborsClassifier(n_neighbors = K_value)
        neigh.fit(X_train, y_train)
        # `score` already predicts on X_test; no separate predict() call needed.
        score = neigh.score(X_test, y_test)
        list_score.append(score)
        # `score` is a fraction in [0, 1]; scale it so the '%' sign is correct.
        print("Accuracy is %.2f%% for K-Value: %d" % (score*100, K_value))

    vi_tri = list_score.index(max(list_score))
    k = list_k[vi_tri]
    print("\nThe optimal number of neighbors is", k, "with", list_score[vi_tri])
    plt.plot(list_k, list_score)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('Test Accuracy')
    plt.show()
    return k

In [5]:
def Tim_K_Regression(X_train, y_train, X_test, y_test):
    """Scan K for a k-nearest-neighbours regressor and report the best one.

    A ``KNeighborsRegressor`` is fitted for every K in
    ``range(2, int(sqrt(n_train)))``; its ``score`` on the test data (the
    coefficient of determination R^2, not an accuracy) is printed and finally
    plotted against K.

    Parameters
    ----------
    X_train, y_train : training features / targets (``y_train`` needs ``.shape``).
    X_test, y_test : data on which each K is scored.

    Returns
    -------
    int
        The K with the highest test R^2 (the smallest such K on ties).

    Raises
    ------
    ValueError
        If the training set has fewer than 9 rows, so no candidate K exists.

    Notes
    -----
    NOTE(review): K is selected on the same test data whose score is reported,
    so the reported score is optimistic; consider a validation split.
    """
    from sklearn.neighbors import KNeighborsRegressor

    list_k = list(range(2, int(y_train.shape[0]**0.5)))
    if not list_k:
        raise ValueError(
            "Need at least 9 training samples to scan K in range(2, int(sqrt(n)))"
        )

    list_score = []
    for K_value in list_k:
        neigh = KNeighborsRegressor(n_neighbors = K_value)
        neigh.fit(X_train, y_train)
        # `score` already predicts on X_test; no separate predict() call needed.
        score = neigh.score(X_test, y_test)
        list_score.append(score)
        # A regressor's score is R^2, which is neither an accuracy nor a percentage.
        print("R^2 score is %.4f for K-Value: %d" % (score, K_value))

    vi_tri = list_score.index(max(list_score))
    k = list_k[vi_tri]
    print("\nThe optimal number of neighbors is", k, "with", list_score[vi_tri])
    plt.plot(list_k, list_score)
    plt.xlabel('Number of Neighbors K')
    plt.ylabel('Test R^2')
    plt.show()
    return k

In [6]:
def select_bestmodel_Classifier(x,y):
    """Compare several classifiers over repeated random 80/20 splits.

    Each model is refitted on 30 independent ``train_test_split`` draws
    (``test_size=0.2``). For every draw the train score, test score, their
    absolute gap and the fit time are recorded; the per-model means are
    returned.

    Parameters
    ----------
    x : features accepted by ``train_test_split`` / the estimators.
    y : class labels.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model_name``, ``score_train_mean``,
        ``score_test_mean``, ``score_abs_mean`` and ``time`` (mean fit time in
        milliseconds).

    Notes
    -----
    The splits are not seeded inside this function, so results vary between
    runs unless the caller seeds NumPy's global random state.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    import time

    models = [
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases; confirm against the installed version.
        LogisticRegression(multi_class='multinomial', solver='saga'),
        GaussianNB(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(criterion='entropy'),
        RandomForestClassifier(n_estimators=200),
        SVC(kernel='rbf')
    ]
    CV=30
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        scores_train = []
        scores_test = []
        abs_scores = []
        time_scores = []
        for _ in range(CV):
            X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)
            started = time.perf_counter()
            model.fit(X_train,y_train)
            # Full elapsed time in ms. `timedelta.microseconds` is only the
            # sub-second component and under-reports fits longer than 1 s.
            elapsed_ms = (time.perf_counter() - started)*1000

            s_train = model.score(X_train,y_train)
            s_test = model.score(X_test, y_test)
            scores_train.append(s_train)
            scores_test.append(s_test)
            abs_scores.append(abs(s_train-s_test))
            time_scores.append(round(elapsed_ms,1))

        entries.append([model_name, np.mean(scores_train), np.mean(scores_test), np.mean(abs_scores), np.mean(time_scores)])

    cv_df = pd.DataFrame(entries, columns=['model_name','score_train_mean','score_test_mean','score_abs_mean','time'])
    return cv_df

In [7]:
def select_bestmodel_Regression(x,y):
    """Compare several regressors over repeated random 80/20 splits.

    Each model is refitted on 30 independent ``train_test_split`` draws
    (``test_size=0.2``). For every draw the train score, test score, their
    absolute gap and the fit time are recorded; the per-model means are
    returned. Every model here is a regressor, so all scores are R^2 and
    comparable with one another.

    ``GaussianNB`` is intentionally not part of the comparison: it is a
    classifier whose ``score`` is accuracy, so its row could not be compared
    with the R^2 values of the other models.

    Parameters
    ----------
    x : features accepted by ``train_test_split`` / the estimators.
    y : continuous targets.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model_name``, ``score_train_mean``,
        ``score_test_mean``, ``score_abs_mean`` and ``time`` (mean fit time in
        milliseconds).

    Notes
    -----
    The splits are not seeded inside this function, so results vary between
    runs unless the caller seeds NumPy's global random state.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
    import time

    models = [
        LinearRegression(),
        KNeighborsRegressor(),
        DecisionTreeRegressor(),
        RandomForestRegressor(n_estimators=200),
        SVR(kernel='rbf', C=100, gamma=0.1)
    ]
    CV=30
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        scores_train = []
        scores_test = []
        abs_scores = []
        time_scores = []
        for _ in range(CV):
            X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)
            started = time.perf_counter()
            model.fit(X_train,y_train)
            # Full elapsed time in ms. `timedelta.microseconds` is only the
            # sub-second component and under-reports fits longer than 1 s.
            elapsed_ms = (time.perf_counter() - started)*1000

            s_train = model.score(X_train,y_train)
            s_test = model.score(X_test, y_test)
            scores_train.append(s_train)
            scores_test.append(s_test)
            abs_scores.append(abs(s_train-s_test))
            time_scores.append(round(elapsed_ms,1))

        entries.append([model_name, np.mean(scores_train), np.mean(scores_test), np.mean(abs_scores), np.mean(time_scores)])

    cv_df = pd.DataFrame(entries, columns=['model_name','score_train_mean','score_test_mean','score_abs_mean','time'])
    return cv_df

In [8]:
def K_Fold(x,y):
    """Score several classifiers with 30-fold cross-validation.

    Each model is evaluated with ``cross_val_score`` on an unshuffled
    ``KFold(n_splits=30)``; the mean and standard deviation of the fold scores
    are printed per model.

    Parameters
    ----------
    x : features accepted by ``cross_val_score``.
    y : class labels.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model_name``, ``score_mean`` (mean
        fold score rounded to 4 decimals) and ``time`` (wall-clock time of the
        whole cross-validation in milliseconds, rounded to 1 decimal).

    Notes
    -----
    NOTE(review): the folds are contiguous because ``KFold`` is not shuffled;
    if the rows are ordered by label some folds may contain a single class.
    """
    # Imported here so the function works on a fresh kernel: none of these
    # names is imported at module level in this notebook.
    import time
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import SVC
    from sklearn.model_selection import KFold, cross_val_score

    models = [
        LogisticRegression(solver='liblinear'),
        GaussianNB(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=200),
        SVC(kernel='linear')
    ]
    # The splitter holds no per-model state, so one instance serves every model.
    kfold = KFold(n_splits=30)
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        started = time.perf_counter()
        results = cross_val_score(model, x, y, cv=kfold)
        # Full elapsed time in ms. `timedelta.microseconds` is only the
        # sub-second component and under-reports runs longer than 1 s.
        elapsed_ms = (time.perf_counter() - started)*1000
        print('Accuracy: %.2f%% (%.2f%%)' %(results.mean()*100.0, results.std()*100.0))
        # Plain floats (not 0-d arrays) so the resulting columns are numeric.
        entries.append([model_name, round(float(results.mean()),4), round(elapsed_ms,1)])

    cv_df_2 = pd.DataFrame(entries, columns=['model_name','score_mean','time'])

    return cv_df_2


In [9]:
def save_model(model, model_name):
    """Serialize ``model`` with pickle into the file at path ``model_name``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten. Returns ``None``.
    """
    from pickle import dump

    with open(model_name, 'wb') as out_stream:
        dump(model, out_stream)

In [10]:
def read_model(pkl_filename):
    """Load and return the object pickled in the file at ``pkl_filename``.

    WARNING: unpickling can execute arbitrary code. Only call this on files
    from a trusted source (e.g. ones this notebook wrote itself).
    """
    from pickle import load

    with open(pkl_filename, 'rb') as in_stream:
        return load(in_stream)