In [None]:
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.linear_model import LinearRegression, SGDRegressor, LogisticRegression, SGDClassifier
from sklearn.metrics import pairwise_distances_argmin_min




def dist_kern(a,b):
    return(1+np.linalg.norm(a-b))
    
    

def CLoess_classifier(x_train, y_train, x_predict, dist_kernel = dist_kern, training = "standard", No_of_clusters = 10, standard_scale = True):
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_predict = np.array(x_predict)
    
    if len(y_train.shape)==1:
        y_train = y_train.reshape(y_train.shape[0],1)
    
    #Step 1: Cluster
    cluster_model = MiniBatchKMeans(n_clusters = No_of_clusters, batch_size = 1024, max_iter=25)
    cluster_model.fit(x_train)
    list_of_models = []
    
    #Step 2: Train for each cluster
    predictions = np.zeros((x_predict.shape[0], y_train.shape[1]))
    for i in range(No_of_clusters):

        eval_point = cluster_model.cluster_centers_[i]
        
        weights=np.array([])
        for j in range(x_train.shape[0]):
            weights = np.append(weights, 1/dist_kernel(x_train[j,:],eval_point))
        
        
        if(training =="sgd"):
            model= SGDClassifier()
        else:
            model= LogisticRegression(n_jobs = -1, penalty = "none")
        model.fit(x_train, np.ravel(y_train), sample_weight = 1/weights) 
        
        list_of_models.append(model)
        
    #Step 3: Obtain the centroid  closest to the input point
    closest, _ = pairwise_distances_argmin_min(x_predict, cluster_model.cluster_centers_)
    
    #step 4: Predict 
    for k in range(x_predict.shape[0]):
        predictions[k,:] = list_of_models[closest[k]].predict_proba(x_predict[[k],:])[0][1]
    
    return(predictions, cluster_model.cluster_centers_, list_of_models)
    
    
    
    
    
    
    
def CLoess_regressor(x_train, y_train, x_predict, dist_kernel = dist_kern, training = "standard", No_of_clusters = 10, standard_scale = True):
    
    x_train = np.array(x_train)
    y_train = np.array(y_train)
    x_predict = np.array(x_predict)
    
    if len(y_train.shape)==1:
        y_train = y_train.reshape(y_train.shape[0],1)
    
    #Step 1: Cluster
    
    cluster_model = MiniBatchKMeans(n_clusters = No_of_clusters, batch_size = 1024, max_iter=25)
    cluster_model.fit(x_train)
    list_of_models = []
    
    #Step 2: Train for each cluster
    predictions = np.zeros((x_predict.shape[0], y_train.shape[1]))
    for i in range(No_of_clusters):

        eval_point = cluster_model.cluster_centers_[i]
        
        weights=np.array([])
        for j in range(x_train.shape[0]):
            weights = np.append(weights, 1/dist_kernel(x_train[j,:],eval_point))
        
        
        if(training =="sgd"):
            model= SGDRegressor()
        else:
            model= LinearRegression(n_jobs = -1, penalty = "none")
            
        model.fit(x_train, np.ravel(y_train), sample_weight = 1/weights) 
        
        list_of_models.append(model)
        
    #Step 3: Obtain the centroid  closest to the input point
    closest, _ = pairwise_distances_argmin_min(x_predict, cluster_model.cluster_centers_)
    
    #step 4: Predict
    for k in range(x_predict.shape[0]):
        predictions[k,:] = list_of_models[closest[k]].predict(x_predict[[k],:])
    
    return(predictions, cluster_model.cluster_centers_, list_of_models)
       
    


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

#Data from https://www.kaggle.com/c/santander-customer-transaction-prediction/data
X_santander = pd.read_csv('santander-customer-transaction-prediction/train.csv').iloc[:,2:]
Y_santander = pd.read_csv('santander-customer-transaction-prediction/train.csv').iloc[:,1]

X_train, X_val, Y_train, y_val =  sk.model_selection.train_test_split(X_santander,Y_santander,test_size = 0.2, random_state=11 )


val_predicted, centroids , list_of_models = CLoess_classifier(X_train,Y_train,X_val)

In [21]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

model_baseline = LogisticRegression(penalty = 'none', solver = 'sag', max_iter=1000, n_jobs = -1)
model_baseline.fit(scaler.transform(X_train),Y_train)


baseline = model_baseline.predict(scaler.transform(X_val))

print('Logistic regression baseline ROC-AUC Score: {}'.format(roc_auc_score(y_val,baseline)))
print('C-LOESS ROC-AUC Score: {}'.format(roc_auc_score(y_val,val_predicted)))


Logistic regression baseline ROC-AUC Score: 0.6220699029105919
C-LOESS ROC-AUC Score: 0.8475336865599182
