In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Read large data from files.

In [2]:
def readin_large_data(base_dir=None, num_numerical=14740, num_chunks=5):
    """Read the large data set and its labels.

    The feature table is read from the tab-separated chunk files
    ``data_large/orange_large_train.data/orange_large_train.data.chunk<i>``
    for i = 1..num_chunks. Only chunk 1 carries a header row; the remaining
    chunks reuse its column names. Labels are read from
    ``data_large/orange_large_train_<target>.labels`` for the targets
    'upselling', 'churn' and 'appetency' (in that order).

    base_dir: directory that contains ``data_large``. Defaults to the current
        working directory.
    num_numerical: number of leading columns parsed as float64; every column
        after that is parsed as a string.
    num_chunks: number of chunk files to read.
    return: tuple (X, y).
        X is a pandas DataFrame with all chunks stacked row-wise. Row labels
        are not renumbered, so they restart in every chunk.
        y is an int numpy array with one row per label row and 4 columns:
        columns 0-2 are 1 where the corresponding target label equals 1,
        column 3 is 1 where none of the three targets is 1.
    """
    if base_dir is None:
        base_dir = os.getcwd()
    data_root = os.path.join(base_dir, 'data_large')
    chunk_dir = os.path.join(data_root, 'orange_large_train.data')

    def chunk_path(i):
        return os.path.join(chunk_dir, 'orange_large_train.data.chunk' + str(i))

    # The column names are needed up front to build the per-column dtype map.
    with open(chunk_path(1)) as f:
        header = f.readline().strip('\n').split('\t')

    data_type = {key: np.float64 for key in header[:num_numerical]}
    data_type.update({key: str for key in header[num_numerical:]})

    frames = [pd.read_table(chunk_path(1), dtype=data_type)]
    for i in range(2, num_chunks + 1):
        # Pass names=header so the dtype map (keyed by column name) also
        # applies to the header-less chunks.
        frames.append(
            pd.read_table(chunk_path(i), header=None, names=header, dtype=data_type)
        )
    # Concatenate once instead of appending inside the loop
    # (DataFrame.append is gone in pandas >= 2.0 and copies on every call).
    X = pd.concat(frames)

    labels = []
    for target in ['upselling', 'churn', 'appetency']:
        label_path = os.path.join(data_root, 'orange_large_train_' + target + '.labels')
        label = pd.read_csv(label_path, header=None, delimiter='\t')
        # Labels are expected to be 1 / -1; anything that is not 1 counts as negative.
        labels.append(label.values == 1)

    y = np.hstack(labels)
    # Extra last column: 1 for rows where no target is positive.
    y = np.hstack([y, ~np.any(y, axis=1)[:, np.newaxis]])
    y = y.astype(int)

    return X, y

In [None]:
X, y = readin_large_data()

# Report the dimensions of the feature table, then of the label matrix.
for loaded in (X, y):
    print(loaded.shape)

  if self.run_code(code, result):


Split the dataset into training, validation, and test sets in an 8:1:1 ratio.

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
# The fallback keeps the cell working on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 10% as the test set, then take 1/9 of the remaining 90%
# (10% of the whole) as the validation set, giving an 8:1:1 split.
X_, X_test, y_, y_test = train_test_split(X, y, test_size=0.1, random_state=666)
# 1.0 / 9 rather than 1/9 so the fraction is not truncated to 0 by integer division.
X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=1.0 / 9, random_state=888)

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)
print(y_test.shape)

Drop the variables in which half or more of the training values are missing, and fill in the missing values of the remaining variables (training mean for numerical variables, training mode for categorical variables).

In [5]:
# The last NUM_CATEGORICAL columns are the categorical variables; everything
# before them is numerical.
NUM_CATEGORICAL = 260
# A variable is kept only if its training-set missing rate is below this value.
MAX_MISSING_RATE = 0.5

# For numerical data
numerical_train = X_train.iloc[:, :-NUM_CATEGORICAL]
numerical_val = X_val.iloc[:, :-NUM_CATEGORICAL]
numerical_test = X_test.iloc[:, :-NUM_CATEGORICAL]

# Column selection is decided on the training set only and then applied
# unchanged to the validation and test sets.
numerical_missing_rate = numerical_train.isnull().sum() / numerical_train.shape[0]
keep_numerical = numerical_missing_rate < MAX_MISSING_RATE
useful_numerical_train = numerical_train.loc[:, keep_numerical]
useful_numerical_val = numerical_val.loc[:, keep_numerical]
useful_numerical_test = numerical_test.loc[:, keep_numerical]

# Training-set column means, computed once and reused for every split.
numerical_fill = useful_numerical_train.mean()
clean_numerical_train = useful_numerical_train.fillna(numerical_fill)
clean_numerical_val = useful_numerical_val.fillna(numerical_fill)
clean_numerical_test = useful_numerical_test.fillna(numerical_fill)

# For categorical data
categorical_train = X_train.iloc[:, -NUM_CATEGORICAL:]
categorical_val = X_val.iloc[:, -NUM_CATEGORICAL:]
categorical_test = X_test.iloc[:, -NUM_CATEGORICAL:]

categorical_missing_rate = categorical_train.isnull().sum() / categorical_train.shape[0]
keep_categorical = categorical_missing_rate < MAX_MISSING_RATE
useful_categorical_train = categorical_train.loc[:, keep_categorical]
useful_categorical_val = categorical_val.loc[:, keep_categorical]
useful_categorical_test = categorical_test.loc[:, keep_categorical]

# First row of mode() holds one most-frequent training value per column;
# computed once and reused for every split.
categorical_fill = useful_categorical_train.mode().iloc[0]
clean_categorical_train = useful_categorical_train.fillna(categorical_fill)
clean_categorical_val = useful_categorical_val.fillna(categorical_fill)
clean_categorical_test = useful_categorical_test.fillna(categorical_fill)

In [6]:
# Shapes of the selected ("useful") and imputed ("clean") frames,
# each in train / val / test order.
for split_frames in (
    (useful_numerical_train, useful_numerical_val, useful_numerical_test),
    (clean_numerical_train, clean_numerical_val, clean_numerical_test),
    (useful_categorical_train, useful_categorical_val, useful_categorical_test),
    (clean_categorical_train, clean_categorical_val, clean_categorical_test),
):
    for frame in split_frames:
        print(frame.shape)

(40000, 14438)
(5000, 14438)
(5000, 14438)
(40000, 14438)
(5000, 14438)
(5000, 14438)
(40000, 36)
(5000, 36)
(5000, 36)
(40000, 36)
(5000, 36)
(5000, 36)


One-hot encode the categorical data, keeping only the variables whose number of categories (counted on the training rows) is in [2, 100].

In [7]:
def do_one_hot(categorical_data, train_len, threshold=100):
    """One-hot encode the columns that have a usable number of categories.

    categorical_data: DataFrame whose first ``train_len`` rows are the
        training rows.
    train_len: number of leading rows used to count distinct values.
    threshold: largest number of distinct training values a column may have
        and still be encoded.
    return: ``pd.get_dummies`` applied to the columns whose number of
        distinct values among the first ``train_len`` rows lies in
        [2, threshold]. All rows of those columns are encoded.
    """
    train_part = categorical_data.iloc[:train_len]
    kept_positions = [
        pos
        for pos in range(categorical_data.shape[1])
        if 2 <= len(train_part.iloc[:, pos].unique()) <= threshold
    ]
    return pd.get_dummies(categorical_data.iloc[:, kept_positions])
    
n_train = clean_categorical_train.shape[0]
n_val = clean_categorical_val.shape[0]

# Encode train, val and test together so all three share the same dummy
# columns. The third frame must be the TEST frame: the validation frame was
# previously passed twice, which made one_hot_test a copy of the encoded
# validation rows.
one_hot_data = do_one_hot(
    pd.concat(
        [clean_categorical_train, clean_categorical_val, clean_categorical_test],
        ignore_index=True,
    ),
    n_train,
)

# Slice the stacked frame back into the three splits by row position.
one_hot_train = one_hot_data.iloc[:n_train, :]
one_hot_val = one_hot_data.iloc[n_train:n_train + n_val, :]
one_hot_test = one_hot_data.iloc[n_train + n_val:, :]

In [8]:
# Shapes of the one-hot encoded train / val / test frames.
for encoded in (one_hot_train, one_hot_val, one_hot_test):
    print(encoded.shape)

(40000, 442)
(5000, 442)
(5000, 442)


Combine the numerical data and categorical data.

In [9]:
# Give each numerical frame the row labels of its one-hot counterpart so the
# column-wise concat that follows lines the rows up.
for numeric_part, encoded_part in (
    (clean_numerical_train, one_hot_train),
    (clean_numerical_val, one_hot_val),
    (clean_numerical_test, one_hot_test),
):
    numeric_part.index = encoded_part.index

In [10]:
# Place the numerical columns and the one-hot columns side by side for each
# split; ignore_index=True renumbers the resulting columns 0..n-1.
X_train_preprocessed, X_val_preprocessed, X_test_preprocessed = (
    pd.concat([numeric_part, encoded_part], axis=1, ignore_index=True)
    for numeric_part, encoded_part in (
        (clean_numerical_train, one_hot_train),
        (clean_numerical_val, one_hot_val),
        (clean_numerical_test, one_hot_test),
    )
)

In [11]:
# Shapes of the combined train / val / test feature frames.
for combined in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed):
    print(combined.shape)

(40000, 14880)
(5000, 14880)
(5000, 14880)


Standardize the data.

In [12]:
from sklearn import preprocessing  # imported here but not used in this cell

# Per-column statistics come from the training set only.
mean_train = X_train_preprocessed.mean(axis=0)
std_train = X_train_preprocessed.std(axis=0)

# Apply the same shift and scale to every split; the 1e-8 keeps the division
# finite for columns whose training standard deviation is zero.
X_train_scaled, X_val_scaled, X_test_scaled = (
    (split - mean_train) / (std_train + 1e-8)
    for split in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed)
)

Save labels

In [None]:
# Write each label array to its own .npy file in the working directory.
for file_name, label_array in (
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
    ('y_test.npy', y_test),
):
    np.save(file_name, label_array)

Reduce dimensions. (Choose one of the following methods.)

PCA:

In [13]:
from sklearn.decomposition import PCA

LOW_DIMENSION = 1000
# Fixed seed so the projection is reproducible across runs when scikit-learn
# picks a randomized SVD solver for a matrix of this size.
PCA_RANDOM_STATE = 0

# Fit on the training set only, then project every split with the same model.
pca = PCA(n_components=LOW_DIMENSION, random_state=PCA_RANDOM_STATE).fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [14]:
# Shapes of the PCA-projected train / val / test arrays.
for projected in (X_train_pca, X_val_pca, X_test_pca):
    print(projected.shape)

(40000, 1000)
(5000, 1000)
(5000, 1000)


In [15]:
# Save each PCA projection; the file name carries the number of components.
for prefix, projected in (
    ('X_train_pca_', X_train_pca),
    ('X_val_pca_', X_val_pca),
    ('X_test_pca_', X_test_pca),
):
    np.save(prefix + str(LOW_DIMENSION), projected)

Isomap:

In [None]:
from sklearn.manifold import Isomap

# Hyperparameters to experiment with.
LOW_DIMENSION = 2
NUM_NEIGHBORS = 3

# Fit on the training set only, then embed every split with the same model.
isomap = Isomap(n_neighbors=NUM_NEIGHBORS, n_components=LOW_DIMENSION)
isomap = isomap.fit(X_train_scaled)
X_train_iso, X_val_iso, X_test_iso = (
    isomap.transform(split)
    for split in (X_train_scaled, X_val_scaled, X_test_scaled)
)

In [None]:
# Shapes of the Isomap-embedded train / val / test arrays.
for embedded in (X_train_iso, X_val_iso, X_test_iso):
    print(embedded.shape)

In [None]:
# Save each Isomap embedding; the file name carries the number of components
# and the number of neighbors.
for prefix, embedded in (
    ('X_train_iso_', X_train_iso),
    ('X_val_iso_', X_val_iso),
    ('X_test_iso_', X_test_iso),
):
    np.save(prefix + str(LOW_DIMENSION) + '_' + str(NUM_NEIGHBORS), embedded)

LLE:

In [None]:
# The scikit-learn class is LocallyLinearEmbedding; "LocalLinearEmbedding"
# does not exist and the import raised ImportError.
from sklearn.manifold import LocallyLinearEmbedding

# try some configurations of the following hyperparameters.
LOW_DIMENSION = 2
NUM_NEIGHBORS = 3

# Fit on the training set only, then embed every split with the same model.
lle = LocallyLinearEmbedding(n_neighbors=NUM_NEIGHBORS, n_components=LOW_DIMENSION).fit(X_train_scaled)
X_train_lle = lle.transform(X_train_scaled)
X_val_lle = lle.transform(X_val_scaled)
X_test_lle = lle.transform(X_test_scaled)

In [None]:
# Report the LLE embeddings; this cell previously printed the Isomap arrays.
print(X_train_lle.shape)
print(X_val_lle.shape)
print(X_test_lle.shape)

In [None]:
# Save each LLE embedding; the file name carries the number of components
# and the number of neighbors.
for prefix, embedded in (
    ('X_train_lle_', X_train_lle),
    ('X_val_lle_', X_val_lle),
    ('X_test_lle_', X_test_lle),
):
    np.save(prefix + str(LOW_DIMENSION) + '_' + str(NUM_NEIGHBORS), embedded)