## Dependencias

In [2]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf

from sklearn.manifold import MDS
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

from scipy import stats

# Switch cufflinks to its offline mode (presumably so that `.iplot` renders
# locally inside the notebook without a Plotly account; confirm against the
# cufflinks documentation).
cf.go_offline()
# Show every float in DataFrame/Series output with exactly three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

## Lectura de datos

In [3]:
# Location of the training CSV. Set the PURE_ML_TRAIN_CSV environment variable
# to point the notebook at a copy of the data on another machine; when it is
# unset, the original machine-specific path is used so existing runs keep
# working unchanged.
ruta = os.environ.get(
    "PURE_ML_TRAIN_CSV",
    "/media/jose/090f6b94-de30-4aaf-9f8a-4e18b120d7f6/bd/01. Simples/pure_ml/train.csv",
)

In [4]:
# Load the whole CSV at `ruta` into memory using pandas' default parsing options.
df = pd.read_csv(ruta)

In [5]:
# (rows, columns) of the raw frame: a quick sanity check on the load.
df.shape

(9294, 55)

In [6]:
# Preview the first five rows to eyeball column names and typical values.
df.head()

Unnamed: 0,ID,C_01,C_02,C_03,C_04,C_05,C_06,C_07,C_08,C_09,...,C_45,C_46,C_47,C_48,C_49,C_50,C_51,C_52,D_1,TARGET
0,674,41.903,-87.643,41.902,-87.648,41.901,-87.642,41.902,-87.649,41.9,...,1114,607.756,18.212,0.442,12.578,0.089,0.386,0.013,cat11,0.0
1,7397,,,,,,,,,,...,981,800.979,514.138,0.43,16.391,0.07,0.174,0.261,cat01,0.0
2,11558,41.891,-87.641,41.897,-87.651,41.892,-87.646,41.899,-87.652,41.893,...,706,699.008,231.262,0.358,12.857,0.095,0.101,0.055,cat01,0.0
3,463,41.895,-87.665,41.897,-87.657,41.899,-87.667,41.898,-87.659,41.901,...,947,633.39,391.8,0.335,14.844,0.088,0.271,0.155,cat04,0.0
4,2409,41.896,-87.649,41.897,-87.648,41.896,-87.646,41.897,-87.645,41.896,...,917,714.956,80.775,0.454,12.955,0.078,0.149,0.031,cat07,0.0


## Variables del espacio original $\mathbb{R}^{52}$

In [7]:
# Candidate features: every column whose name begins with the 'C_' prefix.
varc = [col for col in df.columns if col.startswith('C_')]
len(varc)

52

In [8]:
# Independent copy of the feature columns, so later steps never touch `df`.
X = df.loc[:, varc].copy()

## Tratamiento de ausentes

In [9]:
# Imputer that replaces missing values with the per-column median; fitting it
# on X learns those medians from the observed (non-missing) values.
im = SimpleImputer(strategy='median')
im.fit(X)

In [10]:
# Apply the fitted imputer and wrap the result in a DataFrame, re-attaching
# the feature names from `varc` as column labels.
Xi = pd.DataFrame(im.transform(X), columns=varc)


In [11]:
# For each feature, run a two-sample Kolmogorov-Smirnov test between the
# imputed column and the originally observed (non-missing) values; a large
# statistic means imputation visibly changed the distribution.
ks = pd.DataFrame(
    [(v, stats.ks_2samp(Xi[v], X[v].dropna()).statistic) for v in varc],
    columns=['var', 'ks'],
).sort_values('ks', ascending=False)

# Features whose KS statistic exceeds 0.1 are discarded from the working set.
fuera = ks.loc[ks['ks'] > 0.1, 'var'].tolist()
varc = [v for v in varc if v not in fuera]
len(varc)

32

In [12]:
# Refit the imputer on the retained features only, so it matches the reduced
# column set used from here on.
im.fit(X[varc])

In [13]:
# Rebuild the imputed frame restricted to the retained features (overwrites
# the earlier, wider `Xi`).
Xi = pd.DataFrame(im.transform(X[varc]), columns=varc)


In [14]:
# Preview the imputed, filtered feature matrix.
Xi.head()

Unnamed: 0,C_17,C_18,C_19,C_20,C_22,C_23,C_24,C_25,C_26,C_27,...,C_42,C_43,C_44,C_45,C_46,C_47,C_49,C_50,C_51,C_52
0,279.8,253.0,636.522,17.2,12.512,0.076,0.458,0.004,592.67,524.0,...,0.406,0.011,1416.46,1114.0,607.756,18.212,12.578,0.089,0.386,0.013
1,424.6,261.0,964.138,492.35,15.684,0.079,0.161,0.287,790.93,493.0,...,0.185,0.257,1504.42,981.0,800.979,514.138,16.391,0.07,0.174,0.261
2,217.58,124.0,690.0,167.55,12.97,0.108,0.048,0.032,498.69,296.0,...,0.104,0.025,1065.77,706.0,699.008,231.262,12.857,0.095,0.101,0.055
3,276.03,190.0,653.368,302.0,15.971,0.078,0.1,0.147,656.13,426.0,...,0.249,0.184,1385.09,947.0,633.39,391.8,14.844,0.088,0.271,0.155
4,257.86,236.0,706.78,277.75,12.613,0.077,0.144,0.059,548.68,482.0,...,0.165,0.034,1067.92,917.0,714.956,80.775,12.955,0.078,0.149,0.031


## Reducción de dimensionalidad

In [15]:
# Shape after dropping NaN rows vs. full shape: identical shapes mean the
# imputed frame has no missing values left.
Xi.dropna().shape,Xi.shape

((9294, 32), (9294, 32))

In [16]:
# Square of the row count, i.e. the number of entries in a full pairwise
# matrix over all rows (presumably the motivation for subsampling before MDS).
len(Xi) ** 2

86378436

In [17]:
# Work on a random subsample of at most 500 rows.
# - random_state pins the draw so the notebook gives the same rows after a
#   kernel restart.
# - min() avoids the ValueError that sample() raises when asked for more rows
#   than the frame holds.
# NOTE: this overwrites Xi; re-run the imputation cell above before re-running
# this one if the full imputed frame is needed again.
Xi = Xi.sample(min(500, len(Xi)), random_state=42).reset_index(drop=True)

In [22]:
# Rescale every feature to [0, 1] so that no variable dominates the pairwise
# distances, then embed the rows in two dimensions with MDS.
sc = MinMaxScaler()
# random_state fixes the MDS initialisation; without it the embedding (and the
# scatter plot below) changes on every run.
mds = MDS(n_components=2, n_jobs=-1, random_state=42)
# fit_transform on the scaler is equivalent to the separate fit + transform
# calls and leaves `sc` fitted for later reuse.
Xm = pd.DataFrame(mds.fit_transform(sc.fit_transform(Xi)), columns=['d1', 'd2'])





In [23]:
# Interactive scatter of the two MDS coordinates. Title and axis labels are
# set so the figure can be understood on its own when the notebook is skimmed.
Xm.iplot(kind='scatter', x='d1', y='d2',
         mode='markers', size=5, color='purple',
         title='Proyección MDS en 2 dimensiones',
         xTitle='d1', yTitle='d2')
