## Dependencias

In [49]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import cufflinks as cf

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from scipy import stats

# Put cufflinks in offline mode before any .iplot() call is made.
cf.go_offline()
# Render every float shown by pandas with three decimal places.
pd.options.display.float_format = lambda valor: '%.3f' % valor

## Lectura de datos

In [7]:
# Directory that holds train.csv. Set the PURE_ML_DATA_DIR environment variable
# to run the notebook on another machine; the default is the author's original
# location, so existing runs are unaffected.
directorio_datos = os.environ.get(
    "PURE_ML_DATA_DIR",
    "/media/jose/090f6b94-de30-4aaf-9f8a-4e18b120d7f6/bd/01. Simples/pure_ml",
)
# Kept as a plain string so any later string handling of `ruta` still works.
ruta = os.path.join(directorio_datos, "train.csv")

In [8]:
# Load the training CSV located at `ruta` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=ruta)

In [9]:
# (rows, columns) of the raw training set.
df.shape

(9294, 55)

In [10]:
# Preview the first rows to check the column layout (ID, C_* features, D_1, TARGET).
df.head()

Unnamed: 0,ID,C_01,C_02,C_03,C_04,C_05,C_06,C_07,C_08,C_09,...,C_45,C_46,C_47,C_48,C_49,C_50,C_51,C_52,D_1,TARGET
0,674,41.902848,-87.64336,41.90154,-87.648066,41.901002,-87.642246,41.901952,-87.649264,41.900381,...,1114,607.755835,18.2125,0.442469,12.577837,0.08944,0.385996,0.013465,cat11,0.0
1,7397,,,,,,,,,,...,981,800.978593,514.1375,0.429966,16.390591,0.069741,0.174312,0.260958,cat01,0.0
2,11558,41.891421,-87.641041,41.89738,-87.651233,41.89219,-87.645894,41.89859,-87.651771,41.892631,...,706,699.008499,231.2625,0.357551,12.856969,0.094956,0.100567,0.055241,cat01,0.0
3,463,41.894619,-87.664955,41.896899,-87.657375,41.899288,-87.667246,41.89767,-87.65884,41.90064,...,947,633.389652,391.8,0.33495,14.844393,0.087593,0.271383,0.155227,cat04,0.0
4,2409,41.896174,-87.649482,41.896501,-87.647639,41.895677,-87.646345,41.896954,-87.644895,41.896052,...,917,714.956332,80.775,0.454336,12.954656,0.07786,0.1494,0.030534,cat07,0.0


## Variables del espacio original $\mathbb{R}^{52}$

In [12]:
# Feature list: every column whose name begins with the "C_" prefix.
varc = [columna for columna in df.columns if columna.startswith('C_')]
# Display how many such columns were found.
len(varc)

52

In [26]:
# Independent copy of the selected feature columns, so later steps never touch `df`.
X = df.loc[:, varc].copy()

## Tratamiento de ausentes

In [28]:
# Imputer that replaces missing values with the per-column median,
# fitted here on every column of X.
im = SimpleImputer(strategy='median')
im.fit(X)

In [29]:
# Imputed copy of X. The imputer returns a bare array, so the column labels and
# X's index are re-attached explicitly; keeping the index guarantees each
# imputed row stays aligned with its source row even if X is ever filtered or
# re-indexed upstream.
Xi = pd.DataFrame(im.transform(X), columns=varc, index=X.index)


In [32]:
# Two-sample Kolmogorov-Smirnov statistic per variable, comparing the imputed
# column against its observed (non-missing) values. A large statistic means the
# median imputation changed the distribution of that variable.
ks = pd.DataFrame(
    [(col, stats.ks_2samp(Xi[col], X[col].dropna()).statistic) for col in varc],
    columns=['var', 'ks'],
).sort_values('ks', ascending=False)
# Variables whose statistic exceeds 0.1 are considered distorted by imputation.
fuera = ks.loc[ks['ks'] > 0.1, 'var'].tolist()
# Keep the remaining variables, preserving their original order.
varc = [col for col in varc if col not in fuera]
len(varc)

32

In [33]:
# Refit the same imputer on the columns currently listed in varc only.
im.fit(X[varc])

In [35]:
# Imputed matrix restricted to the columns in varc. The imputer returns a bare
# array, so the column labels and X's index are re-attached explicitly; keeping
# the index guarantees each imputed row stays aligned with its source row even
# if X is ever filtered or re-indexed upstream.
Xi = pd.DataFrame(im.transform(X[varc]), columns=varc, index=X.index)


In [37]:
# Preview the first rows of the imputed matrix.
Xi.head()

Unnamed: 0,C_17,C_18,C_19,C_20,C_22,C_23,C_24,C_25,C_26,C_27,...,C_42,C_43,C_44,C_45,C_46,C_47,C_49,C_50,C_51,C_52
0,279.8,253.0,636.521739,17.2,12.512055,0.076062,0.458498,0.003953,592.67,524.0,...,0.405978,0.011208,1416.46,1114.0,607.755835,18.2125,12.577837,0.08944,0.385996,0.013465
1,424.6,261.0,964.137931,492.35,15.684291,0.079347,0.16092,0.287356,790.93,493.0,...,0.185083,0.256906,1504.42,981.0,800.978593,514.1375,16.390591,0.069741,0.174312,0.260958
2,217.58,124.0,690.0,167.55,12.970403,0.107815,0.048387,0.032258,498.69,296.0,...,0.104449,0.025145,1065.77,706.0,699.008499,231.2625,12.856969,0.094956,0.100567,0.055241
3,276.03,190.0,653.368421,302.0,15.971474,0.078138,0.1,0.147368,656.13,426.0,...,0.248503,0.184132,1385.09,947.0,633.389652,391.8,14.844393,0.087593,0.271383,0.155227
4,257.86,236.0,706.779661,277.75,12.612754,0.076502,0.144068,0.059322,548.68,482.0,...,0.164993,0.034433,1067.92,917.0,714.956332,80.775,12.954656,0.07786,0.1494,0.030534


## Reducción de dimensionalidad

In [38]:
# Sanity check: if both shapes match, no row of Xi contains a missing value.
(Xi.dropna().shape, Xi.shape)

((9294, 32), (9294, 32))

In [40]:
# Pairwise correlation matrix of the imputed variables, shown with two decimals.
Xi.corr().round(decimals=2)

Unnamed: 0,C_17,C_18,C_19,C_20,C_22,C_23,C_24,C_25,C_26,C_27,...,C_42,C_43,C_44,C_45,C_46,C_47,C_49,C_50,C_51,C_52
C_17,1.0,0.71,0.24,0.42,-0.18,0.31,0.04,-0.05,0.88,0.65,...,0.01,-0.04,0.77,0.58,0.14,0.35,-0.16,0.24,0.01,-0.04
C_18,0.71,1.0,0.21,0.5,-0.44,-0.04,0.07,0.01,0.67,0.93,...,0.05,0.01,0.62,0.85,0.14,0.44,-0.38,-0.03,0.05,0.01
C_19,0.24,0.21,1.0,0.37,-0.06,-0.12,0.0,-0.03,0.23,0.2,...,-0.01,-0.02,0.22,0.2,0.41,0.34,-0.05,-0.13,-0.01,-0.02
C_20,0.42,0.5,0.37,1.0,-0.16,-0.12,0.05,0.05,0.4,0.47,...,0.04,0.05,0.38,0.43,0.2,0.72,-0.14,-0.08,0.03,0.05
C_22,-0.18,-0.44,-0.06,-0.16,1.0,-0.03,-0.08,0.1,-0.15,-0.4,...,-0.06,0.07,-0.13,-0.35,-0.02,-0.13,0.61,-0.01,-0.05,0.07
C_23,0.31,-0.04,-0.12,-0.12,-0.03,1.0,-0.03,-0.09,0.29,-0.04,...,-0.01,-0.11,0.26,-0.04,-0.08,-0.11,-0.03,0.73,-0.0,-0.11
C_24,0.04,0.07,0.0,0.05,-0.08,-0.03,1.0,0.01,0.02,0.06,...,0.61,0.0,0.02,0.05,0.01,0.04,-0.05,-0.0,0.54,-0.0
C_25,-0.05,0.01,-0.03,0.05,0.1,-0.09,0.01,1.0,-0.03,0.03,...,0.01,0.68,-0.0,0.05,-0.01,0.05,0.05,-0.09,0.02,0.63
C_26,0.88,0.67,0.23,0.4,-0.15,0.29,0.02,-0.03,1.0,0.76,...,0.02,-0.05,0.89,0.69,0.17,0.41,-0.22,0.3,0.02,-0.05
C_27,0.65,0.93,0.2,0.47,-0.4,-0.04,0.06,0.03,0.76,1.0,...,0.06,0.0,0.71,0.93,0.17,0.48,-0.41,0.0,0.06,0.0


In [44]:
# Standardizer fitted on the imputed matrix (fit() returns the estimator itself).
sc = StandardScaler().fit(Xi)
# PCA with default settings, fitted on the standardized data.
pca = PCA()
pca.fit(sc.transform(Xi))

In [57]:
# Three components, so the projection can be drawn as a 3D scatter plot.
# random_state pins the result for the case where scikit-learn selects a
# randomized SVD solver; it has no effect with the deterministic solvers.
pca = PCA(n_components=3, random_state=0)

In [58]:
# Fit the current `pca` estimator on the standardized version of Xi.
pca.fit(sc.transform(Xi))

In [64]:
# Component scores of the standardized data. Xi's index is re-attached so every
# score row stays aligned with the observation it was computed from (needed if
# Z is later joined back to other columns such as the target).
Z = pd.DataFrame(pca.transform(sc.transform(Xi)),
                 columns=['Z1', 'Z2', 'Z3'], index=Xi.index)

In [67]:
# 3D scatter of a 500-point random subset of the scores. random_state pins the
# subset so the figure is identical on every run of the notebook.
Z.sample(500, random_state=0).iplot(kind='scatter3d', x='Z1', y='Z2',
                                    z='Z3', mode='markers', size=5, color='purple')


In [68]:
# Correlation matrix of the component scores, shown with two decimals.
Z.corr().round(decimals=2)

Unnamed: 0,Z1,Z2,Z3
Z1,1.0,-0.0,-0.0
Z2,-0.0,1.0,-0.0
Z3,-0.0,-0.0,1.0
