# Santander Value Prediction Challenge
[LINK](https://www.kaggle.com/c/santander-value-prediction-challenge)

## Google Drive認証

## TrainとTestの事前調査
- https://www.kaggle.com/nanomathias/distribution-of-test-vs-training-data

In [3]:
import gc
import itertools
from copy import deepcopy

import numpy as np
import pandas as pd

from tqdm import tqdm

from scipy.stats import ks_2samp

from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FastICA
from sklearn.decomposition import PCA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

from sklearn import manifold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import io
%matplotlib inline

## t-SNEによる次元圧縮
- 4459サンプル(testとtrain合わせて)

In [4]:
SAMPLE_SIZE = 4459

train_df = pd.read_csv('train.csv').sample(SAMPLE_SIZE) # sample(n, axis) ランダムにサンプリング
test_df = pd.read_csv('test.csv').sample(SAMPLE_SIZE) 


In [9]:
#concat
total_df = pd.concat([train_df.drop('target', axis=1), test_df], axis=0).drop('ID', axis=1) #　drop(labels, axis)　行・列削除


In [6]:
# 標準偏差が0のカラムをtotal_dfから除く
zero_std_cols = train_df.drop("ID", axis=1).columns[train_df.std() == 0] # std() 各ラベルの標準偏差
total_df.drop(zero_std_cols, axis=1, inplace=True) # inplaceは破壊的変更
print(">> Removed {} constant columns".format(len(zero_std_cols)))

>> Removed 256 constant columns


In [57]:
# 被ってるカラムを取り除く
# Taken from: https://www.kaggle.com/scirpus/santander-poor-mans-tsne
colsToRemove = [] # 同じ列のカラム名を入れる(2つのうち後半の方)
colsScaned = [] # これはcolsToRemoveと一緒じゃない？
dupList = {}
columns = total_df.columns
for i in range(len(columns)-1):
    v = train_df[columns[i]].values
    dupCols = []
    for j in range(i+1,len(columns)):
        if np.array_equal(v, train_df[columns[j]].values): # 列ごとに要素が同じか比較
            colsToRemove.append(columns[j])
            if columns[j] not in colsScaned:
                dupCols.append(columns[j]) 
                colsScaned.append(columns[j])
                dupList[columns[i]] = dupCols
colsToRemove = list(set(colsToRemove)) # setは集合(重複ない、順番もない)
total_df.drop(colsToRemove, axis=1, inplace=True)
print(">> Dropped {} duplicate columns".format(len(colsToRemove)))

>> Dropped 5 duplicate columns


In [1]:
# Go through the columns one at a time (can't do it all at once for this dataset)
total_df_all = deepcopy(total_df) # オブジェクト内のオブジェクトは別IDをもつ（ex.入れ子のリスト)             
for col in total_df.columns:
    
    # Detect outliers in this column
    data = total_df[col].values
    data_mean, data_std = np.mean(data), np.std(data) # 各カラムの平均、標準偏差
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off # 3σの範囲求める
    outliers = [x for x in data if (x < lower) or (x > upper)] # 外れ値
    
    # If there are crazy high values, do a log-transform
    # 外れ値のlogとる
    if len(outliers) > 0:
        non_zero_idx = data != 0
        total_df.loc[non_zero_idx, col] = np.log(data[non_zero_idx]) # loc(row,col) データを参照 # 0ではない各要素のlog
    
    # Scale non-zero column values
    nonzero_rows = total_df[col] != 0
    total_df.loc[nonzero_rows, col] = scale(total_df.loc[nonzero_rows, col]) # 平均0, 分散1　Y = (X - μ)/σ
    
    # Scale all column values
    total_df_all[col] = scale(total_df_all[col])
    gc.collect()
    
# Train and test
train_idx = range(0, len(train_df))
test_idx = range(len(train_df), len(total_df))

NameError: name 'deepcopy' is not defined

In [2]:
def test_pca(data, create_plots=True):
    """Run PCA analysis, return embedding"""
    
    # Create a PCA object, specifying how many components we wish to keep
    pca = PCA(n_components=1000) # 1000次元に圧縮

    # Run PCA on scaled numeric dataframe, and retrieve the projected data
    pca_trafo = pca.fit_transform(data)  # 実行

    # The transformed data is in a numpy matrix. This may be inconvenient if we want to further
    # process the data, and have a more visual impression of what each column is etc. We therefore
    # put transformed/projected data into new dataframe, where we specify column names and index
    pca_df = pd.DataFrame(
        pca_trafo,
        index=total_df.index,
        columns=["PC" + str(i + 1) for i in range(pca_trafo.shape[1])]
    )

    # Only construct plots if requested
    if create_plots:
        
        # Create two plots next to each other
        _, axes = plt.subplots(2, 2, figsize=(20, 15))
        axes = list(itertools.chain.from_iterable(axes))
        
                # Plot the explained variance# Plot t 
        axes[0].plot(
            pca.explained_variance_ratio_, "--o", linewidth=2,
            label="Explained variance ratio"
        )

        # Plot the cumulative explained variance
        axes[0].plot(
            pca.explained_variance_ratio_.cumsum(), "--o", linewidth=2,
            label="Cumulative explained variance ratio"
        )

        # Show legend
        axes[0].legend(loc="best", frameon=True)

        # Show biplots
        for i in range(1, 4):

            # Components to be plottet
            x, y = "PC"+str(i), "PC"+str(i+1)

            # Plot biplots
            settings = {'kind': 'scatter', 'ax': axes[i], 'alpha': 0.2, 'x': x, 'y': y} # 描写する列x,y指定
            pca_df.iloc[train_idx].plot(label='Train', c='#ff7f0e', **settings)
            pca_df.iloc[test_idx].plot(label='Test',  c='#1f77b4', **settings)    

                    # Show the plot
        plt.show()
    
    return pca_df

# Run the PCA and get the embedded dimension
pca_df = test_pca(total_df)
pca_df_all = test_pca(total_df_all, create_plots=False)

NameError: name 'total_df' is not defined

In [60]:
df_wine_all=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
#品種(0列、1～3)と色（10列）とプロリンの量(13列)を使用する
X=df_wine_all.iloc[:,1:].values
Y=df_wine_all.iloc[:,0].values

In [72]:
pca_w = PCA(n_components=10)
X_transformed = pca_w.fit_transform(X)

In [73]:
pca_w_df = pd.DataFrame(
        X_transformed,
        index=df_wine_all.index,
        columns=["PC" + str(i + 1) for i in range(X_transformed.shape[1])]
    )

In [81]:
pca_w_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,318.562979,21.492131,3.130735,-0.250114,0.677078,0.568081,-0.619642,-0.199555,0.701280,-0.095008
1,303.097420,-5.364718,6.822835,-0.864035,-0.486096,0.014340,0.108865,0.604714,0.286717,-0.045782
2,438.061133,-6.537309,-1.113223,0.912411,0.380651,0.672404,0.785819,-0.500886,0.024547,-0.208960
3,733.240139,0.192729,-0.917257,-0.541251,0.858662,0.599122,0.018770,0.190428,0.054277,0.531684
4,-11.571428,18.489995,-0.554422,1.360896,0.276442,0.768884,-0.309976,0.119091,-0.195843,0.061771
5,703.231192,-0.332159,0.949375,-0.359994,0.156827,0.061011,0.026076,0.097809,-0.388096,0.101953
6,542.971581,-13.518967,2.126943,0.055566,-0.483660,0.037712,-0.546570,-0.341777,0.570189,-0.062810
7,548.401860,11.449432,0.040492,1.349454,-0.852591,-0.092885,-0.589468,0.471963,0.545029,0.015047
8,298.036863,-8.180158,3.880975,-0.910643,0.311489,0.321003,-1.130445,-0.481429,-0.353403,-0.048536
9,298.049553,-7.101543,1.558455,-1.677042,1.558416,0.174919,-0.219401,0.246656,0.425169,-0.014587
