# Title

In [None]:
%matplotlib inline
from __future__ import division, print_function

import matplotlib
import sklearn.cross_validation
import itertools

import numpy as np
import pandas as pd
import matplotlib as mplt
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


In [None]:
# Path of the training CSV that the data-loading cell reads with pd.read_csv.
filename = "Data/train.csv"

### Get the data

In [None]:
# Load the CSV, using its first column as the row index.
data = pd.read_csv(filename, index_col=0)
print(set(data.dtypes))

# Every column but the last is treated as a feature; the last column holds
# the class label. Positional slicing goes through .iloc because the .ix
# indexer was deprecated in pandas 0.20 and removed in pandas 1.0.
X = np.array(data.iloc[:, :-1])
labels = np.array(data.iloc[:, -1])
classes = np.unique(labels)
print("Classes :",", ".join(classes))


In [None]:
# Replace every feature value x by log(1 + x). np.log1p computes this directly
# and stays accurate for values close to zero.
# NOTE: re-running this cell applies the transform to X again.
X = np.log1p(X)

### Plot function

In [None]:
def get_feature_distrib(feature, bins=None, log=False, **kwargs):
    """Build a stacked histogram of one feature, split by class.

    Reads the notebook-level arrays ``X`` (one row per sample, one column per
    feature), ``labels`` (one class label per row of ``X``) and ``classes``
    (the distinct labels).

    Parameters
    ----------
    feature : int
        Column index into ``X`` of the feature to plot.
    bins : sequence or int, optional
        Bins handed to ``Axes.hist``. When ``None``, 15 evenly spaced edges
        spanning the feature's minimum and maximum are used.
    log : bool, optional
        Passed to ``Axes.hist``; when true the count axis is logarithmic.
    **kwargs
        Accepted and ignored, so that wrappers such as
        ``save_feature_distrib`` can forward extra keys (e.g. ``fname``).

    Returns
    -------
    (fig, ax)
        The matplotlib figure and axes holding the histogram.
    """
    values = X[:, feature]
    if bins is None:
        bins = np.linspace(values.min(), values.max(), 15)

    # One array of feature values per class, in the order of ``classes``.
    per_class = [values[labels == c] for c in classes]

    fig, ax = plt.subplots()
    # Labelling the datasets lets the legend pick up the colours actually
    # drawn, instead of reading rcParams['axes.color_cycle'], a key that was
    # removed in matplotlib 2.0.
    ax.hist(per_class, bins, stacked=True, log=log, label=list(classes))
    # Anchor the legend just outside the top-right corner of the axes.
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    ax.set_title('Class distribution for feature '+str(feature))
    y_label = 'Count (log scale)' if log else 'Count'
    ax.set_ylabel(y_label)
    ax.set_xlabel(u'Feature n°'+str(feature))
    return fig, ax

def plot_feature_distrib(*args, **kwargs):
    """Draw a feature's per-class distribution, display it, then close it.

    Every positional and keyword argument is forwarded unchanged to
    ``get_feature_distrib``.
    """
    figure, _ = get_feature_distrib(*args, **kwargs)
    plt.tight_layout()
    plt.show()
    # Release the figure once it has been displayed.
    plt.close(figure)
    
def save_feature_distrib(*args, **kwargs):
    """Draw a feature's per-class distribution and write it to a file.

    All arguments are forwarded to ``get_feature_distrib``. The keyword
    argument ``fname`` is required and is the output path given to
    ``Figure.savefig``.

    Raises
    ------
    KeyError
        If ``fname`` is not supplied as a keyword argument.
    """
    # Look the path up before any figure exists, so a missing 'fname' fails
    # without leaving an open figure behind.
    fname = kwargs['fname']
    fig, ax = get_feature_distrib(*args, **kwargs)
    try:
        fig.tight_layout()
        fig.savefig(fname, bbox_inches='tight')
    finally:
        # Always release the figure, even when the save fails.
        plt.close(fig)

In [None]:
# Plot one feature with 15 bin edges running from 1 to that feature's maximum.
# The plotted feature and the one used for the bin range must be the same
# column, so both go through `feature`.
feature = 1
plot_feature_distrib(feature, bins=np.linspace(1, X[:,feature].max(), 15))

### Save every distribution plot in a directory

## Covariance (correlation between features)



In [None]:
# List the ordered pairs of distinct features whose correlation coefficient
# exceeds 0.7 in absolute value. Each unordered pair appears twice, as (i, j)
# and (j, i), because the correlation matrix is symmetric.
print(X.shape)
corr_matrix = np.corrcoef(X, rowvar=0)  # rowvar=0: columns are the variables
rows, cols = np.nonzero(np.abs(corr_matrix) > 0.7)
t = np.column_stack((rows, cols))
# Drop the diagonal entries (a feature paired with itself).
cov = t[rows != cols]
print(cov.shape[0], "liaisons")

In [None]:
# Sweep correlation thresholds from 0.1 to 1 and count, for each threshold,
# the ordered pairs of distinct features whose correlation lies beyond it.
limits = np.linspace(0.1,1)
corr = np.corrcoef(X, rowvar=0)
n_corr = np.zeros(limits.shape)
for idx, threshold in enumerate(limits):
    # Multiplying both sides by the sign flips the comparison for negative
    # thresholds, so "beyond" means further away from zero.
    direction = np.sign(threshold)
    rows, cols = np.nonzero(direction*corr > direction*threshold)
    # Ignore the diagonal (a feature compared with itself).
    n_corr[idx] = np.count_nonzero(rows != cols)

plt.plot(limits, n_corr)
plt.xlabel("limit")
plt.ylabel("number of liaison")
plt.yscale('log')
plt.show()
# Presumably the total number of ordered pairs among 93 features, for
# comparison with the curve; confirm against X.shape[1].
print(93*92)