## Grasping Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot  as plt
from sklearn import preprocessing as p
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import model_selection as sk

# Column names applied to magic04.data: ten feature columns followed by the 'Class' label.
cols = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'Class',
]
# Names are supplied explicitly, so the first line of the file is read as data, not as a header.
df_raw = pd.read_csv("magic04.data", names=cols, header=None)

In [4]:
# Work on a copy so df_raw keeps its original labels and this cell can be re-run
# from the raw data without side effects.
df = df_raw.copy()
# Encode the target as an integer: 'g' (or an already-encoded 1) -> 1, anything else -> 0.
# Vectorised replacement for the former row-by-row apply; the resulting values are identical.
df['Class'] = df['Class'].isin(['g', 1]).astype('int64')

In [5]:
# Preview the first rows to confirm the columns were parsed and Class is now numeric (0/1).
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,Class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) on the unscaled data.
df.describe()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,Class
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0
mean,53.250154,22.180966,2.825017,0.380327,0.214657,-4.331745,10.545545,0.249726,27.645707,193.818026,0.64837
std,42.364855,18.346056,0.472599,0.182813,0.110511,59.206062,51.000118,20.827439,26.103621,74.731787,0.477492
min,4.2835,0.0,1.9413,0.0131,0.0003,-457.9161,-331.78,-205.8947,0.0,1.2826,0.0
25%,24.336,11.8638,2.4771,0.2358,0.128475,-20.58655,-12.842775,-10.849375,5.547925,142.49225,0.0
50%,37.1477,17.1399,2.7396,0.35415,0.1965,4.01305,15.3141,0.6662,17.6795,191.85145,1.0
75%,70.122175,24.739475,3.1016,0.5037,0.285225,24.0637,35.8378,10.946425,45.88355,240.563825,1.0
max,334.177,256.382,5.3233,0.893,0.6752,575.2407,238.321,179.851,90.0,495.561,1.0


In [7]:
# Count NaN/None entries per column; all zeros means no values are missing.
# (This only detects real nulls, not sentinel values that might stand in for missing data.)
df.isna().sum()

fLength     0
fWidth      0
fSize       0
fConc       0
fConc1      0
fAsym       0
fM3Long     0
fM3Trans    0
fAlpha      0
fDist       0
Class       0
dtype: int64

In [8]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the others untouched.

    Wraps ``sklearn.preprocessing.StandardScaler`` so that it can be applied to a
    DataFrame that also holds columns which must not be scaled (e.g. the target).

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Every constructor argument is kept as a same-named attribute; BaseEstimator's
        # get_params()/clone()/repr look them up by name.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # StandardScaler's parameters must be passed by keyword.
        self.scaler = p.StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)

    def fit(self, X, y=None):
        """Learn the mean/std of ``self.columns`` from DataFrame ``X``; returns ``self``."""
        # Pick up any parameter changes made through set_params() since construction.
        self.scaler.set_params(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(X[self.columns], y)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a new DataFrame with ``self.columns`` standardized.

        Column order and the row index of ``X`` are preserved. ``y`` and ``copy`` are
        accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            # Reuse X's index: concat aligns on the index, so a fresh RangeIndex would
            # misalign rows (and introduce NaNs) whenever X is not indexed 0..n-1.
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

# Standardize every column in `cols` except the trailing 'Class' label.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# further down, so test rows contribute to the mean/std used for scaling. Consider
# fitting on the training rows only.
scale = CustomScaler(columns=cols[:-1])
scaled_df = scale.fit_transform(df)

# Sanity check: scaled features should show mean ~0 and std ~1; Class is unchanged.
scaled_df.describe()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,Class
count,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0,19020.0
mean,7.172671000000001e-17,-1.434534e-16,-2.301232e-16,-3.040914e-16,1.128201e-16,-5.9772260000000005e-18,4.1840580000000006e-17,-1.083372e-17,2.39089e-17,-1.494306e-18,0.64837
std,1.000026,1.000026,1.000026,1.000026,1.000026,1.000026,1.000026,1.000026,1.000026,1.000026,0.477492
min,-1.155862,-1.209064,-1.869959,-2.008809,-1.939745,-7.661315,-6.712427,-9.897993,-1.059103,-2.57642,0.0
25%,-0.6825213,-0.562379,-0.7361978,-0.7905934,-0.7798731,-0.2745535,-0.4586055,-0.5329216,-0.8465631,-0.6868179,0.0
50%,-0.3800999,-0.2747838,-0.1807437,-0.1431941,-0.1643062,0.1409487,0.09350332,0.01999694,-0.3818041,-0.02631582,1.0
75%,0.3982656,0.1394619,0.5852541,0.6748758,0.6385776,0.4796163,0.4959385,0.5136004,0.6986894,0.6255307,1.0
max,6.631304,12.76608,5.286407,2.804429,4.167511,9.78933,4.466292,8.623528,2.388785,4.037785,1.0


### Set aside a test set and build the training sets

In [9]:
# Feature/target split, for both the raw and the standardized frame.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
y = df['Class']
x = df.drop(columns='Class')

y_sc = scaled_df['Class']
x_sc = scaled_df.drop(columns='Class')

testsizepercent = .25  # fraction of rows held out for testing
split_seed = 42        # fixed seed: reproducible on re-run, and shared by both splits below

# Training/testing splits. Both calls use the same seed and the inputs have the same
# length, so the raw and the standardized sets contain the same rows.
# The split is random, not stratified; pass stratify=y / stratify=y_sc to keep the
# class ratio identical in train and test.
x_tr, x_te, y_tr, y_te = sk.train_test_split(
    x, y, test_size=testsizepercent, random_state=split_seed)
xs_tr, xs_te, ys_tr, ys_te = sk.train_test_split(
    x_sc, y_sc, test_size=testsizepercent, random_state=split_seed)

In [11]:
"""
Store dataframe to allow it to be called by other notebooks

call using: 

    %store -r df 

in new notebook

"""
%store df
%store scaled_df
%store x
%store y
%store x_sc
%store y_sc

%store x_tr
%store y_tr
%store xs_tr
%store ys_tr

%store x_te
%store y_te
%store xs_te
%store ys_te

%store cols
%store testsizepercent

Stored 'df' (DataFrame)
Stored 'scaled_df' (DataFrame)
Stored 'x' (DataFrame)
Stored 'y' (Series)
Stored 'x_sc' (DataFrame)
Stored 'y_sc' (Series)
Stored 'x_tr' (DataFrame)
Stored 'y_tr' (Series)
Stored 'xs_tr' (DataFrame)
Stored 'ys_tr' (Series)
Stored 'x_te' (DataFrame)
Stored 'y_te' (Series)
Stored 'xs_te' (DataFrame)
Stored 'ys_te' (Series)
Stored 'cols' (list)
Stored 'testsizepercent' (float)


### Conclusions/Insights

    - All features are numeric
    - no missing values (seemingly)
    
    - Created a standardized (zero mean, unit variance) copy of the data; note that the train/test split is random, not stratified
    - Stored sets