##### Jupyter Notebook, Step 2 - Identify Features
- Build feature selection pipelines using at least three different techniques
- **NOTE**: these pipelines are being used for feature selection not prediction
    
For this portion of the project I will investigate important features in the madelon dataset. 

One method I will use will be unsupervised learning with a DecisionTreeRegressor against every feature. Second and third, I will use other approaches to attempt to reach the same result as I reach using unsupervised learning, such as SelectKBest and SelectFromModel.

For each set of features, I will test the new feature sets against naive models to compare against baseline. 

In [1]:
import csv
from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

madelon_file ='madelon_train.csv'
madelon_data = []        

with open(madelon_file) as f:
    readcsv = csv.reader(f, delimiter=' ')
    
    for row in readcsv:
        madelon_data.append(row)
        
madelon_file_target ='madelon_train_targets.csv'
madelon_data_target = []        

with open(madelon_file_target) as f:
    readcsv = csv.reader(f, delimiter=' ')
    
    for row in readcsv:
        madelon_data_target.append(row)
        
madelon1 = madelon_data[0:200]

madelon_data_df = pd.DataFrame(madelon1)
madelon_targets_df = pd.DataFrame(madelon_data_target[0:200])

X = madelon_data_df
y = madelon_targets_df
X['y'] = y

X = X.drop([500],axis=1)
X['y'] = X['y'].map(int)
for column in X.columns:
    X[column] = X[column].map(int)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

y = X['y']
X = X.drop(['y'], axis=1)

In [2]:
from tqdm import tqdm

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np
def calculate_r_2_for_feature(X, feature, model):
    tmp_X = X.drop(feature, axis = 1)
    
    X_train, X_test, y_train, y_test = train_test_split(tmp_X, X[feature], test_size=0.25)
    
    regressor = model()
    regressor.fit(X_train, y_train)
    
    score = regressor.score(X_test, y_test)
    return score

def mean_r2_for_feature(data, feature, model):
    scores = []
    for _ in range(100):
        scores.append(calculate_r_2_for_feature(data, feature, model))
        
        scores = np.array(scores)
        return scores.mean()


In [4]:
scores_informative = []
for i in range(500):#tqdm(range(500)):
    r2 = mean_r2_for_feature(X, i, DecisionTreeRegressor)
    if r2 > 0:
        scores_informative.append([i,r2])
        #print("informative found!: ", i, r2)

In [5]:
information_df = pd.DataFrame(scores_informative)
inform_mask = information_df[0]
#inform_mask.shape

In [4]:
def fit_and_score_model_against_raw_and_scaled(model, X_train, X_test, y_train, y_test):
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    scaler = StandardScaler()
 
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    model.fit(X_train_scaled, y_train)
    train_scaled_score = model.score(X_train_scaled, y_train)
    test_scaled_score = model.score(X_test_scaled, y_test)
    
    return {
            'model': model,
            'train_raw_score' : train_score,
            'test_raw_score' : test_score,
            'train_scaled_score' : train_scaled_score,
            'test_scaled_score' : test_scaled_score,
           }

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X[inform_mask], y, test_size=.3, random_state=42)

NameError: name 'inform_mask' is not defined

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
results = [fit_and_score_model_against_raw_and_scaled(LogisticRegression(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsRegressor(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(DecisionTreeClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(SVC(), X_train, X_test, y_train, y_test)]

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,test_raw_score,test_scaled_score,train_raw_score,train_scaled_score
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.683333,0.666667,0.678571,0.628571
1,"KNeighborsRegressor(algorithm='auto', leaf_siz...",0.437321,0.3311,0.485099,0.422394
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.8,0.75,0.864286,0.821429
3,"DecisionTreeClassifier(class_weight=None, crit...",0.7,0.683333,1.0,1.0
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.633333,0.866667,1.0,0.842857


Unsupervised learning methods clearly enhance the ability of the models to predict for the dataset. Even with the very small dataset, the model now performs noticably better. 

Next I will use SelectFromModel and SelectKBest to choose features.

In [5]:
from sklearn.feature_selection import SelectKBest, \
                                      SelectFromModel, \
                                      RFE, SelectPercentile
from itertools import combinations

In [10]:
X.shape

(200, 500)

In [11]:
skb = SelectKBest(k=20)

skb.fit(X,y)

SelectKBest(k=20, score_func=<function f_classif at 0x7f6ce0d75268>)

In [12]:
skb_feats = np.where(skb.get_support())[0]

In [13]:
skb_feats

array([ 32,  34,  40,  47,  48,  70, 105, 128, 193, 235, 282, 378, 380,
       402, 415, 417, 420, 435, 474, 477])

In [27]:
skb_feats.dtype

dtype('int64')

In [14]:
skb.pvalues_.shape

(500,)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X[skb_feats], y, test_size=.3, random_state=42)

In [16]:
X_train.shape

(140, 20)

In [17]:
results = [fit_and_score_model_against_raw_and_scaled(LogisticRegression(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsRegressor(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(DecisionTreeClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(SVC(), X_train, X_test, y_train, y_test)]

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,test_raw_score,test_scaled_score,train_raw_score,train_scaled_score
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.633333,0.716667,0.735714,0.714286
1,"KNeighborsRegressor(algorithm='auto', leaf_siz...",-0.125359,-0.122488,0.259604,0.270457
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.566667,0.583333,0.7,0.757143
3,"DecisionTreeClassifier(class_weight=None, crit...",0.583333,0.6,1.0,1.0
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.633333,0.733333,1.0,0.885714


SelectKBest clearly under performs unsupervised learning when selecting 20 best features. 

A notable exception is LogisticRegression, which actually shows signs of doing slightly better. It should be noted, however, that different train test splits were used in these two datasets.

For the third feature selection mechanism, I will use SelectFromModel with a Support Vector Machine. 

In [18]:
from sklearn.svm import SVR

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

sfm = SelectFromModel(LogisticRegression(), threshold='2*mean')
sfm.fit(X_train_scaled, y_train)
sfm_feats = np.where(sfm.get_support())[0]
#sfm_feats

In [15]:
sfm_feats.shape

(114,)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X[sfm_feats], y, test_size=.3, random_state=42)

results = [fit_and_score_model_against_raw_and_scaled(LogisticRegression(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsRegressor(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(KNeighborsClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(DecisionTreeClassifier(), X_train, X_test, y_train, y_test),
           fit_and_score_model_against_raw_and_scaled(SVC(), X_train, X_test, y_train, y_test)]

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,test_raw_score,test_scaled_score,train_raw_score,train_scaled_score
0,"LogisticRegression(C=1.0, class_weight=None, d...",0.55,0.533333,1.0,1.0
1,"KNeighborsRegressor(algorithm='auto', leaf_siz...",-0.23445,-0.168421,0.498363,0.51404
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.55,0.6,0.871429,0.878571
3,"DecisionTreeClassifier(class_weight=None, crit...",0.5,0.466667,1.0,1.0
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.633333,0.583333,1.0,1.0


Select from model does not clearly show any improvement on the naive models at this point. After doing some tuning to the selection mechanism (threshold), I still do not see any marked improvement at this point.

In [28]:
supports = [inform_mask,skb_feats,sfm_feats]

In [38]:
supports

[0      28
 1      48
 2      64
 3     105
 4     128
 5     153
 6     241
 7     281
 8     318
 9     336
 10    338
 11    378
 12    433
 13    442
 14    451
 15    453
 16    455
 17    472
 18    475
 19    493
 Name: 0, dtype: int64,
 array([ 32,  34,  40,  47,  48,  70, 105, 128, 193, 235, 282, 378, 380,
        402, 415, 417, 420, 435, 474, 477]),
 array([  1,  32,  34,  40,  43,  47,  51,  55,  70,  73,  75,  80,  83,
         85,  93, 111, 126, 131, 141, 155, 162, 192, 193, 196, 200, 207,
        209, 213, 218, 231, 287, 295, 299, 306, 376, 387, 389, 395, 407,
        415, 417, 418, 420, 424, 430, 435, 441, 452, 461, 463, 473, 476])]

In [5]:
import pickle

In [95]:
with open('supports.pkl', 'wb') as f:
    pickle.dump(supports, f)

In [96]:
madelon_from_sql = pd.read_pickle('m_sql_1.pickle')
madelon_from_sql.set_index('_id', inplace=True)
X = madelon_from_sql.drop('target', axis=1)
y = madelon_from_sql['target']

In [97]:
X.shape, y.shape

((2000, 1000), (2000,))

In [98]:
Xb = X[0:1000]

In [99]:
Xb.shape

(1000, 1000)

In [69]:
Xb = Xb[['feat_257',
 'feat_269',
 'feat_308',
 'feat_315',
 'feat_336',
 'feat_341',
 'feat_395',
 'feat_504',
 'feat_526',
 'feat_639',
 'feat_681',
 'feat_701',
 'feat_724',
 'feat_736',
 'feat_769',
 'feat_808',
 'feat_829',
 'feat_867',
 'feat_920',
 'feat_956']]

In [100]:
Xb.shape

(1000, 1000)

In [101]:
sql_scores_informative = []
for i in tqdm(Xb):
    print("lots of stuff")
    r2 = mean_r2_for_feature(Xb, i, DecisionTreeRegressor)
    if r2 > 0.92:
        sql_scores_informative.append([i,r2])
        print("informative found!: ", i, r2)

  0%|          | 0/1000 [00:00<?, ?it/s]

lots of stuff


  0%|          | 1/1000 [00:00<11:47,  1.41it/s]

lots of stuff


  0%|          | 2/1000 [00:01<11:41,  1.42it/s]

lots of stuff


  0%|          | 3/1000 [00:02<11:29,  1.45it/s]

lots of stuff


  0%|          | 4/1000 [00:02<11:36,  1.43it/s]

lots of stuff


  0%|          | 5/1000 [00:03<11:24,  1.45it/s]

lots of stuff


  1%|          | 6/1000 [00:04<11:54,  1.39it/s]

lots of stuff


  1%|          | 7/1000 [00:04<11:46,  1.41it/s]

lots of stuff


  1%|          | 8/1000 [00:05<11:45,  1.41it/s]

lots of stuff


  1%|          | 9/1000 [00:06<12:08,  1.36it/s]

lots of stuff


  1%|          | 10/1000 [00:07<11:58,  1.38it/s]

lots of stuff


  1%|          | 11/1000 [00:07<11:58,  1.38it/s]

lots of stuff


  1%|          | 12/1000 [00:08<11:53,  1.38it/s]

lots of stuff


  1%|▏         | 13/1000 [00:09<11:46,  1.40it/s]

lots of stuff


  1%|▏         | 14/1000 [00:09<11:41,  1.41it/s]

lots of stuff


  2%|▏         | 15/1000 [00:10<11:41,  1.40it/s]

lots of stuff


  2%|▏         | 16/1000 [00:11<11:36,  1.41it/s]

lots of stuff


  2%|▏         | 17/1000 [00:11<11:31,  1.42it/s]

lots of stuff


  2%|▏         | 18/1000 [00:12<11:24,  1.44it/s]

lots of stuff


  2%|▏         | 19/1000 [00:13<11:21,  1.44it/s]

lots of stuff


  2%|▏         | 20/1000 [00:13<11:17,  1.45it/s]

lots of stuff


  2%|▏         | 21/1000 [00:14<11:17,  1.45it/s]

lots of stuff


  2%|▏         | 22/1000 [00:15<11:14,  1.45it/s]

lots of stuff


  2%|▏         | 23/1000 [00:15<11:14,  1.45it/s]

lots of stuff


  2%|▏         | 24/1000 [00:16<11:18,  1.44it/s]

lots of stuff


  2%|▎         | 25/1000 [00:17<11:16,  1.44it/s]

lots of stuff


  3%|▎         | 26/1000 [00:18<11:15,  1.44it/s]

lots of stuff


  3%|▎         | 27/1000 [00:18<11:13,  1.45it/s]

lots of stuff


  3%|▎         | 28/1000 [00:19<11:11,  1.45it/s]

lots of stuff


  3%|▎         | 29/1000 [00:20<11:16,  1.44it/s]

lots of stuff


  3%|▎         | 30/1000 [00:20<11:14,  1.44it/s]

lots of stuff


  3%|▎         | 31/1000 [00:21<11:15,  1.43it/s]

lots of stuff


  3%|▎         | 32/1000 [00:22<11:12,  1.44it/s]

lots of stuff


  3%|▎         | 33/1000 [00:23<11:18,  1.43it/s]

lots of stuff


  3%|▎         | 34/1000 [00:23<11:16,  1.43it/s]

lots of stuff


  4%|▎         | 35/1000 [00:24<11:13,  1.43it/s]

lots of stuff


  4%|▎         | 36/1000 [00:25<11:14,  1.43it/s]

lots of stuff


  4%|▎         | 37/1000 [00:25<11:10,  1.44it/s]

lots of stuff


  4%|▍         | 38/1000 [00:26<11:08,  1.44it/s]

lots of stuff


  4%|▍         | 39/1000 [00:26<11:05,  1.44it/s]

lots of stuff


  4%|▍         | 40/1000 [00:27<11:08,  1.44it/s]

lots of stuff


  4%|▍         | 41/1000 [00:28<11:06,  1.44it/s]

lots of stuff


  4%|▍         | 42/1000 [00:29<11:06,  1.44it/s]

lots of stuff


KeyboardInterrupt: 

In [94]:
sql_scores_informative

[['feat_269', 0.94191582689721287],
 ['feat_315', 0.93182798428292213],
 ['feat_341', 0.93280868233103176],
 ['feat_395', 0.93785116449378292],
 ['feat_639', 0.97685423340934086],
 ['feat_956', 0.97528427979255861]]

In [21]:
sql_scores_informative

[['feat_257', 0.74895256045322978],
 ['feat_269', 0.88810281325383411],
 ['feat_681', 0.67880996399072724],
 ['feat_808', 0.69855498767387192],
 ['feat_829', 0.70962933432153519],
 ['feat_920', 0.81468393371269088]]

In [10]:
sql_scores_informative

[['feat_257', 0.48885988681021347],
 ['feat_269', 0.55368127493783093],
 ['feat_308', 0.31454295404360233],
 ['feat_315', 0.73546577604917074],
 ['feat_336', 0.69337820342894474],
 ['feat_341', 0.62144839816190611],
 ['feat_395', 0.63479630273013954],
 ['feat_504', 0.40896058230504628],
 ['feat_526', 0.21032540438036473],
 ['feat_639', 0.84227343387442299],
 ['feat_681', 0.47442578506922012],
 ['feat_701', 0.76280481035288283],
 ['feat_724', 0.52833368594341357],
 ['feat_736', 0.60555953875298307],
 ['feat_769', 0.43235716767371635],
 ['feat_808', 0.47998839978887853],
 ['feat_829', 0.58253418729726647],
 ['feat_867', 0.63503139250910301],
 ['feat_920', 0.64267634615223335],
 ['feat_956', 0.93082504050825332]]

In [14]:
len(sql_scores_informative)

20

In [16]:
with open('supports_sql.pkl', 'wb') as f:
    pickle.dump(sql_scores_informative, f)