In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

from sklearn import __version__
print("Sklearn version: ", __version__, " (need >= 0.18)")

Sklearn version:  0.18  (need > 0.18)


# Feature Selection Experiments <small>on alpine plants data</small>

In [80]:
DROP_FEATS = 0 # 0 no, 1 yes. 0 = drop rows with N/A. 1 = drop columns with N/A

plants_master = pd.read_csv("plants5.csv")


drop_features = ["Taxon",
                 "migr_sterr_m", 
                 "shift + 2SE", 
                 'signif_shift',
                 "signif_shift2",
                 "dispmode01",
                 "DispModeEng", ## what is this
                 "shift + 2SE",
                ]

categorical_features = ["oceanity", "dispersal_mode", "BreedSysCode", "Grime"]

# one-hot encoding for categorical features:
plants = pd.get_dummies(plants_master, columns=categorical_features)

# drop features we don't want
features = plants.drop(drop_features, axis=1)

# drop features with n/a or NaN
beforenona = set(features.columns.values)
## axis = 1 drops columns with any NAs, axis = 0 drops rows with any NAs
features = features.dropna(axis=DROP_FEATS)
afternona  = set(features.columns.values)

# extract response variable from trimmed data
target   = features["migration_m"]
features.drop(["migration_m"], inplace=True, axis=1)

print("Processing complete:")
print("Number of Entries: " + str(len(features)))
print("Cols removed: " + str((beforenona-afternona)))
print("# Rows Removed: " + str(len(plants_master) - len(features)))

Processing complete:
Number of Entries: 20
Cols removed: set()
# Rows Removed: 113


## 1) Normalize Features

In [81]:
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit(features).transform(features)

## 2) RFR with Cross Validation and Feature Selection

In [87]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import ShuffleSplit, KFold, cross_val_score
from sklearn.metrics import r2_score
from collections import defaultdict
 
X = scaled_features
Y = target
names = features.columns.values

print(np.any(np.isnan(X)))
print(np.any(np.isnan(Y)))
print(np.all(np.isfinite(X)))
print(np.all(np.isfinite(Y)))

rf = RandomForestRegressor()
scores = defaultdict(list)
 
#crossvalidate the scores on a number of different random splits of the data
for train_idx, test_idx in KFold(len(X)):
    X_train, X_test = X[train_idx], X[test_idx]
    Y_train, Y_test = Y[train_idx], Y[test_idx]
    print(X_train, Y_train)
    r = rf.fit(X_train, Y_train)
    acc = r2_score(Y_test, rf.predict(X_test))
    for i in range(X.shape[1]):
        X_t = X_test.copy()
        np.random.shuffle(X_t[:, i])
        shuff_acc = r2_score(Y_test, rf.predict(X_t))
        scores[names[i]].append((acc-shuff_acc)/acc)
print ("Features sorted by their score:")
print (sorted([(round(np.mean(score), 4), feat) for
              feat, score in scores.items()], reverse=True))

False
False
True
True
[[-1.22474487  1.         -2.18555181 -0.13018891  2.5413099  -1.73916398
   0.87093638  2.18306339 -0.5        -0.59342385 -0.63197941 -0.54575406
  -0.63579539 -0.59190877 -0.18304938 -0.09192978 -0.1182005  -0.96252004
  -0.35805744 -0.67800972 -1.57525452 -0.88041408 -0.65672674 -1.59248651
  -0.89683016 -0.11764291 -0.48189987  0.         -0.30914351  0.
  -0.42008403 -0.57735027 -0.33333333  1.36277029 -0.42008403 -0.33333333
   0.81649658  0.         -0.57735027 -0.42008403 -0.57735027  1.22474487
  -0.5        -0.22941573 -0.33333333  0.90453403  0.         -0.65465367
   0.        ]
 [ 0.81649658 -1.          0.68074565 -0.13018891 -0.90453403  0.74535599
  -0.15369466 -1.13898959 -0.5         1.15082516  1.13257616  0.21576323
   0.56825131  0.46479771 -0.18304938 -0.45964892 -0.41370176 -0.12554609
  -0.35805744  0.57447443 -0.32597162 -0.48572846  0.57777944 -0.32347415
  -0.48839039 -1.09880274 -0.48189987  0.         -0.53027621  0.
  -0.42008403 -0.

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [115]:
cross_val_score(rf, scaled_features, target, cv=5, scoring='neg_mean_squared_error').mean()



-35.870204999999999

In [121]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
print(cross_val_score(lr, scaled_features, target, cv=5, scoring='neg_mean_squared_error').mean())
print(cross_val_score(lr, features, target, cv=5, scoring='neg_mean_squared_error').mean())

-72.7652613495
-108.307860866


In [118]:
cross_val_score(SVR_rbf, scaled_features, target, cv=5, scoring='neg_mean_squared_error').mean()


-27.99667119707285

In [117]:
cross_val_score(SVR_rbf, features, target, cv=5, scoring='neg_mean_squared_error').mean()

-28.679414172227485