In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as spy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [11]:
from useful_code import useful_functions
from useful_code import PandasImputer 

In [None]:
import reliefF

### Data pre-processing

- Load in the data
- Separate into train, validation, and test splits
- Use median strategy for imputation of missing data
- Re-scale

In [12]:
data = pd.read_csv('data/train_v2.csv')
data = useful_functions.clean_dataset(data, delete_missing_data=False)
data = data.drop(['id', 'index'], axis=1)

X = data.drop(['loss'], axis=1)
y = data['loss']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)
    
imp = PandasImputer.PandasImputer(strategy='median', missing_values=np.nan)
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)
X_val = imp.transform(X_val)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_val = sc.transform(X_val)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### The Feature Generation Functions
(The following is a paraphrased version of the description in my dissertation followed by the specifics of each function.)

The ICL dataset does not have labelled features. The ICL is great because it has many features, but the trade off is that we don't know what they represent. Ordinarily, when looking to generate features, there are at least some hints. For example, if you have the features 'distance' and 'time', you could generate the feature 'speed' using speed = distance / time. In this case we don't have the privilege of making educated guesses.

The solution I employ therefore is to generate all possible pairs of features for each operation +, -, \*. Once generated, I check its pearson correlation with the target feature. If a 3-tuple of two features and an operation is such that this correlation is less than some `sig` then the 4-tuple will be recorded, where the first 3 elements are from the previous 3-tuple and the final is the `op` that should be used.

- `get_corr_pairs` requires an `op` (i.e., a 2-function that performs the operation) and returns the list of good 3-tuples (as defined above).
- `get_corr_pairs_plus`, `get_corr_pairs_minus` and `get_corr_pairs_mult` use the above function, feeding in `lambda a,b: a+b`, etc., for `op`.
- `get_all_corr_pairs` calls all three functions in the above bullet point and concatenates the lists they return

In [13]:
def get_corr_pairs(X, y, op, op_name, sig=0.01, suppress_checkpoints=True):
    good_pairs = []
    no_features = len(X[0])
    for i in range(no_features):
        if i % 100 == 0 and not suppress_checkpoints:
            print('Outer loop up to feature ' + str(i) + '.')
        for j in range(i+1, no_features):
            corr, _ = spy.pearsonr(op(X[:, i], X[:, j]), y)
            if sig < abs(corr):
                good_pair = (i, j, corr, op_name)
                good_pairs.append(good_pair)
    return good_pairs
                

def get_corr_pairs_plus(X, y, sig=0.01, suppress_checkpoints=True):
    plus_pairs = get_corr_pairs(X, y, lambda a,b: a+b, 'plus', sig=sig, suppress_checkpoints=suppress_checkpoints)
    return plus_pairs


def get_corr_pairs_minus(X, y, sig=0.01, suppress_checkpoints=True):
    minus_pairs = get_corr_pairs(X, y, lambda a,b: a-b, 'minus', sig=sig, suppress_checkpoints=suppress_checkpoints)
    return minus_pairs
    

def get_corr_pairs_mult(X, y, sig=0.01, suppress_checkpoints=True):
    mult_pairs = get_corr_pairs(X, y, lambda a,b: a*b, 'mult', sig=sig, suppress_checkpoints=suppress_checkpoints)
    return mult_pairs


def get_all_corr_pairs(X, y, sig=0.01, suppress_checkpoints=True):
    all_pairs = get_corr_pairs_plus(X, y, sig, suppress_checkpoints)
    if not suppress_checkpoints:
        print('Plus pairs done')
    all_pairs.extend(get_corr_pairs_minus(X, y, sig, suppress_checkpoints))
    if not suppress_checkpoints:
        print('Minus pairs done')
    all_pairs.extend(get_corr_pairs_mult(X, y, sig, suppress_checkpoints))
    if not suppress_checkpoints:
        print('Mult pairs done')
    return all_pairs

The following three cells are for use if you don't already have what they generate

In [20]:
pairs = get_all_corr_pairs(X_train, y_train, sig=0.013, suppress_checkpoints=True)



In [None]:
best_pairs_df = pd.DataFrame.from_records(pairs, columns = ['i_feature', 'j_feature', 'corr_w_target', 'operation'])

In [55]:
best_pairs_df.sort_values(by='corr_w_target', key=abs, inplace=True, ascending=False)
best_pairs_df.to_csv('best-pairs.csv')
best_pairs_df.head(5)

Unnamed: 0,i_feature,j_feature,corr_w_target,operation
5279,271,518,-0.155469,minus
7005,517,518,-0.143993,minus
5278,271,517,-0.082747,minus
2233,464,526,0.046875,plus
15191,526,545,0.045942,mult


Or you can just import what the above three generate if you have the file...

In [14]:
best_pairs_df = pd.read_csv('best-pairs.csv')
best_pairs_df.drop(['Unnamed: 0'], axis=1, inplace=True)
best_pairs_df.head(5)

Unnamed: 0,i_feature,j_feature,corr_w_target,operation
0,271,518,-0.155469,minus
1,517,518,-0.143993,minus
2,271,517,-0.082747,minus
3,464,526,0.046875,plus
4,526,545,0.045942,mult


### Generating the highly correlated features
Given some `X` matrix and a DataFrame `best_pair_df` which has the stated 4-tuples, `gen_new_dataset` returns `X` with the top `n` additional features generated and added to it.

In [16]:
def gen_new_dataset(X, best_pairs_df, n=100):
    X_new = pd.DataFrame(X.copy())
    i = 'i_feature'
    j = 'j_feature'
    for index, row in best_pairs_df.iterrows():
        if index < 100:
            if row['operation'] == 'minus':
                X_new[str(row[i]) + '-' + str(row[j])] = X_new.iloc[:, row[i]] - X_new.iloc[:, row[j]]
            if row['operation'] == 'plus':
                X_new[str(row[i]) + '+' + str(row[j])] = X_new.iloc[:, row[i]] + X_new.iloc[:, row[j]]
            if row['operation'] == 'mult':
                X_new[str(row[i]) + '*' + str(row[j])] = X_new.iloc[:, row[i]] * X_new.iloc[:, row[j]]
    return X_new.to_numpy()

In [17]:
X_train_new = gen_new_dataset(X_train, best_pairs_df, 100)
X_val_new = gen_new_dataset(X_val, best_pairs_df, 100)
X_test_new = gen_new_dataset(X_test, best_pairs_df, 100)
X_new = np.concatenate([X_train_new, X_val_new, X_test_new])

In [18]:
pd.DataFrame(X_train_new).to_csv('generated-data/X-train-new.csv')
pd.DataFrame(X_val_new).to_csv('generated-data/X-val-new.csv')
pd.DataFrame(X_test_new).to_csv('generated-data/X-test-new.csv')
pd.DataFrame(X_new).to_csv('generated-data/X-new.csv')

### Feature Selection - Relief
I use an implementation of relief which can be found at https://github.com/gitter-badger/ReliefF/blob/master/ReliefF/ReliefF.py
This is *not* my implementation

In [137]:
relief = reliefF.ReliefF(n_neighbors=70, n_features_to_keep=100)

In [138]:
relief.fit(X_train_new, y_train.to_numpy())

In [161]:
pd.DataFrame(relief.top_features).to_csv('relief-top-features.csv')

### Feature Selection - Using a Random Forest

In [19]:
def convert_y_to_class(y):
    class_case = [0 if i < 2.0 else 1 for i in y]
    return class_case

In [20]:
y_train_class = convert_y_to_class(y_train)
y_val_class = convert_y_to_class(y_val)
y_test_class = convert_y_to_class(y_test)

In [21]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_new, y_train_class)

RandomForestClassifier()

In [22]:
y_pred = rf.predict(X_val_new)
confusion_matrix(y_val_class, y_pred)

array([[19185,   151],
       [  815,   943]], dtype=int64)

In [23]:
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)

In [24]:
df_importances = pd.DataFrame(std, columns=['std'])
df_importances['feature'] = df_importances.index
best_features = np.array(df_importances.sort_values(by='std', ascending=False)['feature'].head(50))

In [25]:
best_features

array([763, 765, 764,   1,  22, 269, 374, 248, 329, 375, 637, 334, 268,
       411, 647, 200, 750, 289,  70, 331, 332, 646, 505, 218, 420, 275,
       208, 519, 648, 249, 321, 582, 373, 416, 769, 335, 827, 401, 271,
       336, 831, 614, 330, 258, 219, 410, 399, 779, 209, 782], dtype=int64)

In [26]:
X_train_new2 = X_train_new[:, best_features]
X_test_new2 = X_test_new[:, best_features]
X_val_new2 = X_val_new[:, best_features]

In [27]:
rf2 = RandomForestClassifier(n_estimators=100)
rf2.fit(X_train_new2, y_train_class)

RandomForestClassifier()

In [28]:
y_pred2 = rf2.predict(X_val_new2)
print(classification_report(y_val_class, y_pred2))
confusion_matrix(y_val_class, y_pred2)

              precision    recall  f1-score   support

           0       0.98      0.99      0.98     19336
           1       0.83      0.81      0.82      1758

    accuracy                           0.97     21094
   macro avg       0.91      0.90      0.90     21094
weighted avg       0.97      0.97      0.97     21094



array([[19051,   285],
       [  333,  1425]], dtype=int64)

In [246]:
X_new2 = np.concatenate([X_train_new2, X_val_new2, X_test_new2])
pd.DataFrame(X_train_new2).to_csv('generated-data/X-train-new2.csv')
pd.DataFrame(X_val_new2).to_csv('generated-data/X-val-new2.csv')
pd.DataFrame(X_test_new2).to_csv('generated-data/X-test-new2.csv')
pd.DataFrame(X_new2).to_csv('generated-data/X-new2.csv')
pd.DataFrame(np.concatenate([y_train, y_val, y_test])).to_csv('generated-data/y-in-order-for-new2.csv')

In [13]:
pd.DataFrame(y_train).to_csv('generated-data/y-train.csv')
pd.DataFrame(y_val).to_csv('generated-data/y-val.csv')
pd.DataFrame(y_test).to_csv('generated-data/y-test.csv')