In [1]:
# Import libraries
import time
import numpy as np
import pandas as pd

In [2]:
# Read Claims data
# Only I/O failures (e.g. a missing file) get the hint below; the error is then
# re-raised so the notebook stops here instead of failing later with a
# NameError on an undefined `claims_train_data`.
try:
    claims_train_data = pd.read_csv("Data/train.csv")
    print("Claims data read successfully!")
except IOError:
    print("Dataset could not be loaded. Is the dataset missing?")
    raise

# Note: The last column 'loss' is the target/label, all other are feature columns

Claims data read successfully!


In [3]:
# Read Test data
# Only I/O failures (e.g. a missing file) get the hint below; the error is then
# re-raised so the notebook stops here instead of failing later with a
# NameError on an undefined `claims_test_data`.
try:
    claims_test_data = pd.read_csv("Data/test.csv")
    print("Test data read successfully!")
except IOError:
    print("Dataset could not be loaded. Is the dataset missing?")
    raise

Test data read successfully!


In [None]:
# Number of splits; passed as `n_splits` to the cross-validation splitter `rs`
# that is created further down.
SPLITS = 10

In [4]:
# Give the test frame a placeholder 'loss' column of NaN. After preprocessing,
# the stacked frame is split back into train/test by whether 'loss' is null.
claims_test_data['loss'] = np.nan

In [5]:
# Preview the first rows of the training data.
claims_train_data.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,1,A,B,A,B,A,A,A,A,B,...,0.718367,0.33506,0.3026,0.67135,0.8351,0.569745,0.594646,0.822493,0.714843,2213.18
1,2,A,B,A,A,A,A,A,A,B,...,0.438917,0.436585,0.60087,0.35127,0.43919,0.338312,0.366307,0.611431,0.304496,1283.6
2,5,A,B,A,A,B,A,A,A,B,...,0.289648,0.315545,0.2732,0.26076,0.32446,0.381398,0.373424,0.195709,0.774425,3005.09
3,10,B,B,A,B,A,A,A,A,B,...,0.440945,0.391128,0.31796,0.32128,0.44467,0.327915,0.32157,0.605077,0.602642,939.85
4,11,A,B,A,B,A,A,A,A,B,...,0.178193,0.247408,0.24564,0.22089,0.2123,0.204687,0.202213,0.246011,0.432606,2763.85


In [6]:
# Preview the first rows of the test data (its 'loss' column is all NaN).
claims_test_data.head()

Unnamed: 0,id,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,cont14,loss
0,4,A,B,A,A,A,A,A,A,B,...,0.466591,0.317681,0.61229,0.34365,0.38016,0.377724,0.369858,0.704052,0.392562,
1,6,A,B,A,B,A,A,A,A,B,...,0.482425,0.44376,0.7133,0.5189,0.60401,0.689039,0.675759,0.453468,0.208045,
2,9,A,B,A,B,B,A,B,A,B,...,0.212308,0.325779,0.29758,0.34365,0.30529,0.24541,0.241676,0.258586,0.297232,
3,12,A,A,A,A,B,A,A,A,A,...,0.36993,0.342355,0.40028,0.33237,0.3148,0.348867,0.341872,0.592264,0.555955,
4,15,B,A,A,A,A,B,A,A,A,...,0.398862,0.391833,0.23688,0.43731,0.50556,0.359572,0.352251,0.301535,0.825823,


In [7]:
from sklearn.preprocessing import StandardScaler
def PreProcess(train, test):
    """Encode categorical columns and standardize numeric ones on train + test.

    The two frames are stacked (train rows first) so that category codes and
    scaling statistics are shared between them.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with the same columns. Columns named 'id' and 'loss' are passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        The concatenation of `train` and `test` (original row labels kept) in
        which every object-dtype column is replaced by integer codes and every
        other column except 'id' / 'loss' is standardized with StandardScaler.
    """
    joined = pd.concat([train, test])
    for column in list(joined.columns):
        col_data = joined[column]
        if col_data.dtype == object:
            # Categories that occur in only one of the two frames are blanked
            # out. The value sets themselves are compared: an equal *number*
            # of distinct values does not mean the categories are the same.
            set_train = set(train[column].unique())
            set_test = set(test[column].unique())
            remove = set_train.symmetric_difference(set_test)
            if remove:
                unshared = col_data.isin(list(remove))
                col_data = col_data.where(~unshared)

            # Sorted factorization; missing values receive the code -1.
            joined[column] = pd.factorize(col_data.values, sort=True)[0]
        elif column != 'id' and column != 'loss':
            # A fresh scaler per column: fit_transform refits on each call
            # anyway, so nothing is shared between columns.
            scaler = StandardScaler()
            joined[column] = scaler.fit_transform(col_data.values.reshape(-1, 1))
    return joined


In [8]:
#claims_train_data.drop('id',axis=1,inplace=True)  #Remove the ID from the training set
# Preprocess train and test together; the result is a single stacked frame
# that is later split again using the 'loss' column.
PreProcess_joined = PreProcess(claims_train_data, claims_test_data)

In [9]:
# Training rows carry a real 'loss'; test rows only have the NaN placeholder.
has_loss = PreProcess_joined['loss'].notnull()
processed_claims_train_data = PreProcess_joined[has_loss]
processed_claims_test_data = PreProcess_joined[~has_loss]

In [32]:
# Extract feature (X) and target (y) columns
all_columns = processed_claims_train_data.columns
feature_cols = all_columns[1:-1].tolist()  # everything except the first (id) and last (loss) column
target_col = all_columns[-1]  # the last column is the target/label

X_all = processed_claims_train_data.loc[:, feature_cols]  # feature values for all training claims
y_all = np.log(200 + processed_claims_train_data[target_col])  # targets on the log(loss + 200) scale

In [34]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Base models for the blend: two random forests and two extra-trees ensembles.
# Each one gets its own fixed random_state so that re-running the notebook
# reproduces the same models, while the otherwise identical pairs still differ.
clfs = [RandomForestRegressor(n_estimators=100, n_jobs=-1, criterion='mae', random_state=0),
        RandomForestRegressor(n_estimators=100, n_jobs=-1, criterion='mae', random_state=1),
        ExtraTreesRegressor(n_estimators=100, n_jobs=-1, criterion='mae', random_state=2),
        ExtraTreesRegressor(n_estimators=100, n_jobs=-1, criterion='mae', random_state=3)]

In [21]:
print("Creating train and test sets for blending.")

# One column per base model; rows follow the processed train / test frames.
n_models = len(clfs)
dataset_blend_train = np.zeros((len(processed_claims_train_data), n_models))
dataset_blend_test = np.zeros((len(processed_claims_test_data), n_models))

Creating train and test sets for blending.


In [22]:
from sklearn.model_selection import KFold, ShuffleSplit

# Stacking needs every training row to be held out exactly once.
# ShuffleSplit draws independent random test sets, so some rows would be
# predicted several times (later folds overwriting earlier ones) and others
# never (left at 0 in dataset_blend_train). KFold partitions the rows instead;
# each fold holds out 1/SPLITS of the data.
rs = KFold(n_splits=SPLITS, shuffle=True, random_state=42)

3

In [27]:
# Build the feature matrix for the test rows, using the same column slice as
# for training (everything between the first and the last column).
test_feature_cols = processed_claims_test_data.columns[1:-1].tolist()
test_data = processed_claims_test_data.loc[:, test_feature_cols].values

In [None]:
# For each base model: fit on the training part of every split, store its
# predictions for the held-out rows in dataset_blend_train[:, j], and average
# its test-set predictions over all splits into dataset_blend_test[:, j].
# Models are fitted on log(loss + 200), so predictions are mapped back with
# exp(.) - 200.
n_splits = rs.get_n_splits(X_all)
for j, clf in enumerate(clfs):
    print("%d %s" % (j, clf))
    # Sized from test_data (the matrix that is actually predicted on) so that
    # it always matches the row count of dataset_blend_test.
    dataset_blend_test_j = np.zeros((test_data.shape[0], n_splits))
    for i, (train, test) in enumerate(rs.split(X_all)):
        print("Fold %d" % i)
        X_train = X_all.iloc[train]
        y_train = y_all.iloc[train]
        X_test = X_all.iloc[test]
        y_test = y_all.iloc[test]
        clf.fit(X_train, y_train)
        y_submission = np.exp(clf.predict(X_test)) - 200
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = np.exp(clf.predict(test_data)) - 200
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)

In [29]:
# Second stage of the blend: learn how to combine the base-model predictions.
# 'loss' is a continuous target, so the blender has to be a regressor; a
# classifier with predict_proba and a rescaling to [0, 1] does not apply here.
from sklearn.linear_model import LinearRegression

print("")
print("Blending.")

# The first-stage predictions are already on the original loss scale
# (exp(.) - 200 was applied), so the blender is fitted against the raw loss.
blend_target = processed_claims_train_data['loss'].values

# Use only rows that received a held-out prediction from every base model;
# rows that were never held out keep their initial 0 and would distort the fit.
has_oof = (dataset_blend_train != 0).all(axis=1)

clf = LinearRegression()
clf.fit(dataset_blend_train[has_oof], blend_target[has_oof])
y_submission = clf.predict(dataset_blend_test)

print("Saving Results.")
submission = pd.DataFrame({'id': processed_claims_test_data['id'].values,
                           'loss': y_submission})
submission.to_csv('submission.csv', index=False, columns=['id', 'loss'])

0 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2
1 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=6,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=5, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
Fold 0
Fold 1
Fold 2


In [31]:
# Inspect the averaged test-set predictions (one column per base model).
dataset_blend_test

array([[ 1931.11767,  1932.22793],
       [ 2084.2082 ,  1943.98066],
       ..., 
       [ 1446.27312,  1435.59875],
       [ 3940.88002,  3904.07255]])

In [27]:
# write the DataFrame to a CSV file that can be submitted to Kaggle
# NOTE(review): `test_pred` is not assigned in any of the visible notebook
# cells - confirm where it is supposed to come from before running this cell.
pd.DataFrame({'id':processed_claims_test_data.id, 'loss':test_pred}).set_index('id').to_csv('MLP GRID 50 layer.csv')