In [1]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import model_selection
from sklearn import metrics
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectFpr, f_regression, f_classif
from sklearn.preprocessing import StandardScaler

  from numpy.core.umath_tests import inner1d


In [2]:
SEED = 1234

This code is the best model I could build. Please look at the file "SD701_Valentin_Larrieu_Report.docx" 
for more informations

## Model 3 : ExtraTrees with Sklearn

In [54]:
# We need pandas dataframe for sklearn
df_train = pd.read_csv('C:\\Users\\Orion\\Documents\\MS-BGD\\BigDataMining\\train-set.csv')
df_test = pd.read_csv('C:\\Users\\Orion\\Documents\\MS-BGD\\BigDataMining\\test-set.csv')


Y = df_train.Cover_Type

# We drop the ID column because it do not give usefull information
X = df_train.drop(['Id','Cover_Type'],axis=1)
X_test_input = df_test.drop('Id',axis=1)

#Feature creation
X["distance"] = np.sqrt(X.Horizontal_Distance_To_Hydrology**2 + X.Vertical_Distance_To_Hydrology**2)
X["High"] = X.Elevation+ X.Vertical_Distance_To_Hydrology 
X["Shade_mean"] = (X.Hillshade_9am+X.Hillshade_Noon+X.Hillshade_3pm)/3
X["slope_shade"] = X.Slope/ X.Shade_mean
X["elevation_shade"] = X.Elevation/ X.Shade_mean
X["slope_elevation"] = X.Slope/ X.Elevation
X['Hydro_slope'] = X.Vertical_Distance_To_Hydrology / X.Horizontal_Distance_To_Hydrology

# We try to create features that can be separated verticaly
X['Hydro_elev']=X.Elevation - 0.2 * X.Horizontal_Distance_To_Hydrology
X['Road_elev']=X.Elevation - 0.05 * X.Horizontal_Distance_To_Roadways
X['Hydro_elev_vert']=X.Elevation - X.Vertical_Distance_To_Hydrology
X['Horiz_mean']=(X.Horizontal_Distance_To_Fire_Points + X.Horizontal_Distance_To_Hydrology + X.Horizontal_Distance_To_Roadways) / 3
X['Horiz_fire_hydr']=(X.Horizontal_Distance_To_Fire_Points + X.Horizontal_Distance_To_Hydrology) / 2
X['Horiz_hydr_road']=(X.Horizontal_Distance_To_Roadways + X.Horizontal_Distance_To_Hydrology) / 2
X['Horiz_road_fire']=(X.Horizontal_Distance_To_Roadways + X.Horizontal_Distance_To_Hydrology) / 2
X['Horiz_diff_fire_hydr']=abs(X.Horizontal_Distance_To_Fire_Points - X.Horizontal_Distance_To_Hydrology)
X['Horiz_diff_hydr_road']=abs(X.Horizontal_Distance_To_Roadways - X.Horizontal_Distance_To_Hydrology)
X['Horiz_diff_road_fire']=abs(X.Horizontal_Distance_To_Roadways - X.Horizontal_Distance_To_Hydrology)


X_test_input["distance"] = np.sqrt(X_test_input.Horizontal_Distance_To_Hydrology**2 + X_test_input.Vertical_Distance_To_Hydrology**2)
X_test_input["High"] = X_test_input.Elevation+ X_test_input.Vertical_Distance_To_Hydrology 
X_test_input["Shade_mean"] = (X_test_input.Hillshade_9am+X_test_input.Hillshade_Noon+X_test_input.Hillshade_3pm)/3
X_test_input["slope_shade"] = X_test_input.Slope/ X_test_input.Shade_mean
X_test_input["elevation_shade"] = X_test_input.Elevation/ X_test_input.Shade_mean
X_test_input["slope_elevation"] = X_test_input.Slope/ X_test_input.Elevation 
X_test_input['Hydro_slope'] = X_test_input.Vertical_Distance_To_Hydrology / X_test_input.Horizontal_Distance_To_Hydrology
X_test_input['Hydro_elev']=X_test_input.Elevation - 0.2 * X_test_input.Horizontal_Distance_To_Hydrology
X_test_input['Road_elev']=X_test_input.Elevation - 0.05 * X_test_input.Horizontal_Distance_To_Roadways
X_test_input['Hydro_elev_vert']=X_test_input.Elevation - X_test_input.Vertical_Distance_To_Hydrology
X_test_input['Horiz_mean']=(X_test_input.Horizontal_Distance_To_Fire_Points + X_test_input.Horizontal_Distance_To_Hydrology + X_test_input.Horizontal_Distance_To_Roadways) / 3
X_test_input['Horiz_fire_hydr']=(X_test_input.Horizontal_Distance_To_Fire_Points + X_test_input.Horizontal_Distance_To_Hydrology) / 2
X_test_input['Horiz_hydr_road']=(X_test_input.Horizontal_Distance_To_Roadways + X_test_input.Horizontal_Distance_To_Hydrology) / 2
X_test_input['Horiz_hydr_road']=(X_test_input.Horizontal_Distance_To_Roadways + X_test_input.Horizontal_Distance_To_Hydrology) / 2
X_test_input['Horiz_road_fire']=(X_test_input.Horizontal_Distance_To_Roadways + X_test_input.Horizontal_Distance_To_Hydrology) / 2
X_test_input['Horiz_diff_fire_hydr']=abs(X_test_input.Horizontal_Distance_To_Fire_Points - X_test_input.Horizontal_Distance_To_Hydrology)
X_test_input['Horiz_diff_hydr_road']=abs(X_test_input.Horizontal_Distance_To_Roadways - X_test_input.Horizontal_Distance_To_Hydrology)
X_test_input['Horiz_diff_road_fire']=abs(X_test_input.Horizontal_Distance_To_Roadways - X_test_input.Horizontal_Distance_To_Hydrology)

# We remove the infinite values the column division could have created
X.Shade_mean=X.Shade_mean.map(lambda x: 0 if np.isinf(x) else x)
X.slope_shade=X.slope_shade.map(lambda x: 0 if np.isinf(x) else x)
X.elevation_shade=X.elevation_shade.map(lambda x: 0 if np.isinf(x) else x)
X.Hydro_slope=X.Hydro_slope.map(lambda x: 0 if np.isinf(x) else x)
X.slope_elevation=X.slope_elevation.map(lambda x: 0 if np.isinf(x) else x)

X_test_input.Shade_mean=X_test_input.Shade_mean.map(lambda x: 0 if np.isinf(x) else x)
X_test_input.slope_shade=X_test_input.slope_shade.map(lambda x: 0 if np.isinf(x) else x)
X_test_input.elevation_shade=X_test_input.elevation_shade.map(lambda x: 0 if np.isinf(x) else x)
X_test_input.Hydro_slope=X_test_input.Hydro_slope.map(lambda x: 0 if np.isinf(x) else x)
X_test_input.slope_elevation=X_test_input.slope_elevation.map(lambda x: 0 if np.isinf(x) else x)

X[X==np.inf] = np.nan
X.fillna(X.mean(), inplace=True)
X_test_input[X_test_input==np.inf] = np.nan
X_test_input.fillna(X_test_input.mean(), inplace=True)

#Feature selection
print("shape before drop", X.shape)

# Normalisation of the data
sc = StandardScaler()
model_centered = sc.fit(X)
X = model_centered.transform(X)
model_centered = sc.fit(X_test_input)
X_test_input = model_centered.transform(X_test_input)

# Feature selection
sel = SelectFpr(f_regression,alpha=0.000001)
model_sel = sel.fit(X,Y)
X = sel.transform(X)
X_test_input = sel.transform(X_test_input)

print("shape after drop", X.shape)

# We split our data
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.1)


shape before drop (528720, 71)
shape after drop (528720, 63)


In [66]:
# We fit our model
et = AdaBoostClassifier(ExtraTreesClassifier(n_estimators=300, criterion= 'entropy', n_jobs = -1, warm_start=True, max_features = 19), n_estimators=300, learning_rate=0.001, algorithm='SAMME.R')

et.fit(X_train,Y_train)

# We use it to predict our output
Y_hat = et.predict(X_test)

# We print the results
print(metrics.classification_report(Y_test,Y_hat))
print("ExtraTrees Accuracy :", metrics.accuracy_score(Y_test,Y_hat))

             precision    recall  f1-score   support

          1       0.96      0.95      0.96     19278
          2       0.96      0.97      0.96     25709
          3       0.95      0.96      0.96      3325
          4       0.90      0.89      0.89       230
          5       0.91      0.82      0.86       899
          6       0.94      0.91      0.92      1542
          7       0.97      0.95      0.96      1889

avg / total       0.96      0.96      0.96     52872

ExtraTrees Accuracy : 0.9578037524587684


In [39]:
# We retrain our model with our entire set to have the best model for kaggle
et2 = AdaBoostClassifier(ExtraTreesClassifier(n_estimators=300, criterion= 'entropy', n_jobs = -1, warm_start=True, max_features = 19), n_estimators=300, learning_rate=0.01, algorithm='SAMME.R')

et2.fit(X,Y)
Y_hat_export2 = et2.predict(X_test_input)

export_df2 = pd.DataFrame({'Id':df_test.Id.values,'Cover_Type':Y_hat_export2}).sort_index(ascending=False,axis=1)

In [40]:
export_df2.to_csv('C:\\Users\\Orion\\Documents\\MS-BGD\\BigDataMining\\submission_local.csv',index=False)