In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder,CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier                              
from pyspark.ml.regression import DecisionTreeRegressor,RandomForestRegressor
from pyspark.sql import SQLContext
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import model_selection
from sklearn import metrics
import pandas as pd

In [2]:
# Please look at the report for more details.
# My approach to this competition was to test different models, improve them, then select the best one and try to perfect it.
# I tried some feature selection and cleaning, for instance rebalancing the dataset since the cover types are not equally represented, but it did not improve the score. I suppose this is because
# the cover types are imbalanced in the same way in the test set (some cover types have 10 times more data points than others).
# After trying the logistic regression model, I implemented cross-validation to optimise its parameters. I saw that the result could still be improved, so I tried other algorithms.
# I used a random forest, then a decision tree. The latter gave me the best result on MLlib (within the limits of Databricks).
# To improve my score I then tried some of the sklearn models. The best results I found were with the ExtraTrees algorithm: I reached a score of 0.95967 on Kaggle with it (code of model 3 in the file "SDI_701_sklearn_Valentin_Larrieu").
# Databricks limited the number of estimators I could use (too many resources needed), so I trained my model locally on my PC to slightly improve the results (gaining about .0001%).
#
# You will find in this notebook the code for:
#      -[Model 1] The best model I could build with MLlib (a decision tree with cross-validation)
#      -[Model 2] The best model I could build with sklearn (an ExtraTrees classifier) on Databricks, before the improvement I made locally
#  

In [3]:
# Fixed random seed; it is passed to `randomSplit` below so that the
# train / hold-out split of the labelled data is repeatable between runs.
SEED = 1234

In [4]:
# Load the train and test CSV files from /FileStore/tables.
# Reader settings shared by both files: first row is a header, "?" is read as
# null, column types are inferred, and "," is the field separator.
csv_reader_options = dict(
    header="true",
    nullValue="?",
    inferSchema="true",
    sep=",",
)

df_train = spark.read.options(**csv_reader_options).csv("/FileStore/tables/train_set-51e11.csv")
df_test = spark.read.options(**csv_reader_options).csv("/FileStore/tables/test_set-b5f57.csv")

# Seeded 90% / 10% split of the training file (the ratio that gave the best results)
split_weights = [0.90, 0.10]
trainData, testData = df_train.randomSplit(split_weights, seed=SEED)



In [5]:
# Sanity check: report the dimensions of the full train and test DataFrames
train_rows = df_train.count()
train_cols = len(df_train.columns)
print('Train data size: {} rows, {} columns'.format(train_rows, train_cols))

test_rows = df_test.count()
test_cols = len(df_test.columns)
print('Test data size: {} rows, {} columns'.format(test_rows, test_cols))

In [6]:
# Input columns for the model, in the order they are assembled:
# ten individually named columns, then Wilderness_Area1..4, then Soil_Type1..40.
named_feature_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]
wilderness_columns = ["Wilderness_Area{}".format(k) for k in range(1, 5)]
soil_type_columns = ["Soil_Type{}".format(k) for k in range(1, 41)]

# Packs all input columns into a single vector column named "features"
vector_assembler = VectorAssembler(
    inputCols=named_feature_columns + wilderness_columns + soil_type_columns,
    outputCol="features",
)


In [7]:
############# Model 1 Cross validation and decision tree

# Decision tree classifier on the assembled "features" column.
# The seed is fixed so that repeated runs are comparable.
dt = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="features", cacheNodeIds=True, seed=SEED)

# Evaluator used both inside the cross-validation and for the final report.
# Note: the metric is the F1 score, not plain accuracy.
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction", metricName="f1")

# Parameter grid for the cross-validation.
# Each parameter has a single value because these are the values kept after
# the earlier optimisation runs; add more values here to search again.
paramGridDt = ParamGridBuilder().\
            addGrid(dt.maxDepth, [30]).\
            addGrid(dt.maxBins, [200]).\
            addGrid(dt.impurity, ["entropy"]).\
            addGrid(dt.minInstancesPerNode, [3]).\
            build()

# Pipeline: assemble the feature vector, then fit the decision tree
pipelineDt = Pipeline(stages=[vector_assembler, dt])

# 5-fold CrossValidator; the seed fixes the fold assignment so that the
# cross-validation is reproducible from one run to the next.
cv3 = CrossValidator(estimator=pipelineDt, estimatorParamMaps=paramGridDt, evaluator=evaluator, numFolds=5, seed=SEED)

# Fit the cross-validation model on the 90% training split
cvModel3 = cv3.fit(trainData)

# Predict on the data the model was trained on and on the 10% hold-out split
pred_training_cv3 = cvModel3.transform(trainData)
pred_test_cv3 = cvModel3.transform(testData) #test

# Evaluation (F1 score)
# performance on training data
print("Train data performance Decision Tree = ", evaluator.evaluate(pred_training_cv3))

# performance on the hold-out data, i.e. data the model has not seen
print("Test data performance Decision Tree = ", evaluator.evaluate(pred_test_cv3))



In [8]:
# Export the predictions of our best MLlib model so they can be submitted on kaggle
model = cvModel3

# Score the Kaggle test file (df_test)
raw_predictions = model.transform(df_test)

# Expose the predicted class as an integer column named "Cover_Type"
cover_type_int = raw_predictions["prediction"].cast("int")
predictions = raw_predictions.withColumn("Cover_Type", cover_type_int)

# Keep only Id and Cover_Type, in a single partition so a single part file is written
kaggle_submission_dt = predictions.repartition(1).select('Id', 'Cover_Type')

submission_writer = (
    kaggle_submission_dt.write
    .format('com.databricks.spark.csv')
    .options(header='true')
    .mode('overwrite')
)
submission_writer.save('/FileStore/kaggle-submission')

In [9]:
# List the contents of the submission folder to find the path of the written file
submission_dir_listing = dbutils.fs.ls("dbfs:/FileStore/kaggle-submission")
display(submission_dir_listing)


path,name,size
dbfs:/FileStore/kaggle-submission/_SUCCESS,_SUCCESS,0
dbfs:/FileStore/kaggle-submission/_committed_5702443627356436892,_committed_5702443627356436892,199
dbfs:/FileStore/kaggle-submission/_committed_5868879643784577909,_committed_5868879643784577909,199
dbfs:/FileStore/kaggle-submission/_committed_9174243400005013254,_committed_9174243400005013254,199
dbfs:/FileStore/kaggle-submission/_committed_vacuum4319346291273316749,_committed_vacuum4319346291273316749,129
dbfs:/FileStore/kaggle-submission/_started_9174243400005013254,_started_9174243400005013254,0
dbfs:/FileStore/kaggle-submission/part-00000-tid-9174243400005013254-7650f166-2638-4abb-a192-5506563ab2e5-3896-c000.csv,part-00000-tid-9174243400005013254-7650f166-2638-4abb-a192-5506563ab2e5-3896-c000.csv,2039369


In [10]:
## Model 2 : ExtraTrees with Sklearn

# sklearn needs pandas DataFrames, so convert the Spark DataFrames
df_train2 = df_train.toPandas()
df_test2 = df_test.toPandas()

# Target column
Y = df_train2.Cover_Type
# Features: drop the label, and the Id column because it carries no useful information
X = df_train2.drop(['Id','Cover_Type'],axis=1)
X_test_input = df_test2.drop('Id',axis=1)

# 90% / 10% split; random_state is fixed so the hold-out set is the same on every run
X_train,X_test,Y_train,Y_test = model_selection.train_test_split(X,Y,test_size=0.1,random_state=SEED)

# ExtraTrees model with the parameters we optimised; seeded so that the
# randomised trees (and therefore the reported scores) are reproducible
et = ExtraTreesClassifier(n_estimators=200, criterion= 'entropy', random_state=SEED)

In [11]:
# Train the ExtraTrees model on the training part of the split
et.fit(X_train, Y_train)

# Predict the labels of the held-out part
Y_hat = et.predict(X_test)

# Per-class report, then overall accuracy
holdout_report = metrics.classification_report(Y_test, Y_hat)
print(holdout_report)
holdout_accuracy = metrics.accuracy_score(Y_test, Y_hat)
print("ExtraTrees Accuracy :", holdout_accuracy)

In [12]:
# Retrain on the entire labelled set to have the best model for kaggle.
# Same hyper-parameters as the evaluated model; random_state is fixed so the
# exported predictions can be regenerated identically.
et2 = ExtraTreesClassifier(n_estimators=200, criterion= 'entropy', random_state=SEED)
et2.fit(X,Y)
Y_hat_export = et2.predict(X_test_input)

# Build the submission frame with an explicit column order (Id, Cover_Type)
# instead of relying on a descending alphabetical sort of the column names.
export_df = pd.DataFrame(
    {'Id': df_test2.Id.values, 'Cover_Type': Y_hat_export},
    columns=['Id', 'Cover_Type'],
)
# Back to a Spark DataFrame so it can be written with the Spark CSV writer
df_export_2 = sqlContext.createDataFrame(export_df)



In [13]:
# Write the ExtraTrees predictions (Id, Cover_Type) as one CSV part file with a header
kaggle_submission_et = df_export_2.repartition(1).select('Id', 'Cover_Type')

submission2_writer = (
    kaggle_submission_et.write
    .format('com.databricks.spark.csv')
    .options(header='true')
    .mode('overwrite')
)
submission2_writer.save('/FileStore/kaggle-submission2')

In [14]:
# List the contents of the second submission folder to find the path of the written file
submission2_dir_listing = dbutils.fs.ls("dbfs:/FileStore/kaggle-submission2")
display(submission2_dir_listing)

In [15]:
## Model 3 Local : ExtraTrees with Sklearn => look at the file "SDI_701_sklearn_Valentin_Larrieu"
