In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
spark = SparkSession.builder.master("local[*]").appName("practiceML").getOrCreate()
spark

In [3]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [4]:
fire_occurence = spark.read.csv("FireOccurence.data.csv", header=True, inferSchema=True)
fire_occurence = fire_occurence.drop("_id", "Serial" ,"FireCategory" ,"FireYear" ,"Area" ,"DistrictName" ,"UnitName" ,"FullFireNumber" ,"FireName" ,"Cause_Comments" ,"Lat_DD" ,"Long_DD" ,"LatLongDD" ,"FO_LandOwnType" ,"Twn" ,"Rng" ,"Sec" ,"Subdiv" ,"LandmarkLocation" ,"County" ,"RegUseZone","RegUseRestriction" ,"Ign_DateTime" ,"ReportDateTime" ,"Discover_DateTime" ,"Control_DateTime" ,"CreationDate" ,"ModifiedDate" ,"DistrictCode" ,"UnitCode" ,"DistFireNumber")
fire_occurence = fire_occurence.na.drop()
fire_occurence.show()
# fire_occurence.where(fire_occurence.HumanOrLightning == "Lightning").count()
# fire_occurence.where((fire_occurence.EstTotalAcres >= 300) & (fire_occurence.Size_class == "D")).show(100)
# fire_occurence.select("GeneralCause").distinct().show(100)

+----------+-------------+---------------+----------------+---------------+--------------+--------------------+----------------------+
|Size_class|EstTotalAcres|Protected_Acres|HumanOrLightning|        CauseBy|  GeneralCause|       SpecificCause|Industrial_Restriction|
+----------+-------------+---------------+----------------+---------------+--------------+--------------------+----------------------+
|         B|         0.75|           0.75|       Lightning|      Lightning|     Lightning|           Lightning|  Does Not Apply - ...|
|         C|         80.0|           80.0|       Lightning|      Lightning|     Lightning|           Lightning|  Does Not Apply - ...|
|         A|          0.1|            0.1|           Human|       Motorist|       Smoking|Other - Smoker Re...|  Lvl 3 Restricted ...|
|         A|         0.01|           0.01|           Human|       Motorist|    Recreation|           Fireworks|  Lvl 1 Fire Season...|
|         A|         0.01|           0.01|       Lightn

In [5]:
fire_occurence.groupBy("HumanOrLightning").count().show()

+----------------+-----+
|HumanOrLightning|count|
+----------------+-----+
|       Lightning| 6249|
|           Human|16975|
+----------------+-----+



In [6]:
fire_occurence.groupBy("Size_class").count().show()

+----------+-----+
|Size_class|count|
+----------+-----+
|         F|   75|
|         E|   97|
|         B| 5065|
|         D|  184|
|         C|  836|
|         A|16857|
|         G|  110|
+----------+-----+



In [7]:
fire_occurence.printSchema()

root
 |-- Size_class: string (nullable = true)
 |-- EstTotalAcres: double (nullable = true)
 |-- Protected_Acres: double (nullable = true)
 |-- HumanOrLightning: string (nullable = true)
 |-- CauseBy: string (nullable = true)
 |-- GeneralCause: string (nullable = true)
 |-- SpecificCause: string (nullable = true)
 |-- Industrial_Restriction: string (nullable = true)



In [8]:
indexer = StringIndexer(inputCols=["Size_class", "HumanOrLightning", "CauseBy", "GeneralCause", "SpecificCause", "Industrial_Restriction"], 
                        outputCols= ["{}_num".format(c) for c in ["Size_class", "HumanOrLightning", "CauseBy", "GeneralCause", "SpecificCause", "Industrial_Restriction"]])
indexed = indexer.fit(fire_occurence).transform(fire_occurence)
indexed.show()

+----------+-------------+---------------+----------------+---------------+--------------+--------------------+----------------------+--------------+--------------------+-----------+----------------+-----------------+--------------------------+
|Size_class|EstTotalAcres|Protected_Acres|HumanOrLightning|        CauseBy|  GeneralCause|       SpecificCause|Industrial_Restriction|Size_class_num|HumanOrLightning_num|CauseBy_num|GeneralCause_num|SpecificCause_num|Industrial_Restriction_num|
+----------+-------------+---------------+----------------+---------------+--------------+--------------------+----------------------+--------------+--------------------+-----------+----------------+-----------------+--------------------------+
|         B|         0.75|           0.75|       Lightning|      Lightning|     Lightning|           Lightning|  Does Not Apply - ...|           1.0|                 1.0|        0.0|             0.0|              0.0|                       0.0|
|         C|        

In [9]:
assembler = VectorAssembler(inputCols=["EstTotalAcres", "Protected_Acres","Size_class_num", "CauseBy_num", "GeneralCause_num", "SpecificCause_num", "Industrial_Restriction_num"], 
                            outputCol="features")

output = assembler.transform(indexed)
indexed.select("EstTotalAcres", "Protected_Acres","Size_class_num", "CauseBy_num", "GeneralCause_num", "SpecificCause_num", "Industrial_Restriction_num").printSchema()

root
 |-- EstTotalAcres: double (nullable = true)
 |-- Protected_Acres: double (nullable = true)
 |-- Size_class_num: double (nullable = false)
 |-- CauseBy_num: double (nullable = false)
 |-- GeneralCause_num: double (nullable = false)
 |-- SpecificCause_num: double (nullable = false)
 |-- Industrial_Restriction_num: double (nullable = false)



In [10]:
output.select("features", "HumanOrLightning_num").show(truncate=False)

+---------------------------------+--------------------+
|features                         |HumanOrLightning_num|
+---------------------------------+--------------------+
|(7,[0,1,2],[0.75,0.75,1.0])      |1.0                 |
|(7,[0,1,2],[80.0,80.0,2.0])      |1.0                 |
|[0.1,0.1,0.0,4.0,5.0,25.0,4.0]   |0.0                 |
|[0.01,0.01,0.0,4.0,3.0,17.0,1.0] |0.0                 |
|(7,[0,1,6],[0.01,0.01,4.0])      |1.0                 |
|(7,[0,1],[0.01,0.01])            |1.0                 |
|(7,[3,4,5],[3.0,2.0,7.0])        |0.0                 |
|[0.01,0.01,0.0,6.0,1.0,4.0,1.0]  |0.0                 |
|[0.01,0.01,0.0,4.0,4.0,1.0,0.0]  |0.0                 |
|[3.0,3.0,1.0,2.0,3.0,17.0,1.0]   |0.0                 |
|(7,[0,1],[0.1,0.1])              |1.0                 |
|[0.01,0.01,0.0,1.0,4.0,24.0,3.0] |0.0                 |
|[1.5,1.5,1.0,0.0,0.0,0.0,4.0]    |1.0                 |
|[0.01,0.01,0.0,11.0,4.0,21.0,1.0]|0.0                 |
|[0.1,0.1,0.0,4.0,1.0,6.0,4.0] 

In [11]:
df_model = output.select("features", "HumanOrLightning_num")
df_model.show(truncate=False)

+---------------------------------+--------------------+
|features                         |HumanOrLightning_num|
+---------------------------------+--------------------+
|(7,[0,1,2],[0.75,0.75,1.0])      |1.0                 |
|(7,[0,1,2],[80.0,80.0,2.0])      |1.0                 |
|[0.1,0.1,0.0,4.0,5.0,25.0,4.0]   |0.0                 |
|[0.01,0.01,0.0,4.0,3.0,17.0,1.0] |0.0                 |
|(7,[0,1,6],[0.01,0.01,4.0])      |1.0                 |
|(7,[0,1],[0.01,0.01])            |1.0                 |
|(7,[3,4,5],[3.0,2.0,7.0])        |0.0                 |
|[0.01,0.01,0.0,6.0,1.0,4.0,1.0]  |0.0                 |
|[0.01,0.01,0.0,4.0,4.0,1.0,0.0]  |0.0                 |
|[3.0,3.0,1.0,2.0,3.0,17.0,1.0]   |0.0                 |
|(7,[0,1],[0.1,0.1])              |1.0                 |
|[0.01,0.01,0.0,1.0,4.0,24.0,3.0] |0.0                 |
|[1.5,1.5,1.0,0.0,0.0,0.0,4.0]    |1.0                 |
|[0.01,0.01,0.0,11.0,4.0,21.0,1.0]|0.0                 |
|[0.1,0.1,0.0,4.0,1.0,6.0,4.0] 

In [12]:
train_df, test_df = df_model.randomSplit([0.7, 0.3])
print(train_df.count(), test_df.count())

16386 6838


In [13]:
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(labelCol="HumanOrLightning_num").fit(train_df)
log_reg

LogisticRegressionModel: uid=LogisticRegression_5718fbeabc7b, numClasses=2, numFeatures=7

In [14]:
train_result = log_reg.evaluate(train_df).predictions
train_result.show(100)

+--------------------+--------------------+--------------------+--------------------+----------+
|            features|HumanOrLightning_num|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|      (7,[0],[0.01])|        

In [15]:
result = log_reg.evaluate(test_df).predictions
result.show(10000)

+--------------------+--------------------+--------------------+--------------------+----------+
|            features|HumanOrLightning_num|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+----------+
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|           (7,[],[])|                 1.0|[-17.755410913384...|[1.94501523728089...|       1.0|
|      (7,[0],[0.01])|                 1.0|[-17.755412382950...|[1.94501237895509...|       1.0|
|      (7,[0],[0.01])|                 1.0|[-17.755412382950...|[1.94501237895509...|       1.0|
|       (7,[0],[0.1])|                 1.0|[-17.755425609043...|[1.94498665421189...|       1.0|
|      (7,[0],[0.25])|                 1.0|[-17.755447652530...|[1.94494378039597...|       1.0|
|      (7,[0],[0.25])|        

In [16]:
tp = result[(result.HumanOrLightning_num == 1) & (result.prediction == 1)].count()
tn = result[(result.HumanOrLightning_num == 0) & (result.prediction == 0)].count()
fp = result[(result.HumanOrLightning_num == 0) & (result.prediction == 1)].count()
fn = result[(result.HumanOrLightning_num == 1) & (result.prediction == 0)].count()
print(
    "True-Positif: ",tp, "\n",
    "True-Negatif: ",tn, "\n",
    "False-Positif: ",fp, "\n",
    "False-Negatif: ",fn, "\n",
      )

True-Positif:  1872 
 True-Negatif:  4966 
 False-Positif:  0 
 False-Negatif:  0 



In [17]:
accuracy = float((tp+tn)/(result.count()))
recall = float(tn)/(tp+tn)
print("Accuracy",accuracy,"\nRecall",recall)

Accuracy 1.0 
Recall 0.7262357414448669
