In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand


In [2]:
spark = SparkSession.builder.appName('randomForest').getOrCreate()

In [3]:
dfInfo = spark.read.csv('/Users/chenyuanshan/temp/data/data/featureMatrix/withHeader.csv', inferSchema=True,header=True)

In [4]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler

In [5]:
dfInfoAssembler = VectorAssembler(inputCols=['age_range','gender','total_log','click','add_chart','buy','favourite', \
                                            'store_buy_rate', 'store_rate'],outputCol='features')

In [6]:
dfInfo = dfInfoAssembler.transform(dfInfo)

In [7]:
dfInfo.select(['features', 'label']).show(10,False)

+-------------------------------------------------------------------------+-----+
|features                                                                 |label|
+-------------------------------------------------------------------------+-----+
|[6.0,0.0,62.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.08940820834786653]    |0    |
|[3.0,0.0,591.0,16.0,0.0,1.0,0.0,0.0625,0.09346991037131884]              |0    |
|[6.0,1.0,537.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.12137006701414745]   |0    |
|[0.0,0.0,49.0,3.0,0.0,2.0,0.0,0.6666666666666666,0.07458405048766495]    |0    |
|[4.0,1.0,182.0,0.0,0.0,1.0,0.0,0.6666666666666666,0.12729948491537896]   |0    |
|[4.0,0.0,149.0,2.0,0.0,1.0,0.0,0.5,0.13248090925235306]                  |0    |
|[6.0,0.0,457.0,0.0,0.0,1.0,0.0,0.5,0.1026058631921824]                   |0    |
|[2.0,0.0,128.0,7.0,0.0,1.0,0.0,0.14285714285714285,0.06504520137703268]  |0    |
|[6.0,1.0,274.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.13646485199831074]   |0    |
|[4.0,0.0,307.0,

In [8]:
dfInfoModel = dfInfo.select(['features', 'label'])

In [9]:
training,test = dfInfoModel.randomSplit([0.75,0.25])

In [10]:
from pyspark.ml.classification import RandomForestClassifier

In [11]:
rfModel = RandomForestClassifier(labelCol='label').fit(training)

In [12]:
testRslt = rfModel.transform(test)

In [13]:
TP = testRslt.filter(testRslt['prediction'] == 1).filter(testRslt['label'] == 1).count()
FN = testRslt.filter(testRslt['prediction'] == 0).filter(testRslt['label'] == 1).count()
TN = testRslt.filter(testRslt['prediction'] == 0).filter(testRslt['label'] == 0).count()
FP = testRslt.filter(testRslt['prediction'] == 1).filter(testRslt['label'] == 0).count()

In [17]:
acc =(TP+TN)/(TP+TN+FP+FN)
print(acc)

0.9396709323583181


In [19]:
recall = TP/(TP+TN)
print(recall, TP)

0.0 0


In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
acc2 = MulticlassClassificationEvaluator(labelCol='label', metricName='accuracy').evaluate(testRslt)

In [23]:
auc = BinaryClassificationEvaluator(labelCol='label').evaluate(testRslt)
print('acc[{}], auc[{}]'.format(acc2,auc))

acc[0.9396709323583181], auc[0.619954063732903]


In [15]:
testData = \
spark.read.csv('/Users/chenyuanshan/temp/data/data/testMatrix/withHeader.csv', inferSchema=True,header=True)

In [16]:
testAssembler = VectorAssembler(inputCols=['age_range','gender','total_log','click','add_chart','buy','favourite', \
                                            'store_buy_rate', 'store_rate'],outputCol='features')

In [17]:
dftest = dfInfoAssembler.transform(testData)

In [18]:
changeName = StringIndexer(inputCol='prob', outputCol='label').fit(dftest)
testFinal = changeName.transform(dfInfo)

In [19]:
testFinal.select(['features', 'label']).show(10,False)

+-------------------------------------------------------------------------+-----+
|features                                                                 |label|
+-------------------------------------------------------------------------+-----+
|[6.0,0.0,62.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.08940820834786653]    |0    |
|[3.0,0.0,591.0,16.0,0.0,1.0,0.0,0.0625,0.09346991037131884]              |0    |
|[6.0,1.0,537.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.12137006701414745]   |0    |
|[0.0,0.0,49.0,3.0,0.0,2.0,0.0,0.6666666666666666,0.07458405048766495]    |0    |
|[4.0,1.0,182.0,0.0,0.0,1.0,0.0,0.6666666666666666,0.12729948491537896]   |0    |
|[4.0,0.0,149.0,2.0,0.0,1.0,0.0,0.5,0.13248090925235306]                  |0    |
|[6.0,0.0,457.0,0.0,0.0,1.0,0.0,0.5,0.1026058631921824]                   |0    |
|[2.0,0.0,128.0,7.0,0.0,1.0,0.0,0.14285714285714285,0.06504520137703268]  |0    |
|[6.0,1.0,274.0,3.0,0.0,1.0,0.0,0.3333333333333333,0.13646485199831074]   |0    |
|[4.0,0.0,307.0,

In [20]:
testOutput = rfModel.transform(testFinal)

In [21]:
testOutput.filter(testOutput['label'] == 1).count()

15952

In [22]:
test_num = testOutput.count()

In [45]:
testData.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- age_range: double (nullable = true)
 |-- gender: double (nullable = true)
 |-- total_log: integer (nullable = true)
 |-- click: integer (nullable = true)
 |-- add_chart: integer (nullable = true)
 |-- buy: integer (nullable = true)
 |-- favourite: integer (nullable = true)
 |-- store_buy_rate: double (nullable = true)
 |-- store_rate: double (nullable = true)
 |-- prob: string (nullable = true)



In [47]:
outcome = testData[['user_id', 'merchant_id']]

In [23]:
colums = testOutput.columns
print(colums)

['user_id', 'merchant_id', 'age_range', 'gender', 'total_log', 'click', 'add_chart', 'buy', 'favourite', 'store_buy_rate', 'store_rate', 'label', 'features', 'rawPrediction', 'probability', 'prediction']


In [29]:
t = testOutput.select(['user_id','merchant_id','label'])

In [30]:
t.repartition(1).write.format("com.databricks.spark.csv") \
.options(header='true',inferschema='true').save('/Users/chenyuanshan/temp/data/RamdonForestOut.csv')

In [31]:
tread = spark.read.csv('/Users/chenyuanshan/temp/data/RamdonForestOut.csv', inferSchema=True,header=True)

In [32]:
tread.take(5)

[Row(user_id=464, merchant_id=4718, label=0),
 Row(user_id=867, merchant_id=3152, label=0),
 Row(user_id=1882, merchant_id=4377, label=0),
 Row(user_id=2450, merchant_id=2760, label=0),
 Row(user_id=2766, merchant_id=3885, label=0)]

In [38]:
pandas_tread = tread.toPandas()

In [40]:
pandas_tread.info

<bound method DataFrame.info of         user_id  merchant_id  label
0           464         4718      0
1           867         3152      0
2          1882         4377      0
3          2450         2760      0
4          2766         3885      0
...         ...          ...    ...
260859   421807         3057      0
260860   422078         3374      0
260861   422097         3609      0
260862   422648         4502      0
260863   423267         3578      0

[260864 rows x 3 columns]>

In [41]:
pandas_tread.to_csv('/Users/chenyuanshan/temp/data/data/output/RandomForestOutput.csv',index = False)