# Decesion Tree

## Set up

In [1]:
# import findspark
# findspark.init()

# import glob # to get file paths
# import random
import numpy as np # for preprocess
# import itertools # to generate pairs from list

import pyspark
from pyspark import SparkConf, SparkContext

In [3]:
import os
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jdk1.8.0_201'

In [4]:
conf = SparkConf().setMaster("local").setAppName("DececisionTree").set("spark.default.parallelism", 4)
sc = SparkContext(conf=conf)
sc

In [48]:
# Parameter
category_Numbers = 14 # 一共14個categories類別
spilt_rate = [8,1,1] # 用80：10：10的比例分割資料成 訓練/驗證/測試 資料集 

## Input

In [6]:
Input = sc.textFile("./data/train.tsv")
Input.count()

7396

In [7]:
Input.first()

'"url"\t"urlid"\t"boilerplate"\t"alchemy_category"\t"alchemy_category_score"\t"avglinksize"\t"commonlinkratio_1"\t"commonlinkratio_2"\t"commonlinkratio_3"\t"commonlinkratio_4"\t"compression_ratio"\t"embed_ratio"\t"framebased"\t"frameTagRatio"\t"hasDomainLink"\t"html_ratio"\t"image_ratio"\t"is_news"\t"lengthyLinkDomain"\t"linkwordscore"\t"news_front_page"\t"non_markup_alphanum_characters"\t"numberOfLinks"\t"numwords_in_url"\t"parametrizedLinkRatio"\t"spelling_errors_ratio"\t"label"'

## Preprocess

### 資料清洗

#### 清洗標題

In [8]:
title = Input.first()
Data = Input.filter(lambda x : x!= title)

In [11]:
Data.first()

'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"\t"4042"\t"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest 

#### 分割資料
+ 原始資料是以`\t`分割,並由`"`包覆

In [12]:
lines = Data.map(lambda x : x.replace("\"","")).map(lambda x : x.split("\t"))
lines.first()[3:]

['business',
 '0.789131',
 '2.055555556',
 '0.676470588',
 '0.205882353',
 '0.047058824',
 '0.023529412',
 '0.443783175',
 '0',
 '0',
 '0.09077381',
 '0',
 '0.245831182',
 '0.003883495',
 '1',
 '1',
 '24',
 '0',
 '5424',
 '170',
 '8',
 '0.152941176',
 '0.079129575',
 '0']

### 提取特徵

#### 建立one-hot encode table

In [13]:
category_with_index = lines.map(lambda x: x[3]).distinct().zipWithIndex()

In [14]:
category_Numbers_list = list(range(category_Numbers))
category_Numbers_array = np.array(category_Numbers_list).reshape(category_Numbers, -1)
category_Numbers_array

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13]])

In [15]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(category_Numbers_array)
encoder_table = enc.transform(category_Numbers_array).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [16]:
category_Map = category_with_index.map(lambda x : (x[0],encoder_table[x[1]])).collectAsMap()
category_Map

{'business': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'arts_entertainment': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'computer_internet': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'law_crime': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'weather': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'unknown': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 'science_technology': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]),
 'sports': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]),
 '?': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]),
 'gaming': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]),
 'culture_politics': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
 'religion': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 'recreation': array([0., 0., 0., 0., 0

In [95]:
from pyspark.mllib.regression import LabeledPoint
def extract_features(row):
    category_features = category_Map[row[3]]
    number_features = row[4:-2]
    number_features = [0.0 if x=="?" else float(x) for x in number_features]
    
    features = np.concatenate((category_features,number_features))
    label = float(row[-1])
    
    return (label,features)

In [98]:
labelRDD = lines.map(extract_features).map(lambda x: LabeledPoint(x[0],x[1]))
labelRDD.first()

LabeledPoint(0.0, [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.789131,2.055555556,0.676470588,0.205882353,0.047058824,0.023529412,0.443783175,0.0,0.0,0.09077381,0.0,0.245831182,0.003883495,1.0,1.0,24.0,0.0,5424.0,170.0,8.0,0.152941176])

### 切分資料

In [99]:
(trainRDD,validRDD,testRDD) = labelRDD.randomSplit(spilt_rate)
print("train: " + str(trainRDD.count()))
print("valid: " + str(validRDD.count()))
print("test:  " + str(testRDD.count()))

train: 5932
valid: 729
test:  734


## Train Model

In [100]:
from pyspark.mllib.tree import DecisionTree
model = DecisionTree.trainClassifier(
    data=trainRDD,numClasses=2,categoricalFeaturesInfo={},
    impurity="entropy", maxDepth=5, maxBins=5)

In [115]:
model.predict(tmp.features)

1.0

In [125]:
rate = 0
N = 500
for test_data in testRDD.take(N):
    ans = test_data.label
    gus = model.predict(test_data.features)
    if ans==gus:
        print("Y")
        rate += 1
    else :
        print("N")
rate = rate/N
print(rate)

Y
N
Y
N
Y
N
Y
N
N
N
Y
N
N
Y
Y
Y
Y
N
N
N
N
Y
N
Y
Y
Y
N
N
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
N
Y
N
N
Y
N
N
Y
N
Y
N
Y
Y
Y
N
Y
Y
Y
Y
N
Y
Y
N
Y
N
Y
Y
Y
N
Y
Y
Y
Y
N
Y
Y
Y
Y
Y
N
Y
Y
Y
N
N
N
N
Y
Y
Y
Y
Y
Y
Y
Y
Y
N
Y
N
Y
N
Y
Y
Y
Y
Y
Y
N
Y
Y
N
N
N
N
Y
Y
N
Y
Y
Y
Y
N
Y
Y
Y
N
Y
N
Y
Y
N
Y
N
Y
Y
N
Y
Y
Y
N
N
Y
N
Y
Y
Y
Y
Y
Y
Y
Y
N
Y
N
Y
Y
Y
Y
Y
Y
Y
Y
Y
N
N
N
Y
N
Y
Y
Y
Y
N
Y
Y
N
Y
Y
N
Y
Y
N
Y
N
N
Y
Y
N
N
N
Y
N
Y
N
N
Y
Y
N
N
Y
Y
Y
Y
Y
N
Y
N
N
Y
Y
N
N
N
Y
Y
N
Y
N
Y
Y
N
N
N
Y
Y
Y
N
N
Y
N
Y
N
Y
Y
N
N
Y
N
N
Y
Y
Y
Y
N
N
Y
N
Y
N
Y
Y
Y
Y
N
Y
N
Y
N
N
Y
N
Y
Y
Y
N
Y
Y
Y
N
Y
Y
Y
Y
Y
N
N
Y
N
Y
Y
Y
Y
N
N
Y
N
Y
Y
Y
N
Y
N
Y
Y
Y
N
N
Y
Y
Y
Y
Y
N
N
Y
N
N
N
Y
Y
Y
Y
N
Y
N
N
N
Y
Y
Y
Y
N
Y
N
Y
Y
Y
N
Y
N
N
Y
N
N
N
N
Y
Y
N
N
Y
Y
Y
Y
Y
N
Y
Y
Y
Y
Y
N
N
N
N
Y
Y
Y
Y
N
Y
N
N
N
N
Y
N
N
Y
Y
Y
N
Y
Y
N
Y
N
Y
Y
Y
N
Y
N
Y
Y
Y
N
Y
N
Y
Y
Y
N
Y
N
N
Y
Y
Y
Y
Y
N
Y
N
Y
N
Y
N
Y
N
N
Y
Y
Y
Y
Y
Y
Y
N
Y
Y
Y
N
Y
Y
N
Y
Y
Y
Y
Y
N
N
Y
Y
Y
Y
N
Y
Y
Y
N
Y
N
N
Y
Y
N
N
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
Y
N
Y
Y
Y
N
Y
Y
Y
Y
Y
Y
N
N
N
Y
N
Y
Y
N
Y
Y
Y
Y
N
Y
N
N
Y
Y
N
Y
N
Y
N
Y
Y
Y
N
