# Open the file that recorded clean wiki pages

In [40]:
from pyspark.sql import Row
from pyspark.sql import SparkSession
import csv
spark = SparkSession.builder.getOrCreate()
f = open("wiki_result.csv")
reader = csv.reader(f,delimiter='|')
ww = []
for w in reader:
    ww.append(w)
ww= map(lambda p: Row(label=int(p[0]), text=str(p[1])),ww)
wikies = spark.createDataFrame(ww)
wikies.show()
f.close

+-----+--------------------+
|label|                text|
+-----+--------------------+
|    0|asia ( ) earth 's...|
|    0|unilev ( ) dutch-...|
|    0|eurasia combin co...|
|    0|eric hoffer ( jul...|
|    0|shaman ( shah-men...|
|    0|the asian giant h...|
|    0|list asian pornog...|
|    0|georgia ( ; georg...|
|    0|calligraphi ( gre...|
|    0|hornet ( insect g...|
|    0|asia argento ( it...|
|    0|time american wee...|
|    0|the pacif war , s...|
|    0|the boundari cont...|
|    0|the demograph rus...|
|    0|thi list common s...|
|    0|A humid subtrop c...|
|    0|aed albopictu ( s...|
|    0|buddhism ( ) reli...|
|    0|thi list list wor...|
+-----+--------------------+
only showing top 20 rows



<function close>

# Do tokenize

In [41]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
tokenizer = Tokenizer(inputCol="text", outputCol="words")
wordsData = tokenizer.transform(wikies)
wordsData.show()


+-----+--------------------+--------------------+
|label|                text|               words|
+-----+--------------------+--------------------+
|    0|asia ( ) earth 's...|[asia, (, ), eart...|
|    0|unilev ( ) dutch-...|[unilev, (, ), du...|
|    0|eurasia combin co...|[eurasia, combin,...|
|    0|eric hoffer ( jul...|[eric, hoffer, (,...|
|    0|shaman ( shah-men...|[shaman, (, shah-...|
|    0|the asian giant h...|[the, asian, gian...|
|    0|list asian pornog...|[list, asian, por...|
|    0|georgia ( ; georg...|[georgia, (, ;, g...|
|    0|calligraphi ( gre...|[calligraphi, (, ...|
|    0|hornet ( insect g...|[hornet, (, insec...|
|    0|asia argento ( it...|[asia, argento, (...|
|    0|time american wee...|[time, american, ...|
|    0|the pacif war , s...|[the, pacif, war,...|
|    0|the boundari cont...|[the, boundari, c...|
|    0|the demograph rus...|[the, demograph, ...|
|    0|thi list common s...|[thi, list, commo...|
|    0|A humid subtrop c...|[a, humid, subtro...|


# Do tokenize

In [42]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures",numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show()

+-----+--------------------+--------------------+--------------------+
|label|                text|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|    0|asia ( ) earth 's...|[asia, (, ), eart...|(300,[0,1,2,3,4,5...|
|    0|unilev ( ) dutch-...|[unilev, (, ), du...|(300,[0,1,2,3,4,6...|
|    0|eurasia combin co...|[eurasia, combin,...|(300,[0,1,2,3,4,5...|
|    0|eric hoffer ( jul...|[eric, hoffer, (,...|(300,[0,1,2,3,4,5...|
|    0|shaman ( shah-men...|[shaman, (, shah-...|(300,[0,1,2,3,4,5...|
|    0|the asian giant h...|[the, asian, gian...|(300,[0,1,2,3,4,5...|
|    0|list asian pornog...|[list, asian, por...|(300,[1,4,8,10,12...|
|    0|georgia ( ; georg...|[georgia, (, ;, g...|(300,[0,1,2,3,4,5...|
|    0|calligraphi ( gre...|[calligraphi, (, ...|(300,[0,1,2,3,4,5...|
|    0|hornet ( insect g...|[hornet, (, insec...|(300,[0,1,3,4,5,6...|
|    0|asia argento ( it...|[asia, argento, (...|(300,[0,1,2,3,4,5...|
|    0

In [43]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

rescaledData.select("text", "features").show()
rescaledData = rescaledData.select("label","features")

+--------------------+--------------------+
|                text|            features|
+--------------------+--------------------+
|asia ( ) earth 's...|(300,[0,1,2,3,4,5...|
|unilev ( ) dutch-...|(300,[0,1,2,3,4,6...|
|eurasia combin co...|(300,[0,1,2,3,4,5...|
|eric hoffer ( jul...|(300,[0,1,2,3,4,5...|
|shaman ( shah-men...|(300,[0,1,2,3,4,5...|
|the asian giant h...|(300,[0,1,2,3,4,5...|
|list asian pornog...|(300,[1,4,8,10,12...|
|georgia ( ; georg...|(300,[0,1,2,3,4,5...|
|calligraphi ( gre...|(300,[0,1,2,3,4,5...|
|hornet ( insect g...|(300,[0,1,3,4,5,6...|
|asia argento ( it...|(300,[0,1,2,3,4,5...|
|time american wee...|(300,[0,1,2,3,4,5...|
|the pacif war , s...|(300,[0,1,2,3,4,5...|
|the boundari cont...|(300,[0,1,2,3,4,5...|
|the demograph rus...|(300,[0,1,2,3,4,5...|
|thi list common s...|(300,[1,4,5,7,9,1...|
|A humid subtrop c...|(300,[0,1,2,3,4,5...|
|aed albopictu ( s...|(300,[0,1,2,3,4,5...|
|buddhism ( ) reli...|(300,[0,1,2,3,4,5...|
|thi list list wor...|(300,[1,2,

In [44]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(rescaledData)


wssse = model.computeCost(rescaledData)
print("Within Set Sum of Squared Errors = " + str(wssse))


centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Within Set Sum of Squared Errors = 53447.3276304
Cluster Centers: 
[  3.64005606   0.           6.9126512    2.73938205   3.9037373
   2.33138898   7.42197374   4.86614262   7.00360714   3.52862577   0.
   3.47475518   5.25497411   2.3896737    2.76852441   0.71731246
   4.4073333    1.77148904   4.51706615   5.45735621   6.33847393   0.
   6.18741141   2.22842045   2.76852441   2.78838379   2.8098329
   2.76852441   2.42906735   7.18551901   6.67360095   1.72702476
   4.41136294   5.95761387   6.32143761   4.320407     1.47998832
   3.52862577   3.04577306   4.95709856   3.20289532   6.69696294
   3.60291263   1.39883339   3.1250631    4.17147429   1.76914732
   3.32222929   3.06722217   0.58626499   6.32674656   5.42294066
   4.95709856   1.93145832   4.42006807   2.89562932   0.           3.49406762
   7.41290886   5.77570199   3.95477433   3.14214947   2.65968915
   4.07533015   5.25497411   0.           4.7132242    2.71023969
   7.55923641   3.00861776   1.63013206   2.25215614  

# Summary
At first I took 10 pages for each category. The Error is around 2000.
When I finally used 30 pages for each category. The Error increased to 50000.
So the model is not effective to cluster.

# next we perform the classification to the wikipages. Question3-wiki:

In [45]:
splits = rescaledData.select("label", "features").randomSplit([0.8, 0.2], 1234)
train = splits[1]
test = splits[0]

In [46]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

nb = NaiveBayes()
model = nb.fit(train)
predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.382978723404


In [47]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

dt = DecisionTreeClassifier()

model = dt.fit(train)

predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.382978723404


In [48]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier()

model = rf.fit(train)

predictions = model.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol="label",predictionCol="prediction",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy of RandomForest= " + str(accuracy))

Test set accuracy of RandomForest= 0.382978723404
