In [1]:
%%bash
apt-get install openjdk-8-jdk-headless -qq > /dev/null

wget -q https://downloads.apache.org/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
tar xf spark-2.4.8-bin-hadoop2.7.tgz

pip install findspark

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/fc/2d/2e39f9a023479ea798eed4351cd66f163ce61e00c717e03c37109f00c0f2/findspark-1.4.2-py2.py3-none-any.whl
Installing collected packages: findspark
Successfully installed findspark-1.4.2


In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.8-bin-hadoop2.7"

In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sparkContext=spark.sparkContext

In [4]:
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


Считывание датасета

In [6]:
df = spark.read.csv('./drive/My Drive/data/website_classification.csv',inferSchema=True,sep=",", header=True,)
df.show(15)
df.printSchema()

+---+--------------------+--------------------+--------+
|_c0|         website_url|cleaned_website_text|Category|
+---+--------------------+--------------------+--------+
|  0|https://www.booki...|official site goo...|  Travel|
|  1|https://travelsit...|expedia hotel boo...|  Travel|
|  2|https://travelsit...|tripadvisor hotel...|  Travel|
|  3|https://www.momon...|cheap flights sea...|  Travel|
|  4|https://www.ebook...|bot create free a...|  Travel|
|  5|https://book.pric...|hotel reservation...|  Travel|
|  6|https://www.trip....|official travel d...|  Travel|
|  7|https://www.orbit...|bot create free a...|  Travel|
|  8|https://www.trave...|bot create free a...|  Travel|
|  9|https://www.hotwi...|cheap hotels cars...|  Travel|
| 10|https://www.otel....|hotel cheap hotel...|  Travel|
| 11|https://www.ebook...|bot create free a...|  Travel|
| 12|https://www.airbn...|holiday lets home...|  Travel|
| 13|https://in.lastmi...|book cheap flight...|  Travel|
| 14|https://www.onthe...|well 

In [7]:
df.select('Category').distinct().collect()


[Row(Category='Law and Government'),
 Row(Category='Education'),
 Row(Category='Health and Fitness'),
 Row(Category='Food'),
 Row(Category='Computers and Technology'),
 Row(Category='Social Networking and Messaging'),
 Row(Category='Sports'),
 Row(Category='Business/Corporate'),
 Row(Category='Travel'),
 Row(Category='Forums'),
 Row(Category='Adult'),
 Row(Category='Games'),
 Row(Category='Streaming Services'),
 Row(Category='Photography'),
 Row(Category='E-Commerce'),
 Row(Category='News')]

In [8]:
df.select("cleaned_website_text").show(5,False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
from pyspark.sql.functions import col
df.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()


+--------------------+-----+
|            Category|count|
+--------------------+-----+
|           Education|  114|
|  Business/Corporate|  109|
|              Travel|  107|
|  Streaming Services|  105|
|              Sports|  104|
|          E-Commerce|  102|
|               Games|   98|
|  Health and Fitness|   96|
|                News|   96|
|         Photography|   93|
|Computers and Tec...|   93|
|                Food|   92|
|  Law and Government|   84|
|Social Networking...|   83|
|              Forums|   16|
|               Adult|   16|
+--------------------+-----+



Преобразование содержимого веб-сайтов и категорий в удобное представление

In [10]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

# разбитие строк по элементам массива 
regexTokenizer = RegexTokenizer(inputCol="cleaned_website_text", outputCol="words", pattern="\\W")

# фильтрация слов
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# создание векторов
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=100000, minDF=5)

# преобразование категорий в индексы
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

Созданиe pipeline модели

In [11]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])

# Заполнение pipeline
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+---+--------------------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|_c0|         website_url|cleaned_website_text|Category|               words|            filtered|            features|label|
+---+--------------------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|  0|https://www.booki...|official site goo...|  Travel|[official, site, ...|[official, site, ...|(11349,[1,2,3,4,6...|  2.0|
|  1|https://travelsit...|expedia hotel boo...|  Travel|[expedia, hotel, ...|[expedia, hotel, ...|(11349,[1,2,4,6,7...|  2.0|
|  2|https://travelsit...|tripadvisor hotel...|  Travel|[tripadvisor, hot...|[tripadvisor, hot...|(11349,[2,4,6,9,1...|  2.0|
|  3|https://www.momon...|cheap flights sea...|  Travel|[cheap, flights, ...|[cheap, flights, ...|(11349,[1,2,6,7,9...|  2.0|
|  4|https://www.ebook...|bot create free a...|  Travel|[bot, create, fre...|[bot, create, fre...|(11349,[1,3,7,9,1...

In [12]:
dataset.select("Category",'label').distinct().orderBy("label").show()

+--------------------+-----+
|            Category|label|
+--------------------+-----+
|           Education|  0.0|
|  Business/Corporate|  1.0|
|              Travel|  2.0|
|  Streaming Services|  3.0|
|              Sports|  4.0|
|          E-Commerce|  5.0|
|               Games|  6.0|
|  Health and Fitness|  7.0|
|                News|  8.0|
|         Photography|  9.0|
|Computers and Tec...| 10.0|
|                Food| 11.0|
|  Law and Government| 12.0|
|Social Networking...| 13.0|
|               Adult| 14.0|
|              Forums| 15.0|
+--------------------+-----+





Создание обучающей и тестовой выборок

In [13]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 988
Test Dataset Count: 420


Логистическая регрессия

In [14]:
lr = LogisticRegression(maxIter=40, regParam=0.5, elasticNetParam=0)
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)

predictions \
    .select("cleaned_website_text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)

+------------------------------+------------------------------+------------------------------+-----+----------+
|          cleaned_website_text|                      Category|                   probability|label|prediction|
+------------------------------+------------------------------+------------------------------+-----+----------+
|geochemistry wikipedia geoc...|                     Education|[0.9999999997548839,1.87020...|  0.0|       0.0|
|group theory wikipedia grou...|                     Education|[0.9999999977762926,5.70453...|  0.0|       0.0|
|archaeology archaeologist u...|                     Education|[0.9996817616437905,6.91029...|  0.0|       0.0|
|hplc course course chromato...|                     Education|[0.9988899713736181,9.83732...|  0.0|       0.0|
|editorial board resources c...|                     Education|[0.99316112585672,2.6840184...|  0.0|       0.0|
|nottingham trent internatio...|                     Education|[0.802513982309609,0.007500...|  0.0|    

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.783399398479512

In [20]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

y_true = predictions.select(['label']).collect()
y_pred = predictions.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))


              precision    recall  f1-score   support

         0.0       0.85      0.81      0.83        27
         1.0       0.64      0.72      0.68        32
         2.0       0.94      0.86      0.90        35
         3.0       0.40      1.00      0.57        29
         4.0       0.90      0.85      0.88        33
         5.0       0.95      0.78      0.86        23
         6.0       0.70      0.81      0.75        26
         7.0       0.85      0.69      0.76        32
         8.0       1.00      0.58      0.73        26
         9.0       0.87      0.79      0.83        34
        10.0       0.75      0.64      0.69        28
        11.0       0.96      0.81      0.88        31
        12.0       0.97      0.82      0.89        34
        13.0       0.86      0.75      0.80        24
        14.0       1.00      0.50      0.67         2
        15.0       0.00      0.00      0.00         4

    accuracy                           0.77       420
   macro avg       0.79   

Логистическая регрессия с использованием перекрёстной проверки

In [21]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression

regexTokenizer = RegexTokenizer(inputCol="cleaned_website_text", outputCol="words", pattern="\\W")
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=100000, minDF=5)
label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)

(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 15)

lr = LogisticRegression(maxIter=40, regParam=0.5, elasticNetParam=0)

In [22]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

# создание ParamGrid для Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # параметр регуляризации
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # параметры эластичной сети
             .addGrid(lr.maxIter, [10, 20, 50]) # число итераций
             .build())

# создание CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)

cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)

# оценка лучшей модели
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.8126875171324125

In [27]:
predictions.filter(predictions['label'] == 15) \
    .select("cleaned_website_text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 20)

+--------------------+--------+--------------------+-----+----------+
|cleaned_website_text|Category|         probability|label|prediction|
+--------------------+--------+--------------------+-----+----------+
|forums craigslist...|  Forums|[0.09995177066276...| 15.0|       3.0|
|wattpad story liv...|  Forums|[0.07848988972895...| 15.0|       1.0|
|eevblog script fe...|  Forums|[0.07246681784781...| 15.0|       1.0|
+--------------------+--------+--------------------+-----+----------+



In [24]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

y_true = predictions.select(['label']).collect()
y_pred = predictions.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.74      0.88      0.81        26
         1.0       0.54      0.79      0.64        28
         2.0       0.89      0.92      0.90        36
         3.0       0.61      0.90      0.73        30
         4.0       0.97      0.78      0.86        36
         5.0       0.93      0.81      0.87        32
         6.0       0.88      0.85      0.87        27
         7.0       0.93      0.70      0.80        37
         8.0       0.93      0.82      0.87        34
         9.0       0.76      0.73      0.74        22
        10.0       0.68      0.65      0.67        26
        11.0       0.88      0.88      0.88        26
        12.0       0.95      1.00      0.98        21
        13.0       0.87      0.83      0.85        24
        14.0       1.00      0.50      0.67         6
        15.0       0.00      0.00      0.00         3

    accuracy                           0.81       414
   macro avg       0.79   

  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes

In [28]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions \
    .select("cleaned_website_text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)


+------------------------------+---------+------------------------------+-----+----------+
|          cleaned_website_text| Category|                   probability|label|prediction|
+------------------------------+---------+------------------------------+-----+----------+
|real college life student j...|Education|[1.0,3.5387528593204926E-39...|  0.0|       0.0|
|radio astronomy seti big ea...|Education|[1.0,2.7026103797082037E-56...|  0.0|       0.0|
|error file find biosphere e...|Education|[1.0,1.1580753740416661E-64...|  0.0|       0.0|
|nitric oxide journal elsevi...|Education|[1.0,2.380188218277759E-88,...|  0.0|       0.0|
|faculty native studies sear...|Education|[1.0,9.589851117688293E-89,...|  0.0|       0.0|
|applied probability trust s...|Education|[1.0,1.7253432546248965E-97...|  0.0|       0.0|
|pharmaceutical chemistry jo...|Education|[1.0,5.397987772579099E-145...|  0.0|       0.0|
|mathematical medicine biolo...|Education|[1.0,2.7406313071225587E-15...|  0.0|       0.0|

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.9286528110864746

In [30]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

y_true = predictions.select(['label']).collect()
y_pred = predictions.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.88      0.92        26
         1.0       0.85      0.82      0.84        28
         2.0       1.00      0.97      0.99        36
         3.0       0.91      1.00      0.95        30
         4.0       1.00      0.97      0.99        36
         5.0       0.86      0.97      0.91        32
         6.0       0.96      0.96      0.96        27
         7.0       0.97      0.95      0.96        37
         8.0       0.89      0.91      0.90        34
         9.0       0.90      0.86      0.88        22
        10.0       0.88      0.88      0.88        26
        11.0       0.93      1.00      0.96        26
        12.0       0.95      1.00      0.98        21
        13.0       0.91      0.83      0.87        24
        14.0       1.00      1.00      1.00         6
        15.0       1.00      0.33      0.50         3

    accuracy                           0.93       414
   macro avg       0.94   

Random Forest


In [31]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions \
    .select("cleaned_website_text","Category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 20, truncate = 30)


+------------------------------+------------------------+------------------------------+-----+----------+
|          cleaned_website_text|                Category|                   probability|label|prediction|
+------------------------------+------------------------+------------------------------+-----+----------+
|geochemistry wikipedia geoc...|               Education|[0.20454965030093852,0.0818...|  0.0|       0.0|
|interactive science teacher...|               Education|[0.17468972447354342,0.0811...|  0.0|       0.0|
|applied probability trust s...|               Education|[0.1490531947770424,0.09005...|  0.0|       0.0|
|international chemistry oly...|               Education|[0.14735494445837513,0.0856...|  0.0|       0.0|
|pharmaceutical chemistry jo...|               Education|[0.1460555048448639,0.08815...|  0.0|       0.0|
|lambda ultimate programming...|Computers and Technology|[0.1434433430528127,0.07292...| 10.0|       0.0|
|fast fourier transform wolf...|              

In [33]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.648791749243923

In [34]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

y_true = predictions.select(['label']).collect()
y_pred = predictions.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.34      0.92      0.49        26
         1.0       0.19      0.46      0.27        28
         2.0       0.89      0.94      0.92        36
         3.0       0.75      0.70      0.72        30
         4.0       0.83      0.97      0.90        36
         5.0       0.69      0.69      0.69        32
         6.0       0.92      0.44      0.60        27
         7.0       1.00      0.46      0.63        37
         8.0       0.86      0.88      0.87        34
         9.0       0.84      0.73      0.78        22
        10.0       0.00      0.00      0.00        26
        11.0       0.80      0.77      0.78        26
        12.0       0.93      0.67      0.78        21
        13.0       1.00      0.50      0.67        24
        14.0       0.00      0.00      0.00         6
        15.0       0.00      0.00      0.00         3

    accuracy                           0.65       414
   macro avg       0.63   

  _warn_prf(average, modifier, msg_start, len(result))
