In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('MachineLearning') \
    .master('local[*]') \
    .getOrCreate()

In [2]:
input_path = '/home/jovyan/work/data/'
users_path = f'{input_path}cleaned_data_users/'

In [3]:
df = spark.read.csv(users_path, multiLine=True, header=True, inferSchema=True)

In [5]:
print(f'Dataframe rows number: {df.count()}')

Users-rows: 147577


In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [18]:
cols = ['_AccountId', 'forum', 'ActivityDays', 
        'comments_number', 'edits_of_answers', 'edits_of_question', 'created_questions', 
        'comments_every_day', 'activity_answer_edits', 'activity_question_edits', 'activity_questions']

In [19]:
df = df[cols]

In [20]:
df.printSchema()

root
 |-- _AccountId: integer (nullable = true)
 |-- forum: string (nullable = true)
 |-- ActivityDays: double (nullable = true)
 |-- comments_number: double (nullable = true)
 |-- edits_of_answers: double (nullable = true)
 |-- edits_of_question: double (nullable = true)
 |-- created_questions: double (nullable = true)
 |-- comments_every_day: double (nullable = true)
 |-- activity_answer_edits: double (nullable = true)
 |-- activity_question_edits: double (nullable = true)
 |-- activity_questions: double (nullable = true)



'OneHotEncoder'

In [22]:
str_cols = ['forum']
encoder = OneHotEncoder(inputCol='forum', outputCol='forum_features')

In [78]:
# unique list of forums
forums_list = df.select('forum').distinct().rdd.map(lambda r: r[0]).collect()

In [109]:
forum_udf = udf(lambda x: 1 if x == forum else 0)

In [127]:
for forum in forums_list:
    forum_udf = udf(lambda x: 1 if x == forum else 0)
    df = df.withColumn(f'forum_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_days_{forum}', forum_udf('forum'))
    df = df.withColumn(f'comments_number_{forum}', forum_udf('forum'))
    df = df.withColumn(f'answer_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'question_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'created_questions_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_comments_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_answer_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_question_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_questions_{forum}', forum_udf('forum'))

In [128]:
exprs = {x: 'sum' for x in df.columns if x not in cols}

In [129]:
ddf=df.groupBy('_AccountId').agg(exprs)

In [132]:
# number of columns
len(ddf.columns)

71

Features

In [133]:
from pyspark.ml.feature import VectorAssembler

In [137]:
features = [col for col in ddf.columns if col != '_AccountId']

In [138]:
vector_assembler = VectorAssembler(inputCols = features, outputCol= "features_output")
data_training_and_test = vector_assembler.transform(ddf)

Normalization

In [256]:
from pyspark.ml.feature import Normalizer

In [257]:
normalizer = Normalizer(p=1, inputCol="features_output", outputCol="features_norm")

In [264]:
data_training_and_test = normalizer.transform(data_training_and_test)

Dimension Reduction/feature selection

In [139]:
from pyspark.ml.feature import PCA

In [266]:
# k - extract k features from all
pca_model = PCA(k = 30, inputCol = "features_norm", outputCol = "pca_features_norm")
model = pca_model.fit(data_training_and_test)
data_training_and_test = model.transform(data_training_and_test)

Modeling fitting and tuning

In [160]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import ClusteringEvaluator

In [267]:
(training_data, test_data) = data_training_and_test.randomSplit([0.7, 0.3])

In [272]:
kmeans_model = KMeans(k=10, seed=1, featuresCol='features_norm')
kmeans_model_pca = KMeans(k=10, seed=1, featuresCol='pca_features_norm')

In [273]:
kmeans_model = kmeans_model.fit(training_data)

In [274]:
kmeans_model_pca = kmeans_model_pca.fit(training_data)

In [166]:
centers = model.clusterCenters()
len(centers)

10

In [275]:
predictions = kmeans_model.transform(test_data)

In [276]:
predictions_pca = kmeans_model_pca.transform(test_data)

Evaluation

Note : Experimental

Evaluator for Clustering results, which expects two input columns: prediction and features. The metric computes the Silhouette measure using the squared Euclidean distance.

The Silhouette is a measure for the validation of the consistency within clusters. It ranges between 1 and -1, where a value close to 1 means that the points in a cluster are close to the other points in the same cluster and far from the points of the other clusters.

In [277]:
evaluator = ClusteringEvaluator(featuresCol='features_norm', predictionCol='prediction')
evaluator_pca = ClusteringEvaluator(featuresCol='pca_features_norm', predictionCol='prediction')

#evaluator.evaluate(predictions)

In [278]:
evaluator.evaluate(predictions)

0.9450269696896779

In [279]:
evaluator_pca.evaluate(predictions_pca)

0.9450750714678824

Grid Search - Finding the best parameters

In [184]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [318]:
very_small_sample = data_training_and_test.sample(False, 0.001).cache()

In [319]:
very_small_sample.count()

134

In [315]:
pca_model = PCA(inputCol = "features_norm", outputCol = "pca_features_cv_norm")
km = KMeans(seed=1, featuresCol="pca_features_cv_norm")
ppl_cv = Pipeline(stages = [pca_model, km])

In [316]:
ddf_length = len(ddf.columns)
from builtins import round as base_round
pca_nums = [base_round(i*ddf_length, 0) for i in [0.3, 0.5, 0.75]]
pca_nums

[21.0, 36.0, 53.0]

In [320]:
paramGrid = ParamGridBuilder() \
      .addGrid(pca_model.k, pca_nums) \
      .addGrid(km.k, [5, 7, 8, 10, 12]) \
      .build()

In [321]:
crossval = CrossValidator(estimator = ppl_cv, 
                          estimatorParamMaps=paramGrid,
                          evaluator = ClusteringEvaluator(featuresCol='pca_features_cv_norm', predictionCol='prediction'),
                          numFolds=2)

In [322]:
cv_model = crossval.fit(very_small_sample)

In [323]:
cv_predictions = cv_model.transform(test_data)
evaluator = ClusteringEvaluator(featuresCol='pca_features_cv_norm', predictionCol='prediction')
accuracy = evaluator.evaluate(cv_predictions)

In [324]:
accuracy

0.9166846106685441

In [325]:
cv_model.avgMetrics

[0.7944176290091253,
 0.902774170349173,
 0.9401177106081882,
 0.9401177106081882,
 0.9401177106081882,
 0.7910323495675389,
 0.8992111570366693,
 0.9365546972956844,
 0.9365546972956844,
 0.9365546972956844,
 0.7748250603531623,
 0.8821802978082681,
 0.9195238380672832,
 0.9195238380672832,
 0.9250092191617185]

In [326]:
best_model = cv_model.bestModel

In [327]:
best_cv_predictions = best_model.transform(test_data)
evaluator = ClusteringEvaluator(featuresCol='pca_features_cv_norm', predictionCol='prediction')
accuracy = evaluator.evaluate(best_cv_predictions)
accuracy

0.9166846106685441

In [337]:
km.getK()

2