In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName('MachineLearning') \
    .master('local[*]') \
    .getOrCreate()

In [2]:
input_path = '/home/jovyan/work/data/'
users_path = f'{input_path}cleaned_data_users/'

In [3]:
df = spark.read.csv(users_path, multiLine=True, header=True, inferSchema=True)

In [5]:
print(f'Dataframe rows number: {df.count()}')

Users-rows: 147577


In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

In [18]:
cols = ['_AccountId', 'forum', 'ActivityDays', 
        'comments_number', 'edits_of_answers', 'edits_of_question', 'created_questions', 
        'comments_every_day', 'activity_answer_edits', 'activity_question_edits', 'activity_questions']

In [19]:
df = df[cols]

In [20]:
df.printSchema()

root
 |-- _AccountId: integer (nullable = true)
 |-- forum: string (nullable = true)
 |-- ActivityDays: double (nullable = true)
 |-- comments_number: double (nullable = true)
 |-- edits_of_answers: double (nullable = true)
 |-- edits_of_question: double (nullable = true)
 |-- created_questions: double (nullable = true)
 |-- comments_every_day: double (nullable = true)
 |-- activity_answer_edits: double (nullable = true)
 |-- activity_question_edits: double (nullable = true)
 |-- activity_questions: double (nullable = true)



'OneHotEncoder'

In [22]:
str_cols = ['forum']
encoder = OneHotEncoder(inputCol='forum', outputCol='forum_features')

In [78]:
# unique list of forums
forums_list = df.select('forum').distinct().rdd.map(lambda r: r[0]).collect()

In [109]:
forum_udf = udf(lambda x: 1 if x == forum else 0)

In [127]:
for forum in forums_list:
    forum_udf = udf(lambda x: 1 if x == forum else 0)
    df = df.withColumn(f'forum_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_days_{forum}', forum_udf('forum'))
    df = df.withColumn(f'comments_number_{forum}', forum_udf('forum'))
    df = df.withColumn(f'answer_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'question_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'created_questions_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_comments_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_answer_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_question_edits_{forum}', forum_udf('forum'))
    df = df.withColumn(f'activity_questions_{forum}', forum_udf('forum'))

In [128]:
exprs = {x: 'sum' for x in df.columns if x not in cols}

In [129]:
ddf=df.groupBy('_AccountId').agg(exprs)

In [132]:
# number of columns
len(ddf.columns)

71

Features

In [133]:
from pyspark.ml.feature import VectorAssembler

In [137]:
features = [col for col in ddf.columns if col != '_AccountId']

In [138]:
vector_assembler = VectorAssembler(inputCols = features, outputCol= "features_output")
data_training_and_test = vector_assembler.transform(ddf)

Dimension Reduction/feature selection

In [139]:
from pyspark.ml.feature import PCA

In [144]:
# k - extract k features from all
pca_model = PCA(k = 30, inputCol = "features_output", outputCol = "pca_features")
model = pca_model.fit(data_training_and_test)
data_training_and_test = model.transform(data_training_and_test)

Modeling fitting and tuning

In [160]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import *
from pyspark.ml.evaluation import ClusteringEvaluator

In [149]:
(training_data, test_data) = data_training_and_test.randomSplit([0.7, 0.3])

In [162]:
kmeans_model = KMeans(k=10, seed=1, featuresCol='features_output')

In [163]:
kmeans_model = kmeans.fit(training_data)

In [166]:
centers = model.clusterCenters()
len(centers)

10

In [165]:
predictions = kmeans_model.transform(test_data)

In [169]:
evaluator = ClusteringEvaluator(featuresCol='features_output', predictionCol='prediction')
#evaluator.evaluate(predictions)

In [171]:
evaluator.evaluate(predictions)

0.902137542953629

Evaluation

In [167]:
evaluator = ClusteringEvaluator()
evaluator.evaluate(predictions)

IllegalArgumentException: features does not exist. Available: _AccountId, sum(created_questions_windowsphone), sum(activity_question_edits_windowsphone), sum(activity_days_portuguese), sum(activity_questions_french), sum(answer_edits_windowsphone), sum(comments_number_spanish), sum(activity_questions_italian), sum(activity_comments_french), sum(created_questions_french), sum(activity_answer_edits_italian), sum(comments_number_french), sum(activity_answer_edits_spanish), sum(question_edits_italian), sum(activity_questions_mythology), sum(forum_mythology), sum(activity_days_french), sum(activity_days_italian), sum(activity_days_vegetarianism), sum(forum_french), sum(created_questions_vegetarianism), sum(created_questions_portuguese), sum(activity_answer_edits_mythology), sum(activity_question_edits_mythology), sum(activity_questions_windowsphone), sum(activity_comments_windowsphone), sum(activity_question_edits_italian), sum(activity_question_edits_spanish), sum(question_edits_spanish), sum(activity_comments_vegetarianism), sum(created_questions_italian), sum(question_edits_mythology), sum(comments_number_portuguese), sum(question_edits_portuguese), sum(activity_questions_portuguese), sum(activity_comments_mythology), sum(forum_spanish), sum(forum_vegetarianism), sum(forum_portuguese), sum(answer_edits_portuguese), sum(created_questions_mythology), sum(comments_number_windowsphone), sum(forum_italian), sum(question_edits_vegetarianism), sum(activity_question_edits_french), sum(activity_questions_vegetarianism), sum(activity_question_edits_portuguese), sum(activity_days_mythology), sum(answer_edits_vegetarianism), sum(question_edits_french), sum(answer_edits_italian), sum(activity_answer_edits_vegetarianism), sum(answer_edits_spanish), sum(comments_number_vegetarianism), sum(activity_questions_spanish), sum(activity_comments_portuguese), sum(activity_question_edits_vegetarianism), sum(forum_windowsphone), sum(comments_number_mythology), sum(activity_answer_edits_windowsphone), sum(activity_days_windowsphone), sum(answer_edits_french), sum(activity_comments_italian), sum(activity_days_spanish), sum(activity_answer_edits_portuguese), sum(activity_answer_edits_french), sum(created_questions_spanish), sum(comments_number_italian), sum(activity_comments_spanish), sum(answer_edits_mythology), sum(question_edits_windowsphone), features_output, pca_features, prediction