# Forum Question Analyzer

In [1]:
# Load the dataset from the extracted .xml files.
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that pulls in the spark-xml reader package.
spark = (
    SparkSession.builder
    .appName("Forum Question Analyzer")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)

# Read the posts dump, using "row" as the row tag, and alias the result
# as "posts".
posts = (
    spark.read
    .format("com.databricks.spark.xml")
    .options(rootTag="posts", rowTag="row")
    .load("tex.stackexchange.com/Posts.xml")
    .alias("posts")
)

In [2]:
# Preview the first five rows of the loaded posts.
posts.show(n=5)

+-----------------+------------+--------------------+--------------------+-------------+-------------------+---------------+--------------------+--------------+---+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+--------------------+--------------------+----------+
|_AcceptedAnswerId|_AnswerCount|               _Body|         _ClosedDate|_CommentCount|_CommunityOwnedDate|_ContentLicense|       _CreationDate|_FavoriteCount|_Id|   _LastActivityDate|       _LastEditDate|_LastEditorDisplayName|_LastEditorUserId|_OwnerDisplayName|_OwnerUserId|_ParentId|_PostTypeId|_Score|               _Tags|              _Title|_ViewCount|
+-----------------+------------+--------------------+--------------------+-------------+-------------------+---------------+--------------------+--------------+---+--------------------+--------------------+----------------------+-----------------+-----------------+------------+---------+-----------+------+--------------------+--------------------+----------+

In [3]:
# Descriptive statistics

# Compute every count in one aggregation so the XML source is scanned a
# single time, instead of launching one full scan per printed statistic.
from pyspark.sql import functions as F

post_counts = posts.agg(
    F.count(F.lit(1)).alias("rows"),
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so each of these counts only the matching posts.
    F.count(F.when(F.col("_PostTypeId") == 1, 1)).alias("questions"),
    F.count(F.when(F.col("_PostTypeId") == 2, 1)).alias("answers"),
    # Counts posts whose accepted-answer id is set (non-null).
    F.count(F.col("_AcceptedAnswerId")).alias("accepted"),
).first()

print("Number of rows: ", post_counts["rows"])
print("Number of questions: ", post_counts["questions"])
print("Number of answers: ", post_counts["answers"])
print("Number of accepted answers: ", post_counts["accepted"])

Number of rows:  584821
Number of questions:  255804
Number of answers:  326115
Number of accepted answers:  153808


## Feature Extraction

In [4]:
from pyspark.ml.feature import VectorAssembler
# Post attributes fed to the model as input columns.
features = [
    "_Score",
    "_ViewCount",
    "_AnswerCount",
    "_CommentCount",
]
# Combines the columns listed above into one vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=features)

## Data Preparation

In [5]:
from pyspark.sql.functions import when, col

# Keep only question posts (post type 1) and attach a binary label:
# 1 when the question has an accepted-answer id, 0 when it does not.
has_accepted_answer = col("_AcceptedAnswerId").isNotNull()
questions = (
    posts
    .where(posts["_PostTypeId"] == 1)
    .withColumn("accepted", when(has_accepted_answer, 1).otherwise(0))
)

## Model Training

In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

# Logistic regression reading the assembled "features" vector and the
# "accepted" label column.
lr = LogisticRegression(featuresCol="features", labelCol="accepted")

# Two-stage pipeline: assemble the feature vector, then fit the classifier.
pipeline = Pipeline(stages=[assembler, lr])

# 70/30 train/test split with a fixed seed.
train, test = questions.randomSplit(weights=[0.7, 0.3], seed=12345)

In [7]:
# Fit the assembler + logistic-regression pipeline on the training split.
model = pipeline.fit(dataset=train)

In [8]:
# Apply the fitted pipeline to the held-out test split.
predictions = model.transform(dataset=test)

## Model Evaluation

In [9]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the predictions against the "accepted" label on the test split.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy", labelCol="accepted")
test_accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {test_accuracy}")

Test Accuracy: 0.742379177327809
