# Feature Engineering

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

spark = SparkSession.builder.appName("Forum Question Analyzer") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0") \
    .getOrCreate()
posts = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "posts") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Posts.xml") \
    .alias('posts')
users = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "users") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Users.xml") \
    .alias('users')
tags = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "tags") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Tags.xml") \
    .alias('tags')

In [2]:
data = posts.filter(posts._PostTypeId == 1)
data = data.withColumn("accepted", when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1))

## Feature importances

In [ ]:
features = ['_Score', '_ViewCount', '_AnswerCount', '_CommentCount']
assembler = VectorAssembler(inputCols=features, outputCol="features")

In [ ]:
rf = RandomForestClassifier(labelCol="accepted", featuresCol="features", numTrees=10)
rf_pipeline = Pipeline(stages=[assembler, rf])
rf_model = rf_pipeline.fit(data)
result = rf_model.stages[-1].featureImportances.toArray()

In [4]:
print("Feature Importances:")
for feature, importance in zip(features, sorted(result, reverse=True)):
    print(f"{feature}: {importance}")

Feature Importances:
_Score: 0.9078070864005353
_ViewCount: 0.06068359449889784
_AnswerCount: 0.031195081911140638
_CommentCount: 0.0003142371894263021
