# Feature Engineering

In [7]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType
from pyspark.sql.functions import col, when, size, expr, udf, array_max
import sys, os

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [8]:

spark = SparkSession.builder.appName("Forum Question Analyzer") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0") \
    .getOrCreate()
posts = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "posts") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Posts.xml") \
    .alias('posts')
users = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "users") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Users.xml") \
    .alias('users')
tags = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "tags") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Tags.xml") \
    .alias('tags')

## Data preparation

In [9]:
tag_counts = tags.select("_TagName", "_Count").rdd.collectAsMap()
tags_udf = udf(
    lambda t: [tag_counts.get(tag, 0) for tag in t],
    ArrayType(IntegerType())
)

questions = posts.filter(posts._PostTypeId==1)
questions = questions.withColumn("_Tags", expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')"))\
            .withColumn("_Body", size(expr("split(_Body, ' ')")))\
            .withColumn("_Title", size(expr("split(_Title, ' ')")))
data = questions.join(users, questions._OwnerUserId == users._Id).select(
    questions._Id.alias("question_id"),
    questions._Body.alias("body_length"),
    questions._Title.alias("title_length"),
    array_max(tags_udf(questions._Tags)).alias("tags_count_max"),
    size(questions._Tags).alias("tags_number"),
    users._Id.alias("user_id"),
    users._DownVotes.alias("user_down_votes"),
    users._UpVotes.alias("user_up_votes"),
    users._Reputation.alias("user_reputation"),
    users._Views.alias("user_views"),
    (questions._CreationDate - users._CreationDate).cast("integer").alias("user_experience"),
    when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1).alias("accepted")
)

## Feature importances

In [10]:
features = ['body_length', 'title_length', 'tags_count_max', 'tags_number', 'user_down_votes', 'user_up_votes', 'user_reputation', 'user_views', 'user_experience']
assembler = VectorAssembler(inputCols=features, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")

In [11]:
rf = RandomForestClassifier(labelCol="accepted", featuresCol="scaled_features", numTrees=10)
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])
rf_model = rf_pipeline.fit(data)
result = rf_model.stages[-1].featureImportances.toArray()

In [12]:
print("Feature Importances:")
for feature, importance in zip(features, sorted(result, reverse=True)):
    print(f"{feature}: {importance}")

Feature Importances:
body_length: 0.6891810300157529
title_length: 0.15699713218171335
tags_count_max: 0.06042703450024815
tags_number: 0.05236858928918564
user_down_votes: 0.026774196119441474
user_up_votes: 0.011687891654931143
user_reputation: 0.002392938520697858
user_views: 0.00017118771802947637
user_experience: 0.0
