# Feature Engineering

In [1]:
from pyspark.sql.functions import col, when, size, expr, udf, array_max, datediff
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.types import IntegerType, ArrayType
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
import sys
import os

# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel, so they all use the same Python installation.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
def _load_xml_table(session, root_tag, path, alias):
    """Read one XML file through the spark-xml data source.

    Every ``row`` element becomes one DataFrame row. The returned frame is
    aliased so its columns can be qualified after joins.
    """
    reader = session.read.format("com.databricks.spark.xml")
    reader = reader.option("rootTag", root_tag).option("rowTag", "row")
    return reader.load(path).alias(alias)


# Session with the spark-xml package on the classpath.
spark = (
    SparkSession.builder
    .appName("Forum Question Analyzer")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)

# The three tables used by the rest of the notebook.
posts = _load_xml_table(spark, "posts", "tex.stackexchange.com/Posts.xml", 'posts')
users = _load_xml_table(spark, "users", "tex.stackexchange.com/Users.xml", 'users')
tags = _load_xml_table(spark, "tags", "tex.stackexchange.com/Tags.xml", 'tags')

## Data preparation

In [3]:
# UDF to replace tags with their counts
# The tag table is collected to the driver as a {tag name: count} dict;
# the UDF below looks tag names up in that dict.
tag_counts = tags.select("_TagName", "_Count").rdd.collectAsMap()


def replace_tags(tags):
    """Translate a list of tag names into the list of their counts.

    Args:
        tags: list of tag-name strings, or None when the column is NULL.
            (The parameter name shadows the module-level ``tags`` DataFrame
            inside this function only; it is kept for compatibility.)

    Returns:
        A list with one count per input tag, in the same order. Tags that
        are missing from ``tag_counts`` map to 0. A NULL input yields None,
        so the UDF produces NULL instead of raising ``TypeError`` in the
        Python worker.
    """
    if tags is None:
        return None
    return [tag_counts.get(tag, 0) for tag in tags]


replace_tags_udf = udf(replace_tags, ArrayType(IntegerType()))

In [4]:
# Joining tables and extracting features
# Restrict posts to rows with _PostTypeId == 1; these are the questions below.
posts = posts.filter(col("_PostTypeId") == 1)

# Column rewrites: the tag string becomes an array (outer delimiters stripped,
# then split on '><'); body and title become space-separated token counts.
tag_array = expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')")
body_tokens = size(expr("split(_Body, ' ')"))
title_tokens = size(expr("split(_Title, ' ')"))

prepared = (
    posts
    .withColumn("_Tags", tag_array)
    .withColumn("_Body", body_tokens)
    .withColumn("_Title", title_tokens)
)

# Inner join: each question is paired with the user row of its owner.
owner_match = prepared._OwnerUserId == users._Id

feature_columns = [
    prepared._Id.alias("QuestionId"),
    prepared._Body.alias("BodyLength"),
    prepared._Title.alias("TitleLength"),
    # Largest per-tag count among the question's tags.
    array_max(replace_tags_udf(prepared._Tags)).alias("TagsCountMax"),
    size(prepared._Tags).alias("NumberOfTags"),
    users._Id.alias("OwnerId"),
    users._DownVotes.alias("OwnerDownVotes"),
    users._UpVotes.alias("OwnerUpVotes"),
    users._Reputation.alias("OwnerReputation"),
    users._Views.alias("OwnerViews"),
    # Days between the owner's creation date and the question's creation date.
    datediff(prepared._CreationDate, users._CreationDate).alias("OwnerExperience"),
    # Label: 1 when an accepted answer id is present, else 0.
    when(col("_AcceptedAnswerId").isNotNull(), 1).otherwise(0).alias("Accepted"),
]

# Drop rows where the question would predate its owner's creation date.
questions = (
    prepared
    .join(users, owner_match)
    .select(*feature_columns)
    .filter(col("OwnerExperience") >= 0)
)

## Feature importances

In [5]:
# Input columns for the model, in the order they are packed into the vector.
features = [
    'BodyLength',
    'TitleLength',
    'NumberOfTags',
    'TagsCountMax',
    'OwnerUpVotes',
    'OwnerDownVotes',
    'OwnerReputation',
    'OwnerViews',
    'OwnerExperience',
]

# Pack the columns into one vector, then centre and scale each component.
assembler = VectorAssembler(outputCol="rawfeatures", inputCols=features)
scaler = StandardScaler(
    inputCol="rawfeatures",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=True,
)

In [6]:
# Random forest on the scaled vector, predicting the "Accepted" label.
# The seed is fixed explicitly.
rf = RandomForestClassifier(
    featuresCol="scaledFeatures",
    labelCol="Accepted",
    numTrees=100,
    maxDepth=7,
    seed=57268,
)

# assemble -> scale -> classify, fitted on the full questions frame.
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])
rf_model = rf_pipeline.fit(questions)

# The last pipeline stage is the fitted forest; take its importances as an array.
fitted_forest = rf_model.stages[-1]
result = fitted_forest.featureImportances.toArray()

In [7]:
# Report every feature with its importance as a percentage, largest first.
print("Feature Importances:")
ranked = list(zip(features, result))
ranked.sort(key=lambda pair: pair[1], reverse=True)
for feature, importance in ranked:
    print(f"{feature}: {importance:.2%}")

Feature Importances:
OwnerUpVotes: 45.80%
OwnerReputation: 33.41%
OwnerViews: 11.07%
OwnerExperience: 4.57%
TagsCountMax: 2.34%
OwnerDownVotes: 2.04%
BodyLength: 0.53%
NumberOfTags: 0.13%
TitleLength: 0.12%
