# Feature Engineering

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType
from pyspark.sql.functions import col, when, size, expr, udf, array_max

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.appName("Forum Question Analyzer") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0") \
    .getOrCreate()
posts = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "posts") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Posts.xml") \
    .alias('posts')
users = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "users") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Users.xml") \
    .alias('users')
tags = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "tags") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Tags.xml") \
    .alias('tags')

## Data preparation

In [3]:
#UDF - Fast way to change tag names to tag counts from Tags table

tag_counts = tags.select("_TagName", "_Count").rdd.collectAsMap()

def replace_tags_with_counts(tags):
    return [tag_counts.get(tag, 0) for tag in tags]

replace_tags_with_counts_udf = udf(replace_tags_with_counts, ArrayType(IntegerType()))

posts = posts.filter(posts._PostTypeId==1)
questions = posts.withColumn("_Tags", expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')"))\
            .withColumn("_Body", size(expr("split(_Body, ' ')")))\
            .withColumn("_Title", size(expr("split(_Title, ' ')")))

data = questions.join(users, questions._OwnerUserId == users._Id).select(
    questions._Id.alias("QuestionId"),
    questions._Body.alias("BodyLength"),
    questions._Title.alias("TitleLength"),
    array_max(replace_tags_with_counts_udf(questions._Tags)).alias("TagsCountMax"),
    size(questions._Tags).alias("NumberOfTags"),
    users._Id.alias("OwnerId"),
    users._DownVotes.alias("OwnerDownVotes"),
    users._UpVotes.alias("OwnerUpVotes"),
    users._Reputation.alias("OwnerReputation"),
    users._Views.alias("OwnerViews"),
    (questions._CreationDate - users._CreationDate).cast("integer").alias("UserExperience"),
    when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1).alias("Accepted")
)

In [4]:
data.show(10)

+----------+----------+-----------+------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|QuestionId|BodyLength|TitleLength|TagsCountMax|NumberOfTags|OwnerId|OwnerDownVotes|OwnerUpVotes|OwnerReputation|OwnerViews|UserExperience|Accepted|
+----------+----------+-----------+------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|       515|       253|         11|        1424|           4|     22|             0|           1|            183|        14|        173490|       1|
|     36756|        40|          7|        2906|           3|     29|             1|          32|          15947|       415|      42537642|       1|
|    611902|        95|          6|        3490|           1|     29|             1|          32|          15947|       415|     349653635|       0|
|       148|        45|          5|       11290|           3|     34|             3|          57|         

## Feature importances

In [5]:
features = ['BodyLength', 'TitleLength', 'NumberOfTags', 'TagsCountMax', 'OwnerUpVotes',
            'OwnerDownVotes', 'OwnerReputation', 'OwnerViews', 'UserExperience',]
assembler = VectorAssembler(inputCols=features, outputCol="rawfeatures")
scaler = StandardScaler(inputCol="rawfeatures", outputCol="scaledFeatures", withMean=True, withStd=True)

In [6]:
rf = RandomForestClassifier(labelCol="Accepted", featuresCol="scaledFeatures", maxDepth=7, numTrees=100)
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])
rf_model = rf_pipeline.fit(data)
result = rf_model.stages[-1].featureImportances.toArray()

In [7]:
print("Feature Importances:")
for feature, importance in sorted(zip(assembler.getInputCols(), result), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance}")

Feature Importances:
OwnerUpVotes: 0.46515636498991775
OwnerReputation: 0.3318528331321772
OwnerViews: 0.08485273230819215
UserExperience: 0.07107258602043508
TagsCountMax: 0.021882701144786465
OwnerDownVotes: 0.018208632240339393
BodyLength: 0.00477985871919592
TitleLength: 0.0011267684879300351
NumberOfTags: 0.0010675229570261066
