# Predictive Analysis

In [2]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import when, col, array_max, udf, expr, size
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType

import os
import sys

# Make PySpark use the interpreter that is running this notebook, for both the
# worker processes and the driver.
for _pyspark_env_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[_pyspark_env_var] = sys.executable

In [3]:
# Start (or reuse) a Spark session with the spark-xml package on the classpath.
spark = (
    SparkSession.builder
    .appName("Forum Question Analyzer")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)


def _read_stackexchange_xml(root_tag, path, alias):
    """Read one XML dump file into a DataFrame and give it an alias.

    Args:
        root_tag: Name of the document's root element.
        path: Location of the XML file.
        alias: Alias attached to the resulting DataFrame.

    Returns:
        The aliased DataFrame, with one record per ``row`` element.
    """
    reader = spark.read.format("com.databricks.spark.xml")
    reader = reader.option("rootTag", root_tag).option("rowTag", "row")
    return reader.load(path).alias(alias)


# The three tables used by the analysis.
posts = _read_stackexchange_xml("posts", "tex.stackexchange.com/Posts.xml", 'posts')
users = _read_stackexchange_xml("users", "tex.stackexchange.com/Users.xml", 'users')
tags = _read_stackexchange_xml("tags", "tex.stackexchange.com/Tags.xml", 'tags')

### Feature Extraction

In [4]:
# Lookup table for the tag UDF: tag name -> count, collected from the Tags table
# onto the driver so each row can be translated with a plain dictionary lookup.

tag_counts = {
    tag_row["_TagName"]: tag_row["_Count"]
    for tag_row in tags.select("_TagName", "_Count").collect()
}

def replace_tags_with_counts(tags):
    """Translate a list of tag names into their counts from ``tag_counts``.

    Args:
        tags: Iterable of tag names, or ``None`` when the row has no tag array.

    Returns:
        A list with one count per input tag, in input order. A tag that is not a
        key of ``tag_counts`` contributes 0. ``None`` input yields an empty list.
    """
    # A NULL array column arrives here as None; iterating it would raise a
    # TypeError inside the UDF and fail the whole Spark job.
    if tags is None:
        return []
    return [tag_counts.get(tag, 0) for tag in tags]

# Expose the Python helper to Spark as a UDF returning an array of integers.
replace_tags_with_counts_udf = udf(
    f=replace_tags_with_counts,
    returnType=ArrayType(IntegerType()),
)

In [5]:
# Keep only posts whose _PostTypeId is 1 (the rows treated as questions below).
posts = posts.filter(posts._PostTypeId == 1)

# Replace the raw text columns with simple numeric/array forms:
#   _Tags  -> array of tag names (first and last character stripped, split on '><')
#   _Body  -> number of space-separated tokens
#   _Title -> number of space-separated tokens
questions = (
    posts
    .withColumn("_Tags", expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')"))
    .withColumn("_Body", size(expr("split(_Body, ' ')")))
    .withColumn("_Title", size(expr("split(_Title, ' ')")))
)

# Attach each question's owner; the inner join drops questions without a matching user.
with_owner = questions.join(users, questions._OwnerUserId == users._Id)

# Derived columns, built from the pre-join `questions` frame and `users`.
max_tag_count = array_max(replace_tags_with_counts_udf(questions._Tags))
account_age = (questions._CreationDate - users._CreationDate).cast("integer")
accepted_flag = when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1)

questions = with_owner.select(
    questions._Id.alias("QuestionId"),
    questions._Body.alias("BodyLength"),
    questions._Title.alias("TitleLength"),
    max_tag_count.alias("TagsCountMax"),
    size(questions._Tags).alias("NumberOfTags"),
    users._Id.alias("OwnerId"),
    users._DownVotes.alias("OwnerDownVotes"),
    users._UpVotes.alias("OwnerUpVotes"),
    users._Reputation.alias("OwnerReputation"),
    users._Views.alias("OwnerViews"),
    account_age.alias("UserExperience"),
    accepted_flag.alias("Accepted"),  # label: 1 when an accepted answer id is present
)

In [6]:
# Preview the first 10 rows of the engineered feature table.
questions.show(10)

+----------+----------+-----------+------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|QuestionId|BodyLength|TitleLength|TagsCountMax|NumberOfTags|OwnerId|OwnerDownVotes|OwnerUpVotes|OwnerReputation|OwnerViews|UserExperience|Accepted|
+----------+----------+-----------+------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|       515|       253|         11|        1424|           4|     22|             0|           1|            183|        14|        173490|       1|
|     36756|        40|          7|        2906|           3|     29|             1|          32|          15947|       415|      42537642|       1|
|    611902|        95|          6|        3490|           1|     29|             1|          32|          15947|       415|     349653635|       0|
|       148|        45|          5|       11290|           3|     34|             3|          57|         

In [28]:
# Columns used as model inputs.
# NOTE(review): BodyLength and TitleLength are computed during feature extraction
# but are not listed here — confirm the omission is intentional.
features = [
    'NumberOfTags',
    'TagsCountMax',
    'OwnerUpVotes',
    'OwnerDownVotes',
    'OwnerReputation',
    'OwnerViews',
    'UserExperience',
]

# Shared pipeline stages: pack the columns into one vector, then standardise it
# (centred with withMean, scaled with withStd).
assembler = VectorAssembler(inputCols=features, outputCol="rawfeatures")
scaler = StandardScaler(
    inputCol="rawfeatures",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True,
)

### Data Preparation

### Model Training

In [56]:
# 80/20 train/test split. The seed is pinned so the same split is drawn on every
# run; without it the reported accuracies change from one execution to the next.
train, test = questions.randomSplit([0.8, 0.2], seed=1234)

In [57]:
# Logistic Regression model: assemble -> scale -> classify, fitted on the training split.
lr = LogisticRegression(
    featuresCol="scaledFeatures",
    labelCol="Accepted",
    maxIter=100,
)
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])
lr_model = lr_pipeline.fit(train)

In [58]:
# Random Forest model: assemble -> scale -> classify, fitted on the training split.
# The seed is set explicitly (same value as the neural-network cell) so the
# forest's random sampling, and therefore its accuracy, is repeatable across runs.
rf = RandomForestClassifier(
    labelCol="Accepted",
    featuresCol="scaledFeatures",
    numTrees=100,
    seed=1234,
)
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])
rf_model = rf_pipeline.fit(train)

In [59]:
# Gradient Boosting model: assemble -> scale -> classify, fitted on the training split.
gbt = GBTClassifier(
    featuresCol="scaledFeatures",
    labelCol="Accepted",
    maxIter=100,
)
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])
gbt_model = gbt_pipeline.fit(train)

In [60]:
# Neural Network model
# Layer sizes: one input unit per feature, then 10 and 5 units, then 2 output
# units for the 0/1 Accepted label. Adjust layer sizes as needed.
layers = [len(features), 10, 5, 2]
nn = MultilayerPerceptronClassifier(
    featuresCol="scaledFeatures",
    labelCol="Accepted",
    layers=layers,
    blockSize=128,
    seed=1234,
)
nn_pipeline = Pipeline(stages=[assembler, scaler, nn])
nn_model = nn_pipeline.fit(train)

### Predictions

In [61]:
# Score the held-out test split with every fitted pipeline.
lr_predictions, rf_predictions, gbt_predictions, nn_predictions = (
    fitted.transform(test)
    for fitted in (lr_model, rf_model, gbt_model, nn_model)
)

### Model Evaluation

In [62]:
# Accuracy of each model on the test split, measured against the Accepted label.
evaluator = MulticlassClassificationEvaluator(labelCol="Accepted", metricName="accuracy")

# Labels are padded to a common width so the numbers line up in the output.
_scored_models = [
    ('Logistic Regression:', lr_predictions),
    ('Random Forest:      ', rf_predictions),
    ('Gradient Boosting:  ', gbt_predictions),
    ('Neural Network:     ', nn_predictions),
]

print('===== Accuracy =====')
for _label, _predictions in _scored_models:
    print(_label, evaluator.evaluate(_predictions))

===== Accuracy =====
Logistic Regression: 0.6027293555134974
Random Forest:       0.7045920908457017
Gradient Boosting:   0.7134575156888137
Neural Network:      0.66231696384102
