# Predictive Analysis

In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when, col, explode, max, avg, count, udf, expr, size, datediff
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType, StringType

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.appName("Forum Question Analyzer") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")\
    .getOrCreate()

posts = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "posts") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Posts.xml") \
    .alias('posts')
users = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "users") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Users.xml") \
    .alias('users')
tags = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "tags") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Tags.xml") \
    .alias('tags')

### Feature Extraction

In [3]:
posts = posts.filter(posts._PostTypeId==1)
questions = posts.withColumn("_Tags", expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')"))\
            .withColumn("_Body", size(expr("split(_Body, ' ')")))\
            .withColumn("_Title", size(expr("split(_Title, ' ')")))

questions = questions.join(users, questions._OwnerUserId == users._Id).select(
    questions._Id.alias("QuestionId"),
    questions._Body.alias("BodyLength"),
    questions._Title.alias("TitleLength"),
    questions._Tags.alias("TagsCounts"),
    size(questions._Tags).alias("NumberOfTags"),
    users._Id.alias("OwnerId"),
    users._DownVotes.alias("OwnerDownVotes"),
    users._UpVotes.alias("OwnerUpVotes"),
    users._Reputation.alias("OwnerReputation"),
    users._Views.alias("OwnerViews"),
    (questions._CreationDate - users._CreationDate).cast("integer").alias("UserExperience"),
    when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1).alias("Accepted")
)

In [4]:
#Fast way to change tag names to tag counts from Tags table

tag_counts = tags.select("_TagName", "_Count").rdd.collectAsMap()

def replace_tags_with_counts(tags):
    return [tag_counts.get(tag, 0) for tag in tags]

replace_tags_with_counts_udf = udf(replace_tags_with_counts, ArrayType(IntegerType()))

questions = questions.withColumn("TagsCounts", replace_tags_with_counts_udf(questions.TagsCounts))

In [5]:
questions.show(5, truncate=False)
# posts_tags = posts.select(col("_Id").alias("_Id"),
#         explode(col('_Tags')).alias("tag"))\
#         .filter(posts._PostTypeId == 1)

# posts_tags_score = \
#         posts_tags.join(tags, posts_tags.tag == tags._TagName)\
#         .select(posts_tags._Id, tags._Count).groupby(posts_tags._Id)\
#                 .agg(max(tags._Count).alias("max_tag_count"),
#                      avg(tags._Count).alias("avg_tag_count"),
#                      count(tags._Count).alias("number_of_tags"))


+----------+----------+-----------+---------------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|QuestionId|BodyLength|TitleLength|TagsCounts           |NumberOfTags|OwnerId|OwnerDownVotes|OwnerUpVotes|OwnerReputation|OwnerViews|UserExperience|Accepted|
+----------+----------+-----------+---------------------+------------+-------+--------------+------------+---------------+----------+--------------+--------+
|515       |253       |11         |[1424, 446, 198, 879]|4           |22     |0             |1           |183            |14        |173490        |1       |
|36756     |40        |7          |[2906, 2364, 1410]   |3           |29     |1             |32          |15947          |415       |42537642      |1       |
|611902    |95        |6          |[3490]               |1           |29     |1             |32          |15947          |415       |349653635     |0       |
|148       |45        |5          |[11290, 2858, 815

In [6]:
features = ['title_length', 'question_length', 'author_reputation',
            'max_tag_count', 'avg_tag_count', 'number_of_tags', 'proper_title']
assembler = VectorAssembler(inputCols=features, outputCol="features")

### Data Preparation

### Model Training

In [7]:
train, test = questions.randomSplit([0.7, 0.3], seed=12345)

In [8]:
# Logistic Regression model
lr = LogisticRegression(labelCol="accepted", featuresCol="features")
lr_pipeline = Pipeline(stages=[assembler, lr])
lr_model = lr_pipeline.fit(train)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\Users\jurek\anaconda3\lib\socket.py", line 707, in readinto
    raise
socket.timeout: timed out


In [None]:
# Random Forest model
rf = RandomForestClassifier(labelCol="accepted", featuresCol="features", numTrees=10)
rf_pipeline = Pipeline(stages=[assembler, rf])
rf_model = rf_pipeline.fit(train)

In [None]:
# Gradient Boosting model
gbt = GBTClassifier(labelCol="accepted", featuresCol="features", maxIter=10)
gbt_pipeline = Pipeline(stages=[assembler, gbt])
gbt_model = gbt_pipeline.fit(train)

In [None]:
# Neural Network model
layers = [len(features), 10, 5, 2]  # Adjust layer sizes as needed
nn = MultilayerPerceptronClassifier(labelCol="accepted", featuresCol="features", layers=layers, blockSize=128, seed=1234)
nn_pipeline = Pipeline(stages=[assembler, nn])
nn_model = nn_pipeline.fit(train)

### Predictions

In [None]:
lr_predictions = lr_model.transform(test)
rf_predictions = rf_model.transform(test)
gbt_predictions = gbt_model.transform(test)
nn_predictions = nn_model.transform(test)

### Model Evaluation

In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="accepted", metricName="accuracy")
print('===== Accuracy =====')
print('Logistic Regression:', evaluator.evaluate(lr_predictions))
print('Random Forest:      ', evaluator.evaluate(rf_predictions))
print('Gradient Boosting:  ', evaluator.evaluate(gbt_predictions))
print('Neural Network:     ', evaluator.evaluate(nn_predictions))

===== Accuracy =====
Logistic Regression: 0.6023578712851592
