# Predictive Analysis

In [1]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when, col, explode, max, avg, count
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType, StringType
from pyspark.sql.functions import udf

import os
import sys

# Point both the PySpark worker and driver interpreter settings at the
# Python executable that is running this kernel.
for env_var in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_var] = sys.executable

# Create (or reuse) the Spark session; the spark-xml package is requested so
# the XML files can be read through the "com.databricks.spark.xml" format.
spark = (
    SparkSession.builder
    .appName("Forum Question Analyzer")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)

In [2]:
def _read_xml_rows(root_tag, path, alias):
    """Read an XML file with spark-xml and return it as an aliased DataFrame.

    Args:
        root_tag: value passed as the reader's ``rootTag`` option.
        path: location of the XML file to load.
        alias: alias attached to the resulting DataFrame.

    Returns:
        A DataFrame built from the file's ``row`` elements (``rowTag`` is "row").
    """
    reader = (
        spark.read
        .format("com.databricks.spark.xml")
        .option("rootTag", root_tag)
        .option("rowTag", "row")
    )
    return reader.load(path).alias(alias)


posts = _read_xml_rows("posts", "tex.stackexchange.com/Posts.xml", 'posts')
users = _read_xml_rows("users", "tex.stackexchange.com/Users.xml", 'users')
tags = _read_xml_rows("tags", "tex.stackexchange.com/Tags.xml", 'tags')

### Feature Extraction

In [3]:
import re

# Non-greedy match of everything between a "<" and the next ">".
# "." does not match newlines, so a tag split across lines is left untouched.
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed.

    Args:
        raw_html: text that may contain HTML tags, or ``None``.

    Returns:
        The text without tags. ``None`` yields an empty string so that callers
        measuring the cleaned text (e.g. a Spark UDF receiving a SQL NULL) do
        not fail with a ``TypeError``.
    """
    if raw_html is None:
        return ''
    return CLEANR.sub('', raw_html)

def _html_text_length(s):
    """Return the length of ``s`` after stripping HTML tags; 0 when ``s`` is None."""
    # Spark passes SQL NULL to Python UDFs as None. Guard here so one missing
    # title/body does not raise inside the Python worker and abort the job.
    if s is None:
        return 0
    return len(cleanhtml(s))


def _split_tags(s):
    """Split a ``<tag1><tag2>`` string into ``["tag1", "tag2"]``; None stays None."""
    if s is None:
        return None
    # Drop the first and last character (the outer "<" and ">") and split on
    # the "><" that separates adjacent tags.
    return s[1:-1].split("><")


def _is_proper_title(s):
    """Return 1 if ``s`` starts with an uppercase character and ends with "?", else 0."""
    # A null or empty title cannot satisfy either condition; checking
    # truthiness first also avoids indexing into an empty string.
    if not s:
        return 0
    return 1 if (s[-1] == "?" and s[0].isupper()) else 0


# Spark UDF wrappers used by the feature-extraction cells.
html_string_len = udf(_html_text_length, IntegerType())

cleantags = udf(_split_tags, ArrayType(StringType()))

proper = udf(_is_proper_title, IntegerType())

In [4]:
# Keep only rows with _PostTypeId == 1 and only then parse their tag strings,
# so the Python UDF is not asked to process rows of other post types.
# NOTE: this cell overwrites `posts` and turns `_Tags` into an array column;
# re-run the loading cell before re-running this one.
posts = posts.filter(posts._PostTypeId == 1) \
    .withColumn("_Tags", cleantags(col("_Tags")))

# One row per (post id, tag). `posts` is already filtered above, so no second
# filter is needed (and `_PostTypeId` is not part of this projection anyway).
posts_tags = posts.select(
    col("_Id").alias("_Id"),
    explode(col("_Tags")).alias("tag"),
)

# Join every tag to its `_Count` from the tags table and summarise per post.
# `max`, `avg` and `count` are the pyspark.sql.functions aggregates imported
# in the first cell, not the Python builtins.
posts_tags_score = (
    posts_tags.join(tags, posts_tags.tag == tags._TagName)
    .select(posts_tags._Id, tags._Count)
    .groupby(posts_tags._Id)
    .agg(
        max(tags._Count).alias("max_tag_count"),
        avg(tags._Count).alias("avg_tag_count"),
        count(tags._Count).alias("number_of_tags"),
    )
)

In [5]:
# Attach the per-post tag statistics, then the row of the user who owns the
# post. Both joins use the default (inner) join type, so posts without a
# matching tag summary or user row are not carried forward.
posts_with_tag_stats = posts.join(
    posts_tags_score, posts._Id == posts_tags_score._Id
)
posts_with_authors = posts_with_tag_stats.join(
    users, posts._OwnerUserId == users._Id
)

# Label: 1 when the post has an accepted answer id, 0 otherwise.
accepted_label = when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1)

# Final modelling table: one row per question with its features and label.
questions = posts_with_authors.select(
    posts._Id.alias("question_id"),
    html_string_len(col("_Title")).alias("title_length"),
    html_string_len(col("_Body")).alias("question_length"),
    col("_Reputation").alias("author_reputation"),
    col("max_tag_count"),
    col("avg_tag_count"),
    col("number_of_tags"),
    proper(col("_Title")).alias("proper_title"),
    accepted_label.alias("accepted"),
)

In [6]:
# Columns of `questions` that are fed to the models.
features = [
    'title_length',
    'question_length',
    'author_reputation',
    'max_tag_count',
    'avg_tag_count',
    'number_of_tags',
    'proper_title',
]

# Packs the feature columns into the single vector column "features".
assembler = VectorAssembler(outputCol="features", inputCols=features)

### Data Preparation

### Model Training

In [7]:
# Random 70/30 split of the questions; the fixed seed is passed so the split
# can be repeated.
split_weights = [0.7, 0.3]
train, test = questions.randomSplit(split_weights, seed=12345)

In [8]:
# Logistic Regression model: assemble the feature vector, then fit on `train`
# with "accepted" as the label.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="accepted",
)
lr_pipeline = Pipeline(stages=[assembler, lr])
lr_model = lr_pipeline.fit(train)

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "c:\Users\jurek\anaconda3\lib\socket.py", line 707, in readinto
    raise
socket.timeout: timed out


In [None]:
# Random Forest model: 10 trees, same assembler stage and label column as the
# other models.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="accepted",
    numTrees=10,
)
rf_pipeline = Pipeline(stages=[assembler, rf])
rf_model = rf_pipeline.fit(train)

In [None]:
# Gradient Boosting model: at most 10 boosting iterations, same assembler stage
# and label column as the other models.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="accepted",
    maxIter=10,
)
gbt_pipeline = Pipeline(stages=[assembler, gbt])
gbt_model = gbt_pipeline.fit(train)

In [None]:
# Neural Network model
# Layer sizes: one input per feature, hidden layers of 10 and 5 units, and
# 2 outputs. Adjust layer sizes as needed.
n_inputs = len(features)
layers = [n_inputs, 10, 5, 2]
nn = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="accepted",
    layers=layers,
    blockSize=128,
    seed=1234,
)
nn_pipeline = Pipeline(stages=[assembler, nn])
nn_model = nn_pipeline.fit(train)

### Predictions

In [None]:
# Apply the fitted logistic-regression pipeline to the held-out `test` split.
lr_predictions = lr_model.transform(test)
# The remaining models are currently disabled; uncomment once the
# corresponding *_model variables have been fitted.
# rf_predictions = rf_model.transform(test)
# gbt_predictions = gbt_model.transform(test)
# nn_predictions = nn_model.transform(test)

### Model Evaluation

In [None]:
# Accuracy of the predictions against the "accepted" label column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="accepted",
)

print('===== Accuracy =====')
lr_accuracy = evaluator.evaluate(lr_predictions)
print('Logistic Regression:', lr_accuracy)
# Disabled until the matching *_predictions variables are produced:
# print('Random Forest:      ', evaluator.evaluate(rf_predictions))
# print('Gradient Boosting:  ', evaluator.evaluate(gbt_predictions))
# print('Neural Network:     ', evaluator.evaluate(nn_predictions))

===== Accuracy =====
Logistic Regression: 0.6023578712851592


In [None]:
# Preview the first 20 rows of the feature table (the defaults of show()).
questions.show(n=20, truncate=True)

+-----------+------------+---------------+-----------------+-------------+------------------+--------------+------------+--------+
|question_id|title_length|question_length|author_reputation|max_tag_count|     avg_tag_count|number_of_tags|proper_title|accepted|
+-----------+------------+---------------+-----------------+-------------+------------------+--------------+------------+--------+
|      13732|          38|           1967|             1743|         7871|            3998.5|             2|           1|       0|
|      37840|          46|            419|              465|         7871|            5318.0|             4|           0|       1|
|      44322|          37|            281|             2583|        11290| 7950.333333333333|             3|           0|       1|
|      47818|          56|            584|               21|         9582|            7578.0|             2|           0|       0|
|      53921|          86|            243|               21|         6940|         

In [None]:
# Preview the first 20 rows of the training split (the defaults of show()).
train.show(n=20, truncate=True)

+-----------+------------+---------------+-----------------+-------------+------------------+--------------+------------+--------+
|question_id|title_length|question_length|author_reputation|max_tag_count|     avg_tag_count|number_of_tags|proper_title|accepted|
+-----------+------------+---------------+-----------------+-------------+------------------+--------------+------------+--------+
|         50|          64|            270|             3475|          644|             475.0|             3|           1|       1|
|         77|          46|            380|             4129|         3251|            1827.0|             2|           0|       1|
|        167|          50|           1978|            23813|         4426|            2575.0|             3|           1|       1|
|        196|          17|            294|             3475|         6023|            5010.5|             2|           0|       1|
|        222|          67|            629|            16311|         6940|         