# Predictive Analysis

In [1]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.sql.functions import when, col, array_max, udf, expr, size, datediff
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, ArrayType, FloatType

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.appName("Forum Question Analyzer") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")\
    .getOrCreate()

posts = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "posts") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Posts.xml") \
    .alias('posts')
users = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "users") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Users.xml") \
    .alias('users')
tags = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "tags") \
    .option("rowTag", "row") \
    .load("tex.stackexchange.com/Tags.xml") \
    .alias('tags')

### Feature Extraction

In [3]:
#UDF - Fast way to change tag names to tag counts from Tags table

tag_counts = tags.select("_TagName", "_Count").rdd.collectAsMap()

def replace_tags_with_counts(tags):
    return [tag_counts.get(tag, 0) for tag in tags]

replace_tags_with_counts_udf = udf(replace_tags_with_counts, ArrayType(IntegerType()))

In [4]:
posts = posts.filter(posts._PostTypeId==1)
questions = posts.withColumn("_Tags", expr("split(substring(_Tags, 2, length(_Tags) - 2), '><')"))\
            .withColumn("_Body", size(expr("split(_Body, ' ')")))\
            .withColumn("_Title", size(expr("split(_Title, ' ')")))

questions = questions.join(users, questions._OwnerUserId == users._Id).select(
    questions._Id.alias("QuestionId"),
    questions._Body.alias("BodyLength"),
    questions._Title.alias("TitleLength"),
    array_max(replace_tags_with_counts_udf(questions._Tags)).alias("TagsCountMax"),
    size(questions._Tags).alias("NumberOfTags"),
    users._Id.alias("OwnerId"),
    users._DownVotes.alias("OwnerDownVotes"),
    users._UpVotes.alias("OwnerUpVotes"),
    users._Reputation.alias("OwnerReputation"),
    users._Views.alias("OwnerViews"),
    datediff(questions._CreationDate, users._CreationDate).alias("OwnerExperience"),
    when(col("_AcceptedAnswerId").isNull(), 0).otherwise(1).alias("Accepted")
)

questions = questions.filter(questions.OwnerExperience >= 0)

### Data Preparation

In [5]:
features = ['OwnerUpVotes', 'OwnerReputation', 'OwnerViews', 'OwnerExperience', 'TagsCountMax', 'OwnerDownVotes']
assembler = VectorAssembler(inputCols=features, outputCol="rawfeatures")
scaler = StandardScaler(inputCol="rawfeatures", outputCol="scaledFeatures", withMean=True, withStd=True)

In [6]:
train, test = questions.randomSplit([0.8, 0.2])

### Model Training

In [7]:
# Logistic Regression model
lr = LogisticRegression(labelCol="Accepted", featuresCol="scaledFeatures", maxIter=100)
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])
lr_model = lr_pipeline.fit(train)

In [8]:
# Random Forest model
rf = RandomForestClassifier(labelCol="Accepted", featuresCol="scaledFeatures", numTrees=100)
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])
rf_model = rf_pipeline.fit(train)

In [9]:
# Gradient Boosting model
gbt = GBTClassifier(labelCol="Accepted", featuresCol="scaledFeatures", maxIter=100)
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])
gbt_model = gbt_pipeline.fit(train)

In [10]:
# Neural Network model
layers = [len(features), 10, 8, 2]  # Adjust layer sizes as needed
nn = MultilayerPerceptronClassifier(labelCol="Accepted", featuresCol="scaledFeatures", layers=layers, blockSize=128)
nn_pipeline = Pipeline(stages=[assembler, scaler, nn])
nn_model = nn_pipeline.fit(train)

### Predictions

In [11]:
lr_predictions = lr_model.transform(test)
rf_predictions = rf_model.transform(test)
gbt_predictions = gbt_model.transform(test)
nn_predictions = nn_model.transform(test)

### Model Evaluation

In [12]:
def metrics(predictions):
    predictionAndLabels = predictions.select("prediction", "Accepted")\
        .rdd.map(lambda row: (float(row["prediction"]), float(row["Accepted"])))
    
    confmat = MulticlassMetrics(predictionAndLabels).confusionMatrix().toArray()

    TP = confmat[0, 0]
    FP = confmat[0, 1]
    FN = confmat[1, 0]
    TN = confmat[1, 1]

    #Bookmaker informedness
    if (TN + FP == 0) or (TP + FN == 0):
        BI = "BI measure cannot be calculated"
    else:
        TNR = TN / (TN + FP)
        TPR = TP / (TP + FN)
        BI = TPR + TNR - 1

    #Matthews correlation coefficient
    #Case when entire row/column is 0
    if (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN) == 0:
        MCC = 0
    else:
        MCC = (TP * TN - FP * FN) / ((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))**0.5

    #Accuracy
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    return [accuracy, BI, MCC]

In [13]:
print('===== Accuracy =====')
for name, pred in [
    ("logistic regression", lr_predictions),
    ("random forest      ", rf_predictions),
    ("gradient boosting  ", gbt_predictions),
    ("neural network     ", nn_predictions)]:
    x = metrics(pred)
    if isinstance(x[1], float):
        print(f'{name} : accuracy = {x[0]:.3%}, BI = {x[1]}, MCC = {x[2]:.4f}.')
    else:
        print(f'{name} : accuracy = {x[0]:.3%}, {x[1]}, MCC = {x[2]:.4f}.')

===== Accuracy =====




logistic regression : accuracy = 59.946%, BI measure cannot be calculated, MCC = 0.0000.
random forest       : accuracy = 70.370%, BI = 0.4245699672249277, MCC = 0.3638.
gradient boosting   : accuracy = 71.023%, BI = 0.4286597313739091, MCC = 0.3783.
neural network      : accuracy = 66.468%, BI = 0.3064367015966454, MCC = 0.2709.


### Extracting probability of getting an answer

In [14]:
#Training best model using entire data
best_model = gbt_pipeline.fit(questions)

In [15]:
def prob(values):
    colnames = ['BodyLength', 'TitleLength', 'TagsCountMax',
                'NumberOfTags', 'OwnerDownVotes', 'OwnerUpVotes',
                'OwnerReputation', 'OwnerViews', 'OwnerExperience']
    example = spark.createDataFrame([values], colnames)
    p = best_model.transform(example)
    return p.collect()[0].__getitem__("probability")[1]

In [16]:
#example using most recent posts, not yet in posts dataframe (post 708098)
prob([50, 9, 9630, 3, 0, 7, 99, 303, 273])

0.5338136132319231