In [None]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline
%matplotlib inline

In [None]:
# set random seed
# Seed both the stdlib and NumPy generators from one constant so that any
# sampling done with either library is repeatable on a fresh kernel.
import random

SEED = 335
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Let pandas display up to 500 rows and 500 columns before truncating output.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

In [None]:
# for better viz
import pprint
import warnings
warnings.filterwarnings('ignore')

### reference
-------------------

- [pandas cheat sheet](https://github.com/pandas-dev/pandas/tree/master/doc/cheatsheet)
- [numpy cheat sheet(data camp)](https://www.datacamp.com/community/blog/python-numpy-cheat-sheet)
- [scikit-learn cheat sheet(data camp)](https://www.datacamp.com/community/blog/scikit-learn-cheat-sheet)

# modeling
---------------------
In this phase, various modeling techniques are selected and applied and their parameters are calibrated to optimal values. Typically, there are several techniques for the same data mining problem type. Some techniques have specific requirements on the form of data. Therefore, stepping back to the data preparation phase is often necessary.

## select modeling technique
----------

I used two methods: linear regression and gradient boosting.

The first was used only as a baseline; the second was tested as the potentially useful model.

## generate test design
------------------


In [96]:
# 70/30 train/test split of `data`; the seed is passed explicitly to randomSplit.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=10)


# A function to run commands
import os
import sys


def run(command):
    """Run ``command`` through the shell and return its standard output as text.

    Args:
        command: Shell command line, passed to ``os.popen`` unchanged.

    Returns:
        Everything the command wrote to standard output (``''`` if nothing).

    A failing command does not raise. A non-zero exit status is reported on
    stderr so that a failed export is not mistaken for a successful one.
    """
    pipe = os.popen(command)
    try:
        output = pipe.read()
    finally:
        # close() waits for the process; it returns None when the exit code is 0.
        exit_status = pipe.close()
    if exit_status is not None:
        print(f"Command failed (status {exit_status}): {command}", file=sys.stderr)
    return output

# Write each split (scaled features + label) to HDFS as JSON from a single
# partition, then concatenate the resulting part file(s) into a local JSON file.
(
    train_data
    .select("scaledFeatures", "label")
    .coalesce(1)
    .write
    .save("project/data/train", format="json", mode="overwrite")
)

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > data/train.json")

(
    test_data
    .select("scaledFeatures", "label")
    .coalesce(1)
    .write
    .save("project/data/test", format="json", mode="overwrite")
)

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > data/test.json")

''

## build model
----------


In [97]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.regression import LinearRegression

# Baseline model: linear regression on the scaled feature vector.
# train_data / test_data are produced by the randomSplit in the test-design cell.
lr = LinearRegression(featuresCol="scaledFeatures", labelCol="label")

# Search grid: 2 regularisation strengths x 3 elastic-net mixing values = 6 candidates.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# 5-fold cross-validation scored by RMSE.
# The seed is set explicitly so the fold assignment does not depend on a
# session-specific default and the search can be reproduced after a restart.
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse"),
                          numFolds=5,
                          seed=10)


In [98]:
# Fit the cross-validator on the training split (training, not prediction);
# the fitted result is stored in cvModel and used for prediction in the next cell.
cvModel = crossval.fit(train_data)

In [100]:
from pyspark.ml.evaluation import RegressionEvaluator
# F.abs is used below; import it here so the cell does not rely on a name
# left over in the kernel from elsewhere.
from pyspark.sql import functions as F

# Score the held-out test split with the fitted cross-validation model.
predictions = cvModel.transform(test_data)

# Evaluate the performance of the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
evaluator_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R²) on test data = %g" % r2)

# Calculate the absolute error
predictions = predictions.withColumn("error", F.abs(predictions["label"] - predictions["prediction"]))

# Calculate the median error (approximate quantile, relative error 0.01)
median_error = predictions.approxQuantile("error", [0.5], 0.01)[0]
print("Median Error on test data = %g" % median_error)

Root Mean Squared Error (RMSE) on test data = 33.1996
R-squared (R²) on test data = 0.0685226
Median Error on test data = 13.8065


In [101]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GBTRegressor

# Candidate model: gradient-boosted trees on the same scaled feature vector.
gbt = GBTRegressor(featuresCol="scaledFeatures", labelCol="label")

# Search grid: 2 tree depths x 2 boosting-iteration counts = 4 candidates.
parameters = {
    'maxDepth': [5, 10],
    'maxIter': [5, 10]
}

paramGrid = (ParamGridBuilder()
             .addGrid(gbt.maxDepth, parameters['maxDepth'])
             .addGrid(gbt.maxIter, parameters['maxIter'])
             .build())

# 3-fold cross-validation scored by RMSE; the seed is set explicitly so the
# fold assignment can be reproduced after a kernel restart.
crossval = CrossValidator(estimator=gbt,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse"),
                          numFolds=3,
                          seed=10)

cvModel = crossval.fit(train_data)
bestModel = cvModel.bestModel

# Read the winning hyper-parameters through the model's public getters
# instead of the private `_java_obj` handle.
# stepSize is not part of the grid, so this reports the regressor's own setting.
bestMaxDepth = bestModel.getMaxDepth()
bestMaxIter = bestModel.getMaxIter()
bestStepSize = bestModel.getStepSize()

print(f"Best maxDepth: {bestMaxDepth}")
print(f"Best maxIter: {bestMaxIter}")
print(f"Best stepSize: {bestStepSize}")

# Column-level abs(); the Python builtin abs() is not a Spark SQL function.
from pyspark.sql import functions as F

# Score the held-out test split with the fitted cross-validation model.
predictions = cvModel.transform(test_data)

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(predictions)
print("R-squared (R²) on test data = %g" % r2)

# Absolute prediction error per row.
predictions = predictions.withColumn("difference", F.abs(predictions["prediction"] - predictions["label"]))

# The row count does not depend on the threshold, so compute it once.
total_count = predictions.count()

# For each tolerance, report the share of rows whose absolute error does NOT
# exceed it (100% minus the share that does).
for threshold in (15, 30):
    filtered_predictions = predictions.filter(predictions["difference"] > threshold)
    count_filtered = filtered_predictions.count()
    if total_count:
        percentage = 100 - (count_filtered / total_count) * 100
    else:
        # Empty test split: the share is undefined, avoid ZeroDivisionError.
        percentage = float("nan")
    print(f"Values with difference of at most {threshold}: {percentage:.2f}%")

Best maxDepth: 10
Best maxIter: 10
Best stepSize: 0.1
Root Mean Squared Error (RMSE) on test data = 31.9557
R-squared (R²) on test data = 0.137018
