In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/LDS9_K265_TranHoangBach_Cuoi_ky


In [3]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
import pandas as pd

%matplotlib inline

In [4]:
spark = SparkSession \
            .builder \
            .master("local[*]")\
            .appName("New-Spark") \
            .config("spark.memory.fraction", 0.8) \
            .config("spark.executor.memory", "10g") \
            .config("spark.driver.memory", "10g")\
            .config("spark.sql.shuffle.partitions" , "800") \
            .config("spark.memory.offHeap.enabled",'true')\
            .config("spark.memory.offHeap.size","10g")\
            .getOrCreate()
spark

Đọc data bằng pandas

In [5]:
data_1 = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx', sheet_name='Sheet1')
data_2 = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx', sheet_name='Sheet2')
data_3 = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx', sheet_name='Sheet3')
data_4 = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx', sheet_name='Sheet4')
data_5 = pd.read_excel('data/CCPP/Folds5x2_pp.xlsx', sheet_name='Sheet5')
data_1

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,16.65,49.69,1014.01,91.00,460.03
9564,13.19,39.18,1023.67,66.78,469.62
9565,31.32,74.33,1012.92,36.48,429.57
9566,24.48,69.45,1013.86,62.39,435.74


In [6]:
data = pd.concat([data_1, data_2, data_3, data_4, data_5])
data

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.40,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.50,1009.23,96.62,473.90
...,...,...,...,...,...
9563,15.12,48.92,1011.80,72.93,462.59
9564,33.41,77.95,1010.30,59.72,432.90
9565,15.99,43.34,1014.20,78.66,465.96
9566,17.65,59.87,1018.58,94.65,450.93


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47840 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      47840 non-null  float64
 1   V       47840 non-null  float64
 2   AP      47840 non-null  float64
 3   RH      47840 non-null  float64
 4   PE      47840 non-null  float64
dtypes: float64(5)
memory usage: 2.2 MB


Thông qua data.info và dtype ta tạo schema cho pyspark dataframe

In [8]:
schema = StructType([StructField("AT", DoubleType(), False), 
                     StructField("V", DoubleType(), False),
                     StructField("AP", DoubleType(), False),
                     StructField("RH", DoubleType(), False),
                     StructField("label", DoubleType(), False),
])
df = spark.createDataFrame(data, schema=schema)
df.show(5)

+-----+-----+-------+-----+------+
|   AT|    V|     AP|   RH| label|
+-----+-----+-------+-----+------+
|14.96|41.76|1024.07|73.17|463.26|
|25.18|62.96|1020.04|59.08|444.37|
| 5.11| 39.4|1012.16|92.14|488.56|
|20.86|57.32|1010.24|76.64|446.48|
|10.82| 37.5|1009.23|96.62| 473.9|
+-----+-----+-------+-----+------+
only showing top 5 rows



Nhận xét:
- Bài toán đặt ra thuộc nhóm Regression.
- Các biến input đều là biến continuous.
- Dữ liệu không có null, đã được pre-processing khá hoàn hảo.
- Có thể process thêm scaler cho input.

Pre-processing model, vector assembler

In [9]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.pipeline import Pipeline

vector_assembler = VectorAssembler(inputCols=['AT', 'V', 'AP', 'RH'], outputCol='non_scale_features')

scaler = StandardScaler(inputCol="non_scale_features", outputCol="features")

pre_process_pipeline = Pipeline(stages=[vector_assembler, scaler])
pre_process_pipeline_fit = pre_process_pipeline.fit(df)

final_df = pre_process_pipeline_fit.transform(df)
final_df.show(5)

+-----+-----+-------+-----+------+--------------------+--------------------+
|   AT|    V|     AP|   RH| label|  non_scale_features|            features|
+-----+-----+-------+-----+------+--------------------+--------------------+
|14.96|41.76|1024.07|73.17|463.26|[14.96,41.76,1024...|[2.00747121249814...|
|25.18|62.96|1020.04|59.08|444.37|[25.18,62.96,1020...|[3.37888536969941...|
| 5.11| 39.4|1012.16|92.14|488.56|[5.11,39.4,1012.1...|[0.68570707860063...|
|20.86|57.32|1010.24|76.64|446.48|[20.86,57.32,1010...|[2.79918780031492...|
|10.82| 37.5|1009.23|96.62| 473.9|[10.82,37.5,1009....|[1.45192770850467...|
+-----+-----+-------+-----+------+--------------------+--------------------+
only showing top 5 rows



In [14]:
final_df.count()

47840

In [12]:
train_df, test_df = final_df.randomSplit([0.8, 0.2], seed=42)

Vì số lượng mẫu không nhiều nên ta có thể build nhiều models, xem kết quả

In [13]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

list_model = [('Linear Regression', LinearRegression()),
              ('Generalized Linear Regression', GeneralizedLinearRegression(family='gaussian')), 
              ('Random Forest', RandomForestRegressor(seed=1)), 
              ('Gradient Boosting', GBTRegressor(seed=1)),
]

for model_name, model in list_model:
    trained_model = model.fit(train_df)
    predictions = trained_model.transform(test_df)
    
    print('-'*30)
    print("\033[1m" + model_name + "\033[0m")
    print('')
    print('  RMSE: {:.4f}'.format(RegressionEvaluator(metricName='rmse').evaluate(predictions)))
    print('  MSE : {:.4f}'.format(RegressionEvaluator(metricName='mse').evaluate(predictions)))
    print('  R2  : {:.4f}'.format(RegressionEvaluator(metricName='r2').evaluate(predictions)))

------------------------------
[1mLinear Regression[0m

  RMSE: 4.5999
  MSE : 21.1593
  R2  : 0.9264
------------------------------
[1mGeneralized Linear Regression[0m

  RMSE: 4.5999
  MSE : 21.1593
  R2  : 0.9264
------------------------------
[1mRandom Forest[0m

  RMSE: 4.1687
  MSE : 17.3779
  R2  : 0.9395
------------------------------
[1mGradient Boosting[0m

  RMSE: 3.8416
  MSE : 14.7576
  R2  : 0.9487


- Model tốt nhất là Gradient Boosting với R2 đạt đến 94.87%, RMSE đạt 3.8
- Các model đều cho kết quả Regression là khá tốt, R2 đạt 92-95%, RMSE 3.8 - 4.6
- Các model là khá tốt và đều có thể sử dụng cho việc dự đoán trong tương lai