In [1]:
from pyspark.sql import SparkSession

# Memory setting shared by the executor and driver configuration below.
MAX_MEMORY = "5g"

# Build (or reuse) the SparkSession used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediciton")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/14 04:03:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Root directory holding the train/ and test/ parquet datasets.
data_dir = "/home/ubuntu/working/spark-examples/data/ml-data"

train_path = f"{data_dir}/train/"
test_path = f"{data_dir}/test/"

# Load both splits as Spark DataFrames.
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

                                                                                

In [3]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# Columns that will be one-hot encoded.
cat_features = ["pickup_location_id", "dropoff_location_id", "day_of_week"]

# Ordered list of preprocessing stages; later cells append to it.
stages = []

for col_name in cat_features:
    idx_col = col_name + "_idx"

    # 1. Index the categorical values. handleInvalid controls how values such
    #    as nulls are treated; "keep" retains those rows instead of failing.
    indexer = StringIndexer(inputCol=col_name, outputCol=idx_col, handleInvalid="keep")

    # 2. One-hot encode the indexed column.
    encoder = OneHotEncoder(inputCols=[idx_col], outputCols=[col_name + "_onehot"])

    stages.extend([indexer, encoder])

stages

[StringIndexer_ed0f396fb699,
 OneHotEncoder_e2fb91c14a06,
 StringIndexer_b2ae5c1fb936,
 OneHotEncoder_b84305047842,
 StringIndexer_8a90891a4bf7,
 OneHotEncoder_f1990d4b5772]

In [4]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns that will be standard-scaled.
num_features = ["passenger_count", "trip_distance", "pickup_time"]

for feature in num_features:
    vector_col = feature + "_vector"

    # Wrap each scalar column in a one-element vector, e.g. 1.5 -> [1.5],
    # so it can be fed to the scaler below.
    to_vector = VectorAssembler(inputCols=[feature], outputCol=vector_col)

    # Apply standard scaling to the vectorized column.
    scaler = StandardScaler(inputCol=vector_col, outputCol=feature + "_scaled")

    stages.extend([to_vector, scaler])

stages

[StringIndexer_ed0f396fb699,
 OneHotEncoder_e2fb91c14a06,
 StringIndexer_b2ae5c1fb936,
 OneHotEncoder_b84305047842,
 StringIndexer_8a90891a4bf7,
 OneHotEncoder_f1990d4b5772,
 VectorAssembler_ee4490e19a21,
 StandardScaler_0e20464a069c,
 VectorAssembler_bed5b2027e56,
 StandardScaler_6ceb27b76a8f,
 VectorAssembler_5b8991ab86e7,
 StandardScaler_3eb6335109e2]

In [5]:
# Only the "*_onehot" and "*_scaled" columns are needed for the final feature vector.
onehot_cols = [f"{c}_onehot" for c in cat_features]
scaled_cols = [f"{n}_scaled" for n in num_features]
assembler_inputs = onehot_cols + scaled_cols

# Combine every preprocessed column into a single "feature_vector" column.
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages.append(assembler)

stages

[StringIndexer_ed0f396fb699,
 OneHotEncoder_e2fb91c14a06,
 StringIndexer_b2ae5c1fb936,
 OneHotEncoder_b84305047842,
 StringIndexer_8a90891a4bf7,
 OneHotEncoder_f1990d4b5772,
 VectorAssembler_ee4490e19a21,
 StandardScaler_0e20464a069c,
 VectorAssembler_bed5b2027e56,
 StandardScaler_6ceb27b76a8f,
 VectorAssembler_5b8991ab86e7,
 StandardScaler_3eb6335109e2,
 VectorAssembler_28a5d8327619]

# 하이퍼 파라미터 튜닝

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression # Ridge, Lasso가 없고, ElasticNet을 포함

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

In [8]:
# Base regressor for tuning. elasticNetParam and regParam are not set here;
# they are varied through the parameter grid.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=30,
)

# Preprocessing stages plus the regressor, as a new list so that `stages`
# itself stays preprocessing-only.
cv_stages = [*stages, lr]

In [9]:
# Build the pipeline used for cross-validation: every preprocessing stage
# followed by the LinearRegression estimator (see cv_stages).
cv_pipeline = Pipeline(stages = cv_stages)

 ## GridSearch 및 CrossValidation 설정

In [10]:
# Hyperparameter grid for the search: 5 x 5 = 25 combinations.
# The chain is wrapped in parentheses because a comment cannot follow a
# backslash line continuation (that form is a SyntaxError).
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])  # ElasticNet mixing ratio
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])  # regularization strength
    .build()
)
param_grid

[{Param(parent='LinearRegression_72c58de5e02a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_72c58de5e02a', name='regParam', doc='regularization parameter (>= 0).'): 0.01},
 {Param(parent='LinearRegression_72c58de5e02a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_72c58de5e02a', name='regParam', doc='regularization parameter (>= 0).'): 0.02},
 {Param(parent='LinearRegression_72c58de5e02a', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.1,
  Param(parent='LinearRegression_72c58de5e02a', name='regParam', doc='regularization parameter (>= 0).'): 0.03},
 {Param(paren

In [11]:
# Scores each candidate against the "total_amount" label.
evaluator = RegressionEvaluator(labelCol="total_amount")

# 5-fold cross-validation over the whole pipeline.
# - estimator: when a Pipeline is passed, its final stage should be the model being tuned.
# - estimatorParamMaps: the grid to search; without one, only plain cross-validation is run.
cross_val = CrossValidator(
    estimator=cv_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
)

cross_val

CrossValidator_877ec19dde50

# 훈련

In [12]:
# Tune on a 10% sample drawn without replacement; running the search on the
# full training set takes too long. The seed is fixed so the sample is repeatable.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)
toy_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [13]:
# Run the grid search with cross-validation on the sampled data (toy_df).
cv_model = cross_val.fit(toy_df)

23/06/14 04:22:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/06/14 04:22:42 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/06/14 04:22:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/06/14 04:22:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

## BestModel 찾기

In [14]:
# The best fitted pipeline found by the cross-validated search.
best_model = cv_model.bestModel

## Best Parameter 찾기

In [15]:
# The fitted regressor is the last stage of the best pipeline (same order as cv_stages).
best_lr_model = cv_model.bestModel.stages[-1]

# Read the winning hyperparameters through the public getters rather than
# reaching into the private `_java_obj` handle.
best_alpha = best_lr_model.getElasticNetParam()
best_reg_param = best_lr_model.getRegParam()

In [16]:
# Display the selected (elasticNetParam, regParam) pair.
# NOTE(review): a recorded regParam of 0.2 is not among the grid values
# (0.01-0.05) defined for the search, so the saved output may be stale - re-run to confirm.
best_alpha, best_reg_param

(0.2, 0.2)

# 전체 데이터를 대상으로 훈련

In [18]:
# Fit the preprocessing stages on the full training set. `stages` holds only
# the indexers, encoders, scalers and the final assembler; the regressor was
# added to the separate cv_stages list.
pipeline = Pipeline(stages=stages) # preprocessing-only pipeline (no model stage)
fitted_transformer = pipeline.fit(train_df)

                                                                                

In [19]:
# Apply the fitted preprocessing to the full training set; the result carries
# the "feature_vector" column the regressor reads.
vec_train_df = fitted_transformer.transform(train_df)

In [17]:
# Final regressor configured with the hyperparameters selected by the search.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=best_reg_param,
    elasticNetParam=best_alpha,
)

In [23]:
# Train the final model on the fully preprocessed training set.
# NOTE(review): the recorded "'NoneType' object has no attribute '_jvm'" error is
# consistent with the Spark session having been stopped first - the spark.stop()
# cell has execution count 21, this cell 23. Confirm by re-running top to bottom.
model = lr.fit(vec_train_df)

AttributeError: 'NoneType' object has no attribute '_jvm'

# 튜닝된 모델 저장 및 불러오기

In [22]:
# Directory the fitted model is written to.
model_dir = "/home/ubuntu/working/spark-examples/taxi_pricing_model1/"

# A plain model.save() raises when the target path already exists, which breaks
# re-running the notebook; the overwriting writer keeps this cell re-runnable.
model.write().overwrite().save(model_dir)

AttributeError: 'NoneType' object has no attribute '_jvm'

In [None]:
# Load the saved model.
# What was saved is the *fitted* model returned by lr.fit(), so it has to be
# loaded through LinearRegressionModel; the LinearRegression estimator class
# cannot read a fitted model's files.
from pyspark.ml.regression import LinearRegressionModel
loaded_model = LinearRegressionModel.load(model_dir)

In [21]:
# Stop the Spark session. Keep this as the very last cell to run: cells that
# need Spark should not be executed after it without recreating the session.
spark.stop()