# 用训练好的模型对test_format1进行预测

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors

In [2]:
# Assemble the Spark configuration one setting at a time (each setter updates
# `conf` in place), then obtain the SparkContext, reusing a running one if any.
conf = SparkConf()
conf.setAppName("miniProject")
conf.setMaster("local")
conf.set("spark.executor.memory", "3g")
conf.set("spark.executor.instances", "2")
sc = SparkContext.getOrCreate(conf)

In [3]:
# Load the test set.
session_builder = SparkSession.builder.master("local").appName("DataRead")
spark = session_builder.getOrCreate()

# Read the CSV (header row present, column types inferred) into a DataFrame,
# then expose it as an RDD of rows under the name used by the later cells.
test_df = spark.read.csv(
    r"hdfs://node1:9000/user/root/exp4/procd_test_real.csv",
    encoding='utf8',
    header=True,
    inferSchema=True,
)
test_data = test_df.rdd

In [4]:
# Convert the features of the test set into vectors.
def _row_to_record(row):
    """Keep fields 0-2 unchanged and pack fields 3 onward into a dense vector."""
    return (row[0], row[1], row[2], Vectors.dense(row[3:]))


test = test_data.map(_row_to_record)

## Logistic Regression

In [13]:
from pyspark.mllib.classification import LogisticRegressionModel

# Restore the previously saved logistic regression model from HDFS.
lr_model_path = "hdfs://node1:9000/user/root/exp4/models/LogisticRegressionModel"
lr_model = LogisticRegressionModel.load(sc, lr_model_path)

In [16]:
def _lr_predict_record(record):
    """Return (field 0, field 1, prediction as float) for one test record."""
    return (record[0], record[1], float(lr_model.predict(record[3])))


# Score every test record with the logistic regression model.
# NOTE(review): if the loaded model still has a decision threshold set, predict()
# presumably yields hard class labels rather than probabilities - confirm this
# matches what the scoring expects.
lr_predictions = test.map(_lr_predict_record)

# Collapse to a single partition and write the result as CSV with a header row.
# NOTE(review): toDF() is called without column names, so the header will carry
# Spark's auto-generated names - verify that is intended.
lr_output = lr_predictions.coalesce(1).toDF()
lr_output.write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

Logistic Regression 模型预测结果 — 日期: 2020-12-20 14:08:52,排名: 无
score: 0.5015744

## SVM

In [18]:
from pyspark.mllib.classification import SVMModel

# Restore the previously saved SVM (SGD-trained) model from HDFS.
svm_model_path = "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel"
svm_model = SVMModel.load(sc, svm_model_path)

In [19]:
def _svm_predict_record(record):
    """Return (field 0, field 1, prediction as float) for one test record."""
    return (record[0], record[1], float(svm_model.predict(record[3])))


# Score every test record with the SVM model.
# NOTE(review): if the loaded model still has a decision threshold set, predict()
# presumably yields hard class labels rather than raw scores - confirm this
# matches what the scoring expects.
svm_predictions = test.map(_svm_predict_record)

# Collapse to a single partition and write the result as CSV with a header row.
svm_output = svm_predictions.coalesce(1).toDF()
svm_output.write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv")

SVM 模型预测结果 — 日期: 2020-12-20 14:18:59,排名: 无
score: 0.5156678

## Gradient Boosted Trees

In [35]:
from pyspark.mllib.tree import GradientBoostedTreesModel

# Restore the previously saved gradient-boosted trees model from HDFS.
gbdt_model_path = "hdfs://node1:9000/user/root/exp4/models/myGradientBoostingClassificationModel"
GBDT_model = GradientBoostedTreesModel.load(sc, gbdt_model_path)

In [36]:
# Score the whole RDD of feature vectors in a single predict() call, then pair
# each prediction back up with the first two fields of its test record.
# The per-record variant (calling GBDT_model.predict inside test.map, as done for
# the linear models) is deliberately not used: the PySpark MLlib tree docs state
# that predict cannot be used within an RDD transformation or action.
gbdt_features = test.map(lambda rec: rec[3])
predictions = GBDT_model.predict(gbdt_features)

gbdt_keys = test.map(lambda rec: (rec[0], rec[1]))
GBDT_predictions = gbdt_keys.zip(predictions).map(
    lambda pair: (pair[0][0], pair[0][1], pair[1])
)

# Collapse to a single partition and write the result as CSV with a header row.
gbdt_output = GBDT_predictions.coalesce(1).toDF()
gbdt_output.write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/GBDT_predictions.csv")

Gradient Boosted Trees 模型预测结果 — 日期: 2020-12-20 14:51:00,排名: 无
score: 0.5000562

## SVM with Normalized data

In [5]:
from pyspark.mllib.classification import SVMModel

# Restore the saved SVM model stored under the "Normalized" model path on HDFS.
svm_model2_path = "hdfs://node1:9000/user/root/exp4/models/NormalizedSVMWithSGDModel"
svm_model2 = SVMModel.load(sc, svm_model2_path)

In [8]:
# Normalize the data.
from pyspark.mllib.feature import Normalizer

# Run the feature vectors through a default-constructed Normalizer.
normalizer = Normalizer()
features = test.map(lambda rec: rec[3])
normalized_features = normalizer.transform(features)

# Re-attach the first two fields of each record to its normalized vector,
# giving records of the form (field 0, field 1, normalized vector).
record_keys = test.map(lambda rec: (rec[0], rec[1]))
normalized_test = record_keys.zip(normalized_features).map(
    lambda pair: (pair[0][0], pair[0][1], pair[1])
)


In [10]:
# Peek at the first ten normalized records as a sanity check.
preview_rows = normalized_test.take(10)
print(preview_rows)

[(1461, 2775, DenseVector([0.6124, 0.2041, 0.6124, 0.2041, 0.2041, 0.2041, 0.2041, 0.0, 0.2041, 0.0])), (1783, 4864, DenseVector([0.2298, 0.0, 0.8043, 0.517, 0.1149, 0.1149, 0.0574, 0.0, 0.0574, 0.0])), (1969, 4186, DenseVector([0.4682, 0.117, 0.7022, 0.4682, 0.117, 0.117, 0.117, 0.0, 0.117, 0.0])), (2044, 4491, DenseVector([0.0, 0.0, 0.937, 0.2499, 0.0625, 0.1562, 0.1249, 0.0, 0.1249, 0.0])), (2163, 2995, DenseVector([0.0889, 0.0, 0.9181, 0.2962, 0.1777, 0.1185, 0.0889, 0.0, 0.0889, 0.0])), (2194, 2459, DenseVector([0.1627, 0.0542, 0.8677, 0.3254, 0.2169, 0.2169, 0.1085, 0.0, 0.0542, 0.0542])), (2406, 4775, DenseVector([0.7171, 0.1195, 0.5976, 0.1195, 0.1195, 0.239, 0.1195, 0.0, 0.1195, 0.0])), (2494, 3236, DenseVector([0.8528, 0.0, 0.2132, 0.2132, 0.2132, 0.2132, 0.2132, 0.0, 0.2132, 0.0])), (2756, 2669, DenseVector([0.0, 0.1091, 0.6547, 0.5455, 0.4364, 0.2182, 0.1091, 0.0, 0.1091, 0.0])), (3466, 1892, DenseVector([0.5721, 0.0, 0.3814, 0.3814, 0.286, 0.0953, 0.3814, 0.0, 0.3814, 0.0]))]

In [9]:
def _svm2_predict_record(record):
    """Return (field 0, field 1, prediction as float) for one normalized record."""
    # In normalized_test the feature vector sits at index 2, not 3.
    return (record[0], record[1], float(svm_model2.predict(record[2])))


# Score every normalized test record with the second SVM model.
svm_predictions2 = normalized_test.map(_svm2_predict_record)

# Collapse to a single partition and write the result as CSV with a header row.
svm2_output = svm_predictions2.coalesce(1).toDF()
svm2_output.write.options(header="true").csv("hdfs://node1:9000/user/root/exp4/predictions/svm_predictions2.csv")

In [12]:

# Release Spark resources: stop the session first, then the context.
for spark_handle in (spark, sc):
    spark_handle.stop()

SVM(规范化数据)模型预测结果 — 日期: 2020-12-20 15:06:01,排名: 无
score: 0.5000000