In [1]:
from pyspark.sql import SparkSession


import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.\
        builder.\
        appName("best_one").\
        master("spark://spark-master:7077").\
        config("spark.executor.memory", "512m").\
        getOrCreate()

In [2]:
data_df = (spark.read
                  .format('com.databricks.spark.csv')
                  .option("header", "true") 
                  .option("inferSchema", "true") 
                  .load("skewdata.csv")
                  )
data_df.show(5)

+-----------+
|     values|
+-----------+
|81.37291811|
|25.70097086|
|4.942646012|
|43.02085256|
|81.69058902|
+-----------+
only showing top 5 rows



In [7]:
# не забывайте, если нет понимания, как сделать на spark - сделайте на sql
#data_df.createOrReplaceTempView('tmp')

# confidence interval

In [8]:
## ci функция
def getConfidenceInterval(inputDataFrame,num_of_samples, left_quantile_fraction, right_quantile_fraction):
    #симуляция построения сэмплов
    sample_means = np.empty([num_of_samples])
    
    for n in range(0,num_of_samples):
        # набор агрегатов - результатов
        sample_means[n] = (inputDataFrame.sample(withReplacement = True, fraction=1.0)
                   .selectExpr("avg(values) as mean")
                   .collect()[0]
                   .asDict()
                   .get('mean'))
            
    ## сортировка
    sample_means.sort()
    
    ## 
    sampleMeans_local_df = pd.DataFrame(sample_means)
    
    ## из пандас в spark
    fields = [StructField("mean_values", DoubleType(), True)]
    schema = StructType(fields)
    sampleMeans_df = spark.createDataFrame(sampleMeans_local_df, schema)
    
    ## расчет квартилей ( 25 - 75 персентилей)
    sampleMeans_df.createOrReplaceTempView('Guru_SampleMeansTable')
    quantiles_df = spark.sql("select percentile(cast(mean_values as bigint),"
                                  "array("+str(left_quantile_fraction)+","+str(right_quantile_fraction)+")) as "
                                  "percentiles from Guru_SampleMeansTable")
    return quantiles_df

In [9]:
## 95 % доверительынй интервал для данных
quantiles_df = getConfidenceInterval(data_df, 1000, 0.025, 0.975)

In [10]:
quantiles_df.show()

+------------+
| percentiles|
+------------+
|[25.0, 38.0]|
+------------+



# Clustered Samples

In [2]:
data_df = (spark.read
           .format('com.databricks.spark.csv')
           .option("header", "true") 
           .option("inferSchema", "true") 
           .load("skewdata-policy-new.csv")
           )

In [3]:
policyids = data_df.select('policyid').distinct()
policyids.show(5)

+--------+
|policyid|
+--------+
|       1|
|       6|
|       3|
|       5|
|       9|
+--------+
only showing top 5 rows



In [4]:
## разделение сэмплов на фракции и их маркирование
def clusteredSamples(data,policies,policyid_sample_fraction,num_of_samples):
    
    #пустой лист для нумерования
    samples = []
    
    for n in range(0,num_of_samples):
        
        #создание сэмпла 1
        policyids_sample = policies.sample(withReplacement=False, fraction=policyid_sample_fraction)
        sample = policyids_sample.join(data,on='policyid',how='inner')
        samples.append(sample)
    
    # список сэмплов
    return samples

In [5]:
sampleList = clusteredSamples(data_df,policyids,0.8,20)
sampleList

[DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: double],
 DataFrame[policyid: int, age: int, values: do

In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

def runLinearRegression(samples):
    # обучим регрессию на сэмплах
    samples_coefficients = []
    
    #vector Assembler
    feature_columns = ['age']
    vectorAssembler = VectorAssembler(inputCols = feature_columns, outputCol = 'features_vector')
    
    #linear regresson model
    lr = LinearRegression(featuresCol ='features_vector', 
                          labelCol = 'values',
                          predictionCol = 'predicted_values',
                          maxIter=5, 
                          elasticNetParam = 0.5,
                          solver="l-bfgs")
    for i in range(0,len(samples)):
        sample_df = samples[i]
        sample_df1 = vectorAssembler.transform(sample_df)
        
        #Fit 
        sample_lr = lr.fit(sample_df1)
        
        #Save
        samples_coefficients.append(sample_lr.coefficients)
    
    return samples_coefficients

sampleCoefficients = runLinearRegression(sampleList)
sampleCoefficients

[DenseVector([-3.4693]),
 DenseVector([-2.3257]),
 DenseVector([-2.6926]),
 DenseVector([-1.25]),
 DenseVector([-2.1493]),
 DenseVector([-2.7645]),
 DenseVector([-1.6272]),
 DenseVector([-2.1566]),
 DenseVector([-2.3443]),
 DenseVector([-2.5109]),
 DenseVector([-2.7963]),
 DenseVector([-2.5325]),
 DenseVector([-1.886]),
 DenseVector([-1.2457]),
 DenseVector([-3.2977]),
 DenseVector([-0.4748]),
 DenseVector([-2.6444]),
 DenseVector([-2.6921]),
 DenseVector([-2.3257]),
 DenseVector([-2.0651])]