In [1]:
import pyspark
import os

myConf=pyspark.SparkConf()
spark = pyspark.sql.SparkSession.builder.master('local').appName('myApp').config(conf=myConf).getOrCreate()

In [2]:
marks = ['김하나, Engilsh, 100',
        '김하나, Math, 80',
        '임하나, English, 70',
        '임하나, Math, 100',
        '김갑돌, English, 82.3',
        '김갑돌, Math, 98.5']

# 문제 1-1 성적데이터로 DataFrame을 생성

In [3]:
myrdd = spark.sparkContext.parallelize(marks).map(lambda x : x.split(','))

In [4]:
mydf = spark.createDataFrame(myrdd, schema=['name','subject','score'])

In [5]:
mydf.show()

+------+--------+-----+
|  name| subject|score|
+------+--------+-----+
|김하나| Engilsh|  100|
|김하나|    Math|   80|
|임하나| English|   70|
|임하나|    Math|  100|
|김갑돌| English| 82.3|
|김갑돌|    Math| 98.5|
+------+--------+-----+



# 문제 1-2 zscore 컬럼을 생성 

In [6]:
import numpy as np

X = mydf.select('score').rdd.map(lambda x: float(x[0])).collect()
xbar = float(np.mean(X))
sx = float(np.std(X,ddof=1))

In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

get_zscore = udf(lambda x : (float(x)-xbar)/sx,FloatType())

In [8]:
mydf = mydf.withColumn('zscore',get_zscore(mydf['score']))

In [9]:
mydf.show()

+------+--------+-----+-----------+
|  name| subject|score|     zscore|
+------+--------+-----+-----------+
|김하나| Engilsh|  100| 0.90201485|
|김하나|    Math|   80|-0.66217273|
|임하나| English|   70| -1.4442666|
|임하나|    Math|  100| 0.90201485|
|김갑돌| English| 82.3|-0.48229116|
|김갑돌|    Math| 98.5| 0.78470075|
+------+--------+-----+-----------+



# 1-3 cdf 컬럼을 생성

In [10]:
from scipy.stats import norm

get_cdf = udf(lambda x : float(norm.cdf(x)),FloatType())
mydf = mydf.withColumn('cdf',get_cdf(mydf['zscore']))

In [11]:
mydf.show()

+------+--------+-----+-----------+-----------+
|  name| subject|score|     zscore|        cdf|
+------+--------+-----+-----------+-----------+
|김하나| Engilsh|  100| 0.90201485|  0.8164755|
|김하나|    Math|   80|-0.66217273| 0.25393027|
|임하나| English|   70| -1.4442666|0.074332014|
|임하나|    Math|  100| 0.90201485|  0.8164755|
|김갑돌| English| 82.3|-0.48229116| 0.31479958|
|김갑돌|    Math| 98.5| 0.78470075|  0.7836855|
+------+--------+-----+-----------+-----------+

