In [None]:
from pyspark.sql import Row
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.sql import functions as F 
from pyspark.sql.functions import explode, col, udf, mean as _mean, stddev as _stddev, log, log10
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.functions import lit


from pyspark.sql.functions import udf, expr, concat, col, count, when, isnan

# Build the SQL entry points on top of the SparkContext `sc`.
# NOTE(review): `sc` is not created in the visible cells; it is presumably
# injected by the notebook runtime — confirm before running elsewhere.
spark = SparkSession(sparkContext=sc)
sqlc = SQLContext(sparkContext=sc)

In [None]:
# Load the three input tables from CSV. The first row of each file supplies
# the column names (header=True) and column types are inferred from the data
# (inferSchema=True).
train_features = (
    sqlc.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/train_features.csv')
)
train_targets_scored = (
    sqlc.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/train_targets_scored.csv')
)

test_features = (
    sqlc.read
    .options(header=True, inferSchema=True)
    .csv('/FileStore/tables/test_features.csv')
)

In [None]:
# Define error function 
def error(prediction, target=None, eps=1e-15):
  """Return the mean binary cross-entropy (log loss) of `prediction`.

  Args:
    prediction: Spark DataFrame with a numeric 0/1 `label` column and a
      `probability` vector column. Element 1 of the vector is used as
      P(label = 1) and element 0 as P(label = 0).
    target: Unused. Kept (now optional) so that both two-argument and
      one-argument call sites keep working.
    eps: Probabilities are clipped to [eps, 1 - eps] before taking the log.

  Returns:
    The average per-row loss as a Python float, or None when `prediction`
    has no rows (the mean over an empty DataFrame is null).
  """

  def clipped_log(index):
    # DoubleType keeps the full precision of the probability; the previous
    # FloatType truncated it to float32 before the log was taken.
    pick = udf(lambda vec: float(vec[index]), DoubleType())
    prob = pick(F.col('probability'))
    # Clip away exact 0/1 so the log stays finite; otherwise such rows would
    # yield a null/non-finite loss and distort (or vanish from) the mean.
    return F.log(F.least(F.greatest(prob, F.lit(eps)), F.lit(1.0 - eps)))

  label = F.col('label')
  row_loss = -label * clipped_log(1) - (1. - label) * clipped_log(0)

  # Alias the aggregate explicitly instead of relying on the auto-generated
  # 'avg(loss)' column name.
  loss = prediction.agg(F.mean(row_loss).alias('loss')).collect()[0]['loss']

  return loss

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

In [None]:
# LogisticRegression
def LR (train_features, train_targets_scored, test_features):
    """Train and evaluate one binary logistic regression per target column.

    Every column of `train_targets_scored` after the first one is treated as
    a separate 0/1 target. For each target the features are joined to the
    target on `sig_id`, split roughly 70/30 (stratified on the target) into a
    training and a validation part, encoded, and fitted with
    `LogisticRegression(maxIter=10)`. Running mean log losses for both parts
    are printed after every target.

    Args:
        train_features: Spark DataFrame with a `sig_id` key, the categorical
            columns `cp_type`, `cp_time`, `cp_dose`, and numeric feature
            columns from position 4 onward.
        train_targets_scored: Spark DataFrame whose first column is the id
            (joined as `sig_id`) and whose remaining columns are targets.
        test_features: Unused by this function; kept for interface
            compatibility.

    Returns:
        A list with one validation-set prediction DataFrame per target, in
        target-column order.
    """
    # --- loop-invariant feature pipeline definition -------------------------
    categoricalColumns = ['cp_type', 'cp_time', 'cp_dose']
    # NOTE(review): assumes the first four columns of train_features are
    # sig_id plus the three categorical columns — confirm against the data.
    numericCols = train_features.columns[4:]

    stages = []
    # Index and one-hot encode each categorical feature.
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol + 'Index')
        encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
        stages += [stringIndexer, encoder]
    # Merge the encoded categorical features with all numeric features.
    assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
    stages += [VectorAssembler(inputCols=assemblerInputs, outputCol="features")]
    pipeline = Pipeline(stages=stages)

    id_column = train_targets_scored.columns[0]
    # All columns after the id are targets. The previous
    # range(1, len(columns) - 1) silently skipped the last target.
    target_names = train_targets_scored.columns[1:]

    tot_loss = 0
    tot_loss_test = 0
    predicted = []
    for iterator, targetname in enumerate(target_names, start=1):
        # Attach the current target to the features by id.
        targetdf = train_targets_scored.select(id_column, targetname)
        df_all = train_features.join(targetdf, "sig_id", "inner").drop("sig_id")

        # Split data into train/validation sets, stratified on the target.
        df_all.groupBy(targetname).count().show()
        df = df_all.sampleBy(targetname, fractions={0: 0.7, 1: 0.7}, seed=99)
        # NOTE(review): subtract() has EXCEPT DISTINCT semantics, so exact
        # duplicate rows are collapsed and all columns are compared.
        df_test = df_all.subtract(df)
        print('done splitting')

        # Original column headers, kept alongside label/features in the output.
        cols = df.columns
        selectedCols = ['label', 'features'] + cols
        # The targets are already 0/1 (see the sampleBy fractions), so cast
        # them directly. A frequency-ordered StringIndexer could flip the
        # encoding and cannot transform a label value it was not fitted on.
        label_column = F.col(targetname).cast('double')

        # Fit the encoders on the training split only ...
        pipelineModel = pipeline.fit(df)
        df = pipelineModel.transform(df).withColumn('label', label_column).select(selectedCols)
        print("train input ready")

        # ... and reuse the SAME fitted model for the validation split.
        # Re-fitting here could assign different category indices than the
        # ones the classifier was trained with.
        df_test = pipelineModel.transform(df_test).withColumn('label', label_column).select(selectedCols)
        print("test input ready")

        # Train with the default settings except maxIter=10 to speed things up.
        # Effective hyperparameters:
        #   elasticNetParam = 0
        #   maxIter = 10
        #   regParam = 0
        #   standardization = True
        #   tol = 1e-6
        #   weightCol unset (every training row has weight 1.0)
        lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=10)
        lrModel = lr.fit(df)

        # Make predictions and evaluate on both splits.
        predictions = lrModel.transform(df)
        predictions_test = lrModel.transform(df_test)
        predicted.append(predictions_test)

        tot_loss += error(predictions, df)
        # The second argument was missing here, which raised a TypeError.
        tot_loss_test += error(predictions_test, df_test)

        print("target #", iterator, targetname)
        print("Current mean total loss:", tot_loss/iterator)
        print("Current mean total loss (test):", tot_loss_test/iterator)

    # The former trailing averages divided by the full column count (id
    # column included) and were never used, so they have been removed; the
    # last running mean printed above is the overall mean loss.
    return predicted