In [0]:
# List the contents of the DBFS folder that holds the input CSV.
dbutils.fs.ls("/FileStore/df/")


In [0]:
# Load the headerless CSV. Without inferSchema Spark reads every column as a
# string; inferring the schema yields proper numeric column types downstream.
df = spark.read.csv("/FileStore/df/df3.csv", header=False, inferSchema=True)

In [0]:
# Render the loaded Spark DataFrame as a table to eyeball the columns.
display(df)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10
0,64.0,0,1,0,72.0,0,0,13112.6,60,24.3
1,75.0,0,1,0,78.0,1,1,9567.0,49,22.6
1,64.0,1,2,1,88.0,1,1,32734.2,32,17.8
1,53.0,0,1,1,72.0,1,0,48517.6,61,36.4
1,50.0,0,0,0,82.0,1,0,1731.7,19,20.6
1,89.0,0,0,0,78.0,0,0,6474.0,42,37.9
0,59.0,0,0,0,64.0,0,0,1705.6,18,23.8
0,52.0,0,0,0,74.0,1,0,1534.3,21,26.8
0,55.0,0,0,0,70.0,1,0,13390.6,63,29.4
1,69.0,0,0,0,64.0,1,1,5910.9,40,29.6


In [0]:
# Collect the Spark DataFrame into a pandas DataFrame for use with scikit-learn.
df_pandas=df.toPandas()

In [0]:
# Preview the first five rows of the pandas copy.
df_pandas.head(5)

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10
0,0,64.0,0,1,0,72.0,0,0,13112.6,60,24.3
1,1,75.0,0,1,0,78.0,1,1,9567.0,49,22.6
2,1,64.0,1,2,1,88.0,1,1,32734.2,32,17.8
3,1,53.0,0,1,1,72.0,1,0,48517.6,61,36.4
4,1,50.0,0,0,0,82.0,1,0,1731.7,19,20.6


In [0]:
import os

import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split

In [0]:
def train_claim_insurance(data, target_col="_c8", random_state=40,
                          model_dir="/dbfs/mlflow/test_insurance"):
  """Fit a linear regression on `data` and record the result with MLflow.

  The frame is split into train and test sets, a LinearRegression is fitted
  to predict `target_col` from every other column, and RMSE / MAE / R2 on the
  test set are printed and logged to a new MLflow run together with the
  fitted model. An extra copy of the model is saved under `model_dir`.

  Args:
    data: pandas DataFrame containing the feature columns and `target_col`.
    target_col: name of the column to predict.
    random_state: seed for the train/test shuffle.
    model_dir: directory under which the extra model copy is written, as
      `<model_dir>/model-<rmse>-<r2>`.

  Raises:
    KeyError: if `target_col` is not a column of `data`.
  """
  if target_col not in data.columns:
    raise KeyError("target column %r not found in data" % (target_col,))

  def eval_metrics(actual, pred):
    """Return (rmse, mae, r2) for the given true and predicted values."""
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

  # train_test_split's default sizes give a (0.75, 0.25) split. Seeding the
  # split directly produces the same shuffle as calling
  # np.random.seed(random_state) beforehand, but leaves NumPy's global RNG
  # untouched for the rest of the notebook.
  train, test = train_test_split(data, random_state=random_state)

  # Features are every column except the target; the double brackets keep the
  # target as a one-column frame.
  train_x = train.drop([target_col], axis=1)
  test_x = test.drop([target_col], axis=1)
  train_y = train[[target_col]]
  test_y = test[[target_col]]

  # Start an MLflow run; the "with" block closes the run even if training fails.
  with mlflow.start_run():
    lr = LinearRegression(fit_intercept=True)
    lr.fit(train_x, train_y)

    predictions = lr.predict(test_x)
    rmse, mae, r2 = eval_metrics(test_y, predictions)

    # Print the linear regression metrics
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log metrics and the fitted model for the MLflow UI
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)
    mlflow.sklearn.log_model(lr, "model")

    # The path is derived from the metrics, so re-running on the same data
    # targets the same directory. save_model refuses to write to an existing
    # path, which would fail the whole run, so skip the save in that case.
    modelpath = "%s/model-%f-%f" % (model_dir, rmse, r2)
    if os.path.exists(modelpath):
      print("  Model already saved at %s; skipping save" % modelpath)
    else:
      mlflow.sklearn.save_model(lr, modelpath)
    

In [0]:
# Train on the pandas copy; metrics are printed and logged to an MLflow run.
train_claim_insurance(df_pandas)