# Data Preprocessing

## Creating a dataframe

### Loading dataset in Python

In [None]:
# Alternative loaders: each statement rebinds `df`, so keep only the one
# that matches the format of the input file.
df = spark.read.json('file_name.json')
# CSV: treat the first row as column names and let Spark infer column types.
df = spark.read.csv('file_name.csv', inferSchema=True, header=True)
df = spark.read.format('libsvm').load('file_name.txt') # PySpark data format

### Loading dataset in Databricks

In [None]:
# Alternative Databricks loaders: each statement rebinds `df`, so keep only
# the one that matches where the data lives.

# Query every column of the table/view named `file_name`.
df = spark.sql('SELECT * FROM file_name')

# CSV uploaded to the FileStore: first row is the header, types are inferred.
df = spark.read.csv(
    path='/FileStore/tables/file_name.csv',
    header=True,
    inferSchema=True,
)

# LIBSVM text file uploaded to the FileStore (PySpark data format).
df = spark.read.format('libsvm').load(path='/FileStore/tables/file_name.txt')

## One-hot-encoding

In [None]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Stage 1: turn the categorical column into a numeric index column.
x_col_name_indexer = StringIndexer(
    inputCol='x_col_name',
    outputCol='x_col_name_Index',
)

# Stage 2: one-hot encode the index column written by the indexer above.
x_col_name_encoder = OneHotEncoder(
    inputCol='x_col_name_Index',
    outputCol='x_col_name_Vec',
)

## Transforming dataframe into PySpark format

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Combine the listed input columns into a single vector column 'features'.
assembler = VectorAssembler(
    inputCols=['x_col_1', 'x_col_2'],
    outputCol='features',
)

# Keep only the assembled features and the target column.
final_df = (
    assembler
    .transform(df)
    .select('features', 'y_col')
)

## Splitting the dataset

In [None]:
# Random 70% / 30% split of `df` into training and test sets.
train_data, test_data = df.randomSplit(weights=[0.7, 0.3])

## Creating the pipeline

In [None]:
from pyspark.ml import Pipeline

# Stages run in order: index -> one-hot encode -> assemble -> estimator.
# `classifier` is not created in this cell; it must already be bound to an
# estimator before this cell is run.
pipeline = Pipeline(
    stages=[
        x_col_name_indexer,
        x_col_name_encoder,
        assembler,
        classifier,
    ]
)

# Fit every stage of the pipeline on the training split.
fitted_classifier = pipeline.fit(dataset=train_data)

# Machine Learning models

## Multiple Linear Regression

In [None]:
# No need for feature scaling

### Fitting

In [None]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator with the feature, label and output column names.
regressor = LinearRegression(
    featuresCol='x_col_name',
    labelCol='y_col_name',
    predictionCol='prediction',
)

# Train on the training split; the result is the fitted model.
fitted_regressor = regressor.fit(dataset=train_data)

### Predicting on labeled test set

In [None]:
# Evaluate the fitted model on the labeled test split; `preds` holds the
# evaluation result whose metrics are read in the statistics cell.
preds = fitted_regressor.evaluate(dataset=test_data)

### Retrieving model statistics

In [None]:
from pyspark.sql.functions import corr

# Metrics of the evaluation on the test split.
preds.rootMeanSquaredError
preds.r2

# Summary statistics of the full dataframe.
df.describe()

# Correlation between the target and a feature column.
target_feature_corr = corr('y_col_name', 'x_col_name')
df.select(target_feature_corr).show()

### Predicting on unlabeled test set

In [None]:
# Drop everything except the feature column to simulate unlabeled data.
unlabeled_data = test_data.select('x_col_name')

# Apply the fitted model to the unlabeled rows.
unlabelled_preds = fitted_regressor.transform(dataset=unlabeled_data)

## Logistic Regression

In [None]:
# No need for feature scaling

### Fitting

In [None]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator with the feature, label and output column names.
classifier = LogisticRegression(
    featuresCol='features',
    labelCol='y_col_name',
    predictionCol='prediction',
)

# Train on the training split; the result is the fitted model.
fitted_classifier = classifier.fit(dataset=train_data)

### Predicting on labeled test set

In [None]:
# Apply the fitted classifier to the labeled test split.
preds = fitted_classifier.transform(dataset=test_data)

### Retrieving model statistics

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the 'prediction' column of `preds` against the label column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='original_y_col_name')

# evaluate() returns the evaluator's metric (areaUnderROC by default).
area_under_curve = evaluator.evaluate(preds)

## Decision Tree Classifier

In [3]:
# No need for feature scaling

### Fitting

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier

# Configure the estimator with the feature, label and output column names.
classifier = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='y_col_name',
    predictionCol='prediction',
)

# Train on the training split; the result is the fitted model.
fitted_classifier = classifier.fit(dataset=train_data)

### Predicting on labeled test set

In [None]:
# Apply the fitted decision tree to the labeled test split.
preds = fitted_classifier.transform(dataset=test_data)

### Retrieving model statistics

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the 'prediction' column of `preds` against the label column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='original_y_col_name')

# evaluate() returns the evaluator's metric (areaUnderROC by default).
area_under_curve = evaluator.evaluate(preds)

# `accuracy` is the evaluator object; calling evaluate() yields the score.
accuracy = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='original_y_col_name')
accuracy.evaluate(preds)

# Per-feature importance estimated by the fitted tree model.
fitted_classifier.featureImportances

## Random Forest Classifier

In [None]:
# No need for feature scaling

### Fitting

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Configure the estimator with the feature, label and output column names.
classifier = RandomForestClassifier(
    featuresCol='features',
    labelCol='y_col_name',
    predictionCol='prediction',
)

# Train on the training split; the result is the fitted model.
fitted_classifier = classifier.fit(dataset=train_data)

### Predicting on labeled test set

In [None]:
# Apply the fitted random forest to the labeled test split.
preds = fitted_classifier.transform(dataset=test_data)

### Retrieving model statistics

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the 'prediction' column of `preds` against the label column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='original_y_col_name')

# evaluate() returns the evaluator's metric (areaUnderROC by default).
area_under_curve = evaluator.evaluate(preds)

# `accuracy` is the evaluator object; calling evaluate() yields the score.
accuracy = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='original_y_col_name')
accuracy.evaluate(preds)

# Per-feature importance estimated by the fitted forest model.
fitted_classifier.featureImportances

## Gradient-Boosted Trees (GBTs) Classifier

In [None]:
# No need for feature scaling

### Fitting

In [None]:
from pyspark.ml.classification import GBTClassifier

# Configure the estimator with the feature, label and output column names.
classifier = GBTClassifier(
    featuresCol='features',
    labelCol='y_col_name',
    predictionCol='prediction',
)

# Train on the training split; the result is the fitted model.
fitted_classifier = classifier.fit(dataset=train_data)

### Predicting on labeled test set

In [None]:
# Apply the fitted gradient-boosted trees model to the labeled test split.
preds = fitted_classifier.transform(dataset=test_data)

### Retrieving model statistics

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the 'prediction' column of `preds` against the label column.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='original_y_col_name')

# evaluate() returns the evaluator's metric (areaUnderROC by default).
area_under_curve = evaluator.evaluate(preds)

# `accuracy` is the evaluator object; calling evaluate() yields the score.
accuracy = MulticlassClassificationEvaluator(metricName='accuracy', labelCol='original_y_col_name')
accuracy.evaluate(preds)

# Per-feature importance estimated by the fitted boosted-trees model.
fitted_classifier.featureImportances

## Clustering

## KMeans 

In [None]:
# Needs feature scaling

### Fitting

In [None]:
from pyspark.ml.clustering import KMeans

# `number_of_clusters` is not set in this cell; bind it before running.
kmeans = KMeans(
    featuresCol='features',
    k=number_of_clusters,
)

# Clustering is unsupervised, so the model is fit on the whole dataframe.
model = kmeans.fit(dataset=df)

### Predicting the clusters

In [None]:
# Assign each row of `df` to a cluster; as the last expression of the cell,
# the resulting dataframe is what the notebook displays.
model.transform(dataset=df)

### Retrieving model statistics

In [None]:
# Within-set sum of squared errors. KMeansModel.computeCost() was deprecated
# in Spark 2.4 and removed in 3.0; the training summary exposes the same cost
# for the data the model was fit on (`df` here).
wssse = model.summary.trainingCost

# Coordinates of the fitted cluster centers.
centers = model.clusterCenters()