### 1. Spark ML - Linear Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Create (or reuse) the notebook's Spark session: local master, app name "Colab".
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .getOrCreate()
)

In [None]:
# Load the California housing training CSV. The first row is treated as the
# header and column types are inferred from the data.
df = spark.read.csv(
    '/content/sample_data/california_housing_train.csv',
    header=True,
    inferSchema="true",
)

# Report the dataset shape as (number of rows, number of columns).
print('Shape of the dataset: ', (df.count(), len(df.columns)))

# Preview the first 10 rows.
df.show(n=10)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the eight predictor columns into one vector column
# called 'features'.
featureassembler = VectorAssembler(
    inputCols=[
        'longitude',
        'latitude',
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
    ],
    outputCol='features',
)

In [None]:
# Run the assembler over the housing data, adding the 'features' column,
# then preview the result.
output = featureassembler.transform(dataset=df)
output.show()

In [None]:
# Keep only the assembled feature vector and the regression target column.
final_data = output.select(["features", "median_house_value"])
final_data.show()

In [None]:
# Random 75% / 25% train/test split, using a fixed seed for the sampling.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Cell output: (rows, columns) of the training split, then of the test split.
tuple((part.count(), len(part.columns)) for part in (train_data, test_data))

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression estimator that reads predictors from 'features' and the
# target from 'median_house_value'. It is not fitted yet.
result = LinearRegression(
    featuresCol='features',
    labelCol='median_house_value',
)

In [None]:
# Fit the linear regression on the training split.
result_fit = result.fit(dataset=train_data)

# Cell output: the fitted coefficients.
result_fit.coefficients

In [None]:
# Evaluate the fitted model on the training split and preview the rows of
# the resulting predictions DataFrame.
pred_train = result_fit.evaluate(dataset=train_data)
pred_train.predictions.show()

In [None]:
# Evaluate the fitted model on the held-out test split and preview the rows
# of the resulting predictions DataFrame.
pred = result_fit.evaluate(dataset=test_data)
pred.predictions.show()

In [None]:
# Report R2 for both splits, taken from the two evaluation results computed
# on the training and test data.
print(
    'R2-Score for train set:',
    pred_train.r2,
)
print(
    'R2-Score for test set:',
    pred.r2,
)

### 3. Spark ML - Random Forest

In [None]:
# Load the iris CSV with its header row. No schema inference is requested
# here; the measurement columns are cast explicitly in a later cell.
df = spark.read.csv(
    '/content/iris.csv',
    header=True,
)
df.show()

In [None]:
# Show the column names and types of the iris data as loaded.
df.printSchema()

In [None]:
# Local imports so this cell does not depend on the `T` / `F` aliases having
# been imported by some other cell before it runs.
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Target Spark type for each measurement column. The CSV was read without
# schema inference, so these columns need an explicit cast to be numeric.
_dtype = {
    'sepal_length': T.DoubleType(),
    'sepal_width': T.DoubleType(),
    'petal_length': T.DoubleType(),
    'petal_width': T.DoubleType(),
}

# Cast exactly the columns listed in `_dtype`. Driving the loop from the
# mapping (rather than from the DataFrame's column order) avoids a KeyError
# if the file has extra columns or the label column is not last.
for _col in _dtype:
    df = df.withColumn(_col, F.col(_col).cast(_dtype[_col]))

# Confirm the new column types.
df.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler

# The four measurement columns used as model inputs.
numericCols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

# Pack the measurement columns into a single 'features' vector column and
# preview the augmented DataFrame.
assembler = VectorAssembler(
    inputCols=numericCols,
    outputCol="features",
)
df = assembler.transform(dataset=df)
df.show()

In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the string 'species' column as a numeric 'labelIndex' column: the
# indexer is fitted on the data and then applied to the same data.
label_stringIdx = StringIndexer(
    inputCol='species',
    outputCol='labelIndex',
)
df = (
    label_stringIdx
    .fit(df)
    .transform(df)
)
df.show()

In [None]:
# Show one row per distinct (species, labelIndex) pair to inspect the mapping
# between species names and their numeric indices.
df.dropDuplicates(['species', 'labelIndex']).show()

In [None]:
# Random 70% / 30% train/test split, using a fixed seed for the sampling.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=2018)

# Report how many rows landed in each split.
print("Training Dataset Count: ", train.count(), sep="")
print("Test Dataset Count: ", test.count(), sep="")

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier reading inputs from 'features' and the class
# label from 'labelIndex'.
rf = RandomForestClassifier(
    featuresCol='features',
    labelCol='labelIndex',
)

# Fit on the training split, then score the test split.
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

# Preview the scored rows. For a narrower view, select specific columns first:
# predictions.select('sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'labelIndex', 'rawPrediction', 'prediction', 'probability').show(25)
predictions.show()

In [None]:
# Show true label index next to predicted label index for the first 20 rows.
predictions.select(["labelIndex", "prediction"]).show(n=20)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# metricName is set explicitly: the evaluator's default metric is F1, which
# would make the "Accuracy" and "Test Error" figures printed below mislabeled.
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndex",
    predictionCol="prediction",
    metricName="accuracy",
)

# Fraction of test rows whose predicted index equals the true label index.
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import types as T
from pyspark.sql import functions as F

# Build the (prediction, label) pairs for the RDD-based metrics API, with the
# label cast to a floating-point type. No sort is applied: a confusion matrix
# is a count per (label, prediction) pair and does not depend on row order.
preds_and_labels = (
    predictions
    .select(['prediction', 'labelIndex'])
    .withColumn('labelIndex', F.col('labelIndex').cast(T.FloatType()))
)

# Each Row becomes a plain (prediction, label) tuple, in that column order.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Print the confusion matrix as a dense array.
print(metrics.confusionMatrix().toArray())