In [None]:
!pip3 install pyspark

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import types as T
from pyspark.sql import window as W
from pyspark.sql import functions as F

# Build (or reuse) the notebook's Spark session against a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .getOrCreate()
)

## 1. Linear Regression

In [None]:
# Load the California housing training CSV from Colab's sample_data folder,
# treating the first row as column names. No schema is requested here; the
# columns are cast to double in a later cell.
df = (
    spark.read
    .option("header", True)
    .csv("/content/sample_data/california_housing_train.csv")
)
df.show()

In [None]:
# Dataset shape as (number of rows, number of columns).
df.count(), len(df.columns)

In [None]:
# Every column name except the last one; this same slice is later used as the
# VectorAssembler input columns.
df.columns[:-1]

In [None]:
# Cast every column to double in a single projection. Chaining withColumn in
# a loop adds one projection per column and inflates the query plan; a single
# select produces the same column names and types with a flat plan.
df = df.select(
    [F.col(name).cast(T.DoubleType()).alias(name) for name in df.columns]
)

df.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine every column except the last into one vector column, 'features'.
featureassembler = VectorAssembler(
    outputCol='features',
    inputCols=df.columns[:-1],
)

In [None]:
# Run the assembler over the numeric frame; the result (with the 'features'
# column selected in the next cell) is kept as df_.
df_ = featureassembler.transform(df)

In [None]:
# Keep only the regression target and the assembled feature vector.
model_df = df_.select(['median_house_value', 'features'])
model_df.show(n=5, truncate=False)

In [None]:
# Seeded 75/25 random split into training and test sets.
train_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)

In [None]:
# Row counts of the training and test splits, in that order.
train_df.count(), test_df.count()

In [None]:
from pyspark.ml.regression import LinearRegression

# Unfitted linear-regression estimator: 'median_house_value' is the label and
# the assembled 'features' vector is the input.
lg_model = LinearRegression(
    labelCol='median_house_value',
    featuresCol='features',
)

In [None]:
# Fit the linear regression on the training split only.
lg_fit = lg_model.fit(train_df)

# Display the learned coefficients of the fitted model.
lg_fit.coefficients

In [None]:
# Evaluate the fitted model on the training split. Despite its name,
# pred_train is the evaluation result object (it exposes .predictions here and
# .r2 further down), not a bare predictions DataFrame.
pred_train = lg_fit.evaluate(train_df)

# Preview the predictions produced for the training rows.
pred_train.predictions.show()

In [None]:
# Evaluate the fitted model on the held-out test split. As with pred_train,
# this is the evaluation result object (exposing .predictions and .r2).
pred_test = lg_fit.evaluate(test_df)

# Preview the predictions produced for the test rows.
pred_test.predictions.show()

In [None]:
# Compare R2 on the training and test evaluations side by side.
print("R2 for train data : ", pred_train.r2)
print("R2 for test data : ", pred_test.r2)

## 2. Random Forest

In [None]:
# Read the iris CSV, treating the first row as column names. No schema is
# requested here; the non-final columns are cast to double in a later cell.
iris = spark.read.option("header", True).csv('/content/iris.csv')
iris.show(n=5)

In [None]:
# Cast every column except the last to double in a single projection, and
# carry the last column (presumably the 'species' label) through unchanged.
# One select avoids the per-column projections that a withColumn loop stacks
# onto the query plan.
iris = iris.select(
    [F.col(name).cast(T.DoubleType()).alias(name) for name in iris.columns[:-1]]
    + [F.col(iris.columns[-1])]
)

iris.printSchema()

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine every iris column except the last into one 'features' vector column.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=iris.columns[:-1],
)

In [None]:
# Run the assembler over the iris frame and keep the result as iris_assemble.
iris_assemble = assembler.transform(iris)

# Preview the assembled frame.
iris_assemble.show()

In [None]:
# Class balance: number of rows per species.
iris.groupBy('species').count().show()

In [None]:
from pyspark.ml.feature import StringIndexer

# Indexer that maps the string 'species' column to a 'label_index' column.
indexer = StringIndexer(
    outputCol='label_index',
    inputCol='species',
)

In [None]:
# Fit the indexer on the assembled frame and apply it to that same frame, so
# iris_index carries the 'label_index' column alongside the existing ones.
iris_index = indexer.fit(iris_assemble).transform(iris_assemble)

In [None]:
# One row per distinct (species, label_index) pair, to show which index each
# species was assigned.
iris_index.dropDuplicates(['species', 'label_index']).show()

In [None]:
# Seeded 80/20 random split into training and test sets.
iris_train, iris_test = iris_index.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# Row counts of the iris training and test splits, in that order.
iris_train.count(), iris_test.count()

In [None]:
# Column names available on the training split.
iris_train.columns

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# Unfitted random-forest classifier: 'label_index' is the label and the
# assembled 'features' vector is the input.
randomforest = RandomForestClassifier(
    labelCol='label_index',
    featuresCol='features',
)

In [None]:
# Fit the forest on the training split and score the held-out test split, so
# that everything computed from pred_test reflects rows the model never saw
# during training.
rf_model = randomforest.fit(iris_train)

pred_test = rf_model.transform(iris_test)

In [None]:
# Preview the frame returned by the random-forest model's transform().
pred_test.show()

In [None]:
# Narrow the scored frame to the true label index and the predicted class.
iris_prediction = pred_test.select(['label_index', 'prediction'])
iris_prediction.show()

In [None]:
# Print the column types of the label/prediction frame.
iris_prediction.printSchema()

In [None]:
# Show the rows where the predicted class differs from the true label index.
iris_prediction.where(
    F.col('prediction') != F.col('label_index')
).show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the predictions with the evaluator's default metric, F1, spelled out
# explicitly so the displayed number is unambiguous.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label_index',
    metricName='f1',
)

evaluator.evaluate(iris_prediction)

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics expects an RDD of (prediction, label) pairs, so the
# columns are selected in that order. Feeding (label, prediction) instead
# yields a transposed confusion matrix and swaps per-class precision/recall.
metrix = MulticlassMetrics(
    iris_prediction.select('prediction', 'label_index').rdd.map(tuple)
)

In [None]:
# Display the confusion matrix as a plain array.
metrix.confusionMatrix().toArray()