In [2]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="RANDFORSEST")
from pyspark.sql.session import SparkSession
spark = SparkSession(sc)

In [3]:
#read the dataset
df=spark.read.csv('data.csv',inferSchema=True,header=True)

In [4]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- news: string (nullable = true)
 |-- final_manual_labelling: integer (nullable = true)



In [5]:
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
date,1/25/2022,1/25/2022,1/25/2022,1/25/2022,1/25/2022
news,"Ripple announces stock buyback, nabs $15 billi...",IMF directors urge El Salvador to remove Bitco...,Dragonfly Capital is raising $500 million for ...,Rick and Morty co-creator collaborates with Pa...,How fintech SPACs lost their shine
final_manual_labelling,1,-1,1,0,0


## Tokenizacion
En un principio el proceso de extacción de Features y de Tokenizar es el mismo que se utiliza para aplicar Logistic Regression.

Primero separamos las palabras para hacer una lista de "Tokens"

In [6]:
from pyspark.ml.feature import Tokenizer
tokenization=Tokenizer(inputCol='news',outputCol='tokens')
tokenized_df=tokenization.transform(df)
pd.DataFrame(tokenized_df.take(5), columns=tokenized_df.columns).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
date,1/25/2022,1/25/2022,1/25/2022,1/25/2022,1/25/2022
news,"Ripple announces stock buyback, nabs $15 billi...",IMF directors urge El Salvador to remove Bitco...,Dragonfly Capital is raising $500 million for ...,Rick and Morty co-creator collaborates with Pa...,How fintech SPACs lost their shine
final_manual_labelling,1,-1,1,0,0
tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, to, remov...","[dragonfly, capital, is, raising, $500, millio...","[rick, and, morty, co-creator, collaborates, w...","[how, fintech, spacs, lost, their, shine]"


Quitamos las stopwords para tener unicamente tokens con valor

In [7]:
from pyspark.ml.feature import StopWordsRemover
stopword_removal=StopWordsRemover(inputCol='tokens',outputCol='refined_tokens')
refined_df=stopword_removal.transform(tokenized_df)
pd.DataFrame(refined_df.take(5), columns=refined_df.columns).transpose()


Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
date,1/25/2022,1/25/2022,1/25/2022,1/25/2022,1/25/2022
news,"Ripple announces stock buyback, nabs $15 billi...",IMF directors urge El Salvador to remove Bitco...,Dragonfly Capital is raising $500 million for ...,Rick and Morty co-creator collaborates with Pa...,How fintech SPACs lost their shine
final_manual_labelling,1,-1,1,0,0
tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, to, remov...","[dragonfly, capital, is, raising, $500, millio...","[rick, and, morty, co-creator, collaborates, w...","[how, fintech, spacs, lost, their, shine]"
refined_tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, remove, b...","[dragonfly, capital, raising, $500, million, n...","[rick, morty, co-creator, collaborates, paradi...","[fintech, spacs, lost, shine]"


## Feature Extraction

Extraemos las features (Dar valores númericos o pesos a los tokens)

Esta vez en vez de aplicar CountVectorizer que cuenta el numero de veces que aparece la palabra sin aplicar ningun tipo de pesos, aplicamos TF-IDF, yo tampoco tengo muy claro el funcionamiento de esta metodología para sacar features, para mas información:

https://monkeylearn.com/blog/what-is-tf-idf/

In [8]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="refined_tokens", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(refined_df)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

pd.DataFrame(rescaledData.take(5), columns=rescaledData.columns).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
date,1/25/2022,1/25/2022,1/25/2022,1/25/2022,1/25/2022
news,"Ripple announces stock buyback, nabs $15 billi...",IMF directors urge El Salvador to remove Bitco...,Dragonfly Capital is raising $500 million for ...,Rick and Morty co-creator collaborates with Pa...,How fintech SPACs lost their shine
final_manual_labelling,1,-1,1,0,0
tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, to, remov...","[dragonfly, capital, is, raising, $500, millio...","[rick, and, morty, co-creator, collaborates, w...","[how, fintech, spacs, lost, their, shine]"
refined_tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, remove, b...","[dragonfly, capital, raising, $500, million, n...","[rick, morty, co-creator, collaborates, paradi...","[fintech, spacs, lost, shine]"
rawFeatures,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
features,"(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","(1.026049047425866, 0.0, 1.0177674265941439, 0...","(0.0, 0.9903127281297341, 0.0, 0.0, 0.0, 0.940...","(0.0, 2.9709381843892024, 0.0, 0.0, 0.0, 0.0, ...","(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."


## Label Fix

Para poder entrenar el Random Forest, es necesario que nuestras labels sean numeros positivos siendo el "-1" de negativo inviable, entonces sumamos "1" a toda la columna. Esto nos deja que:

    * 2 = Positivo
    * 1 = Neutral
    * 0 = Negativo

In [9]:
from pyspark.sql.functions import col, lit

column = 'final_manual_labelling'

rescaledData = rescaledData.withColumn(column, col(column) + lit(1))

pd.DataFrame(rescaledData.take(5), columns=rescaledData.columns).transpose()

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
date,1/25/2022,1/25/2022,1/25/2022,1/25/2022,1/25/2022
news,"Ripple announces stock buyback, nabs $15 billi...",IMF directors urge El Salvador to remove Bitco...,Dragonfly Capital is raising $500 million for ...,Rick and Morty co-creator collaborates with Pa...,How fintech SPACs lost their shine
final_manual_labelling,2,0,2,1,1
tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, to, remov...","[dragonfly, capital, is, raising, $500, millio...","[rick, and, morty, co-creator, collaborates, w...","[how, fintech, spacs, lost, their, shine]"
refined_tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[imf, directors, urge, el, salvador, remove, b...","[dragonfly, capital, raising, $500, million, n...","[rick, morty, co-creator, collaborates, paradi...","[fintech, spacs, lost, shine]"
rawFeatures,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
features,"(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","(1.026049047425866, 0.0, 1.0177674265941439, 0...","(0.0, 0.9903127281297341, 0.0, 0.0, 0.0, 0.940...","(0.0, 2.9709381843892024, 0.0, 0.0, 0.0, 0.0, ...","(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0..."


## TEST and TRAIN
Separamos el dataframe para hacer un test y un train

In [10]:
train, test = rescaledData.randomSplit([0.7, 0.3], seed = 2018)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 1897
Test Dataset Count: 786


## Random Forest
Aplicamos un el modelo de clasificación RandomForest de la librería de spark.
lo entrenamos con el train y lo probamos con el test.

In [11]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'final_manual_labelling')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)

pd.DataFrame(predictions.take(5), columns=predictions.columns).transpose()

Unnamed: 0,0,1,2,3,4
id,0,8,11,12,16
date,1/25/2022,1/25/2022,1/25/2022,1/24/2022,1/24/2022
news,"Ripple announces stock buyback, nabs $15 billi...",GoodDollar Launches Key Protocol Upgrade to Ex...,Twitter is growing its in-house crypto team,Bitcoin climbs more than 10% following an ext...,Ribbon Finance: automated options selling stra...
final_manual_labelling,2,2,2,2,1
tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[gooddollar, launches, key, protocol, upgrade,...","[twitter, is, growing, its, in-house, crypto, ...","[bitcoin, climbs, more, than, 10%, following, ...","[ribbon, finance:, automated, options, selling..."
refined_tokens,"[ripple, announces, stock, buyback,, nabs, $15...","[gooddollar, launches, key, protocol, upgrade,...","[twitter, growing, in-house, crypto, team]","[bitcoin, climbs, 10%, following, extraordina...","[ribbon, finance:, automated, options, selling..."
rawFeatures,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, ...","(2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...","(1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
features,"(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","(2.052098094851732, 0.0, 2.0355348531882878, 0...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.3144243608066...","(2.052098094851732, 0.9903127281297341, 1.0177...","(1.026049047425866, 1.9806254562594683, 0.0, 0..."
rawPrediction,"[1.8068716076600375, 9.444256742246022, 8.7488...","[1.7542284201654748, 7.287756601382291, 10.958...","[2.3417841318765644, 10.851230959916947, 6.806...","[1.8054236496895064, 10.363713285885995, 7.830...","[1.2135444315435482, 10.927276396302474, 7.859..."
probability,"[0.09034358038300187, 0.4722128371123011, 0.43...","[0.08771142100827374, 0.36438783006911457, 0.5...","[0.11708920659382822, 0.5425615479958473, 0.34...","[0.09027118248447533, 0.5181856642942997, 0.39...","[0.060677221577177406, 0.5463638198151237, 0.3..."


In [12]:
predictions.select("final_manual_labelling", "prediction", "probability").show(10)

+----------------------+----------+--------------------+
|final_manual_labelling|prediction|         probability|
+----------------------+----------+--------------------+
|                     2|       1.0|[0.09034358038300...|
|                     2|       2.0|[0.08771142100827...|
|                     2|       1.0|[0.11708920659382...|
|                     2|       1.0|[0.09027118248447...|
|                     1|       1.0|[0.06067722157717...|
|                     1|       1.0|[0.14097721942324...|
|                     2|       1.0|[0.11645649011588...|
|                     1|       1.0|[0.10298193088318...|
|                     1|       1.0|[0.06802591017661...|
|                     1|       1.0|[0.08605453526451...|
+----------------------+----------+--------------------+
only showing top 10 rows



In [13]:
pd.set_option('display.max_colwidth', None)

pd.DataFrame(predictions.take(10), columns=predictions.columns).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
id,0,8,11,12,16,19,23,30,32,33
date,1/25/2022,1/25/2022,1/25/2022,1/24/2022,1/24/2022,1/24/2022,1/23/2022,1/21/2022,1/21/2022,1/21/2022
news,"Ripple announces stock buyback, nabs $15 billion valuation",GoodDollar Launches Key Protocol Upgrade to Expand Crypto-Backed UBI Ecosystem,Twitter is growing its in-house crypto team,Bitcoin climbs more than 10% following an extraordinary recovery in US equities,Ribbon Finance: automated options selling strategies,Dev builds tool that lets you auto-block NFT people on Twitter,Sneakmart's MetaKicks Features The World's First Mystery NFT Sneakers Box Collection,SEC Commissioner Roisman steps down two years ahead of schedule,Citi kickstarts recruitment for top job in digital assets unit,Inside the Cardano ecosystem with Charles Hoskinson
final_manual_labelling,2,2,2,2,1,1,2,1,1,1
tokens,"[ripple, announces, stock, buyback,, nabs, $15, billion, valuation]","[gooddollar, launches, key, protocol, upgrade, to, expand, crypto-backed, ubi, ecosystem]","[twitter, is, growing, its, in-house, crypto, team]","[bitcoin, climbs, more, than, 10%, following, an, extraordinary, recovery, in, us, equities]","[ribbon, finance:, automated, options, selling, strategies]","[dev, builds, tool, that, lets, you, auto-block, nft, people, on, twitter]","[sneakmart's, metakicks, features, the, world's, first, mystery, nft, sneakers, box, collection]","[sec, commissioner, roisman, steps, down, two, years, ahead, of, schedule]","[citi, kickstarts, recruitment, for, top, job, in, digital, assets, unit]","[inside, the, cardano, ecosystem, with, charles, hoskinson]"
refined_tokens,"[ripple, announces, stock, buyback,, nabs, $15, billion, valuation]","[gooddollar, launches, key, protocol, upgrade, expand, crypto-backed, ubi, ecosystem]","[twitter, growing, in-house, crypto, team]","[bitcoin, climbs, 10%, following, extraordinary, recovery, us, equities]","[ribbon, finance:, automated, options, selling, strategies]","[dev, builds, tool, lets, auto-block, nft, people, twitter]","[sneakmart's, metakicks, features, world's, first, mystery, nft, sneakers, box, collection]","[sec, commissioner, roisman, steps, two, years, ahead, schedule]","[citi, kickstarts, recruitment, top, job, digital, assets, unit]","[inside, cardano, ecosystem, charles, hoskinson]"
rawFeatures,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 2.0, 0.0)","(2.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0)","(1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0)","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0, 0.0)","(0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)"
features,"(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1083465474864913, 0.0, 1.0116009116784799, 0.8234901338800403, 0.0, 1.075047133417442, 0.0, 1.1416255794937926, 0.0, 0.0, 1.2831862308140898, 0.0)","(2.052098094851732, 0.0, 2.0355348531882878, 0.0, 0.0, 0.0, 1.3144243608066233, 1.026049047425866, 0.0, 0.7851840350193011, 0.0, 0.8234901338800403, 0.0, 0.0, 0.0, 0.0, 0.0, 1.247375124528243, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.3144243608066233, 0.0, 2.2166930949729826, 0.7851840350193011, 0.0, 0.0, 1.075047133417442, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(2.052098094851732, 0.9903127281297341, 1.0177674265941439, 0.0, 0.0, 0.0, 1.3144243608066233, 0.0, 1.1083465474864913, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1416255794937926, 0.0, 0.0, 0.6415931154070449, 0.0)","(1.026049047425866, 1.9806254562594683, 0.0, 0.0, 0.0, 0.0, 0.0, 1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 1.075047133417442, 0.0, 0.0, 0.0, 0.0, 0.0, 1.186979414238503)","(0.0, 0.9903127281297341, 0.0, 1.2217655303239185, 0.0, 0.0, 0.0, 0.0, 1.1083465474864913, 0.0, 0.0, 0.8234901338800403, 0.0, 1.075047133417442, 0.0, 0.0, 1.1231279422519702, 2.494750249056486, 0.0, 0.0)","(0.0, 0.9903127281297341, 1.0177674265941439, 1.2217655303239185, 1.3075134832667763, 0.0, 0.0, 0.0, 0.0, 2.355552105057903, 0.0, 0.0, 0.0, 0.0, 0.0, 2.2832511589875852, 0.0, 0.0, 0.0, 1.186979414238503)","(0.0, 0.0, 1.0177674265941439, 0.0, 0.0, 0.0, 1.3144243608066233, 0.0, 3.325039642459474, 0.0, 0.0, 0.0, 0.0, 0.0, 2.3690807772782883, 1.1416255794937926, 0.0, 0.0, 0.0, 0.0)","(0.0, 0.9903127281297341, 1.0177674265941439, 0.0, 0.0, 0.0, 0.0, 1.026049047425866, 1.1083465474864913, 0.0, 0.0, 0.0, 1.075047133417442, 1.075047133417442, 0.0, 1.1416255794937926, 0.0, 1.247375124528243, 0.0, 0.0)","(1.026049047425866, 0.0, 0.0, 0.0, 0.0, 0.0, 2.6288487216132466, 0.0, 0.0, 0.0, 1.0116009116784799, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.247375124528243, 0.0, 0.0)"
rawPrediction,"[1.8068716076600375, 9.444256742246022, 8.748871650093939]","[1.7542284201654748, 7.287756601382291, 10.958014978452233]","[2.3417841318765644, 10.851230959916947, 6.806984908206489]","[1.8054236496895064, 10.363713285885995, 7.8308630644244985]","[1.2135444315435482, 10.927276396302474, 7.859179172153977]","[2.8195443884648146, 9.637579229032639, 7.542876382502548]","[2.3291298023177256, 10.244394020888224, 7.426476176794049]","[2.0596386176636607, 9.32373842635117, 8.616622955985171]","[1.360518203532239, 10.240248221471631, 8.399233574996131]","[1.7210907052902085, 12.197139590910743, 6.081769703799049]"
probability,"[0.09034358038300187, 0.4722128371123011, 0.43744358250469695]","[0.08771142100827374, 0.36438783006911457, 0.5479007489226116]","[0.11708920659382822, 0.5425615479958473, 0.3403492454103244]","[0.09027118248447533, 0.5181856642942997, 0.3915431532212249]","[0.060677221577177406, 0.5463638198151237, 0.39295895860769886]","[0.14097721942324073, 0.48187896145163195, 0.3771438191251274]","[0.11645649011588628, 0.5122197010444112, 0.37132380883970245]","[0.10298193088318304, 0.4661869213175585, 0.4308311477992586]","[0.06802591017661194, 0.5120124110735815, 0.41996167874980656]","[0.08605453526451043, 0.6098569795455371, 0.30408848518995246]"


### True and false predictions

In [14]:
true_postives = predictions[(predictions.final_manual_labelling == 2) & (predictions.prediction == 2)].count()
true_negatives = predictions[(predictions.final_manual_labelling == 0) & (predictions.prediction == 0)].count()
true_neutral = predictions[(predictions.final_manual_labelling == 1) & (predictions.prediction == 1)].count()


false_postives = predictions[(predictions.final_manual_labelling == 2) & (predictions.prediction != 2)].count()
false_negatives = predictions[(predictions.final_manual_labelling == 0) & (predictions.prediction != 0)].count()
false_neutral = predictions[(predictions.final_manual_labelling == 1) & (predictions.prediction != 1)].count()

print(true_postives, true_postives, true_neutral)
print(false_postives, false_negatives, false_neutral)

159 159 248
201 73 105


## Accuracy and Test Error
Importamos la libreria MulticlassClassificationEvaluator, este objeto simplemente calcula la accuracy de nuestro modelo multiclase (nuestras clases son "positivo, negativo y neutral")

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="final_manual_labelling", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.4864175988236889
Test Error = 0.513582401176311


## Guardamos el modelo que hemos entrenado

In [16]:
rfModel.write().save("./Model_RF_V1")

Py4JJavaError: An error occurred while calling o386.save.
: java.io.IOException: Path ./Model_RF_V1 already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
