In [1]:
# findspark locates the local Spark installation and adds pyspark to sys.path,
# so it has to run *before* anything is imported from pyspark; importing
# pyspark first only works when pyspark happens to be pip-installed already.
import findspark
findspark.init()

import pyspark
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.sql.session import SparkSession

# getOrCreate() reuses a session that is already running in this kernel, so
# re-executing the cell no longer fails with
# "Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.appName("RFTEST").getOrCreate()

# Keep the SparkContext handle available under its original name.
sc = spark.sparkContext

### Cargamos el modelo RandomForest guardado

In [2]:
# Directory (relative to the notebook's working directory) holding the saved
# random-forest classification model.
model_dir = "./Model_RF_V1"

# Read the persisted model back so it can score new rows.
rf_model = RandomForestClassificationModel.load(model_dir)

### Creamos un título para hacer la prueba

In [8]:
# Sample headline that will be pushed through the pipeline.
title = "Crypto is the best"

# Build a one-row DataFrame with a single string column called "news",
# which is the column name the tokenizer cell reads from.
rows = [(title,)]
df = spark.createDataFrame(rows, schema=["news"])

### Tokenizamos

In [9]:
import pandas as pd
from pyspark.ml.feature import Tokenizer

# Split the "news" text into a list of words stored in a new "tokens" column.
tokenization = Tokenizer(
    inputCol='news',
    outputCol='tokens',
)
tokenized_df = tokenization.transform(df)

# Pull the first row to the driver and show it with columns as rows, which is
# easier to read for a single wide record.
first_rows = tokenized_df.take(1)
pd.DataFrame(first_rows, columns=tokenized_df.columns).T

Unnamed: 0,0
news,Crypto is the best
tokens,"[crypto, is, the, best]"


### Quitamos las StopWords

In [11]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of "tokens"; the surviving words go to "refined_tokens".
stopword_removal = StopWordsRemover(
    inputCol='tokens',
    outputCol='refined_tokens',
)
refined_df = stopword_removal.transform(tokenized_df)

# Preview the first row, transposed so each column is shown on its own line.
first_rows = refined_df.take(1)
pd.DataFrame(first_rows, columns=refined_df.columns).T

Unnamed: 0,0
news,Crypto is the best
tokens,"[crypto, is, the, best]"
refined_tokens,"[crypto, best]"


### Extraemos las Features

In [13]:
import warnings

from pyspark.ml.feature import HashingTF, IDF, IDFModel

# Hash the cleaned tokens into a fixed-length term-frequency vector.
# NOTE(review): numFeatures has to equal the value used when the classifier
# was trained; 20 is carried over unchanged — confirm against the training code.
hashingTF = HashingTF(inputCol="refined_tokens", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(refined_df)

# The IDF weights must be the ones learned on the training corpus. Fitting IDF
# on this single-row frame gives every present term
# idf = log((1 + 1) / (1 + 1)) = 0, so "features" becomes an all-zero vector
# and the classifier returns the same prediction for any headline.
# TODO: point this at the IDFModel saved by the training notebook (the path
# below is an assumption, it is not defined anywhere in this notebook).
# NOTE(review): a loaded model presumably keeps the input/output column names
# it was fitted with — confirm they are "rawFeatures" / "features".
IDF_MODEL_PATH = "./Model_IDF_V1"

idf = IDF(inputCol="rawFeatures", outputCol="features")
try:
    idfModel = IDFModel.load(IDF_MODEL_PATH)
except Exception as load_error:  # Spark reports a missing path through a JVM error
    # Keep the notebook runnable, but make the degraded result impossible to miss.
    warnings.warn(
        "Could not load a fitted IDFModel from %r (%s); fitting IDF on the "
        "inference row instead, which zeroes every feature and makes the "
        "prediction meaningless." % (IDF_MODEL_PATH, load_error)
    )
    idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Show up to five rows, transposed for readability.
pd.DataFrame(rescaledData.take(5), columns=rescaledData.columns).transpose()

Unnamed: 0,0
news,Crypto is the best
tokens,"[crypto, is, the, best]"
refined_tokens,"[crypto, best]"
rawFeatures,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
features,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Aplicamos el modelo que hemos cargado

In [15]:
# Lift pandas' column-width limit so the long vector columns are printed in
# full instead of being cut off with "...". This setting stays in effect for
# the rest of the session.
pd.set_option('display.max_colwidth', None)

# Score the featurized row with the loaded random-forest model.
predictions = rf_model.transform(rescaledData)

# Display the first scored row with one column per line.
scored_rows = predictions.take(1)
pd.DataFrame(scored_rows, columns=predictions.columns).T

Unnamed: 0,0
news,Crypto is the best
tokens,"[crypto, is, the, best]"
refined_tokens,"[crypto, best]"
rawFeatures,"(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
features,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
rawPrediction,"[1.5607424214460008, 12.055587040225552, 6.383670538328446]"
probability,"[0.07803712107230004, 0.6027793520112776, 0.3191835269164223]"
prediction,1.0
