In [4]:
import string

from pyspark.sql import SparkSession

from pyspark.sql.functions import col, when, length, size, split, udf, rand, size, regexp_replace
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType

from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover

from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression, DecisionTreeClassifier, GBTClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Spark Session

In [5]:
spark = SparkSession.builder.getOrCreate()

# Data

## Import

In [8]:
schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("id", StringType(), True),
    StructField("date", StringType(), True),
    StructField("query", StringType(), True),
    StructField("user", StringType(), True),
    StructField("text", StringType(), True)
])

data = spark.read.csv("../../data/kaggle/sentiment", schema=schema, header=False)

data.show()

[Stage 0:>                                                          (0 + 1) / 1]

+------+----------+--------------------+--------+---------------+--------------------+
|target|        id|                date|   query|           user|                text|
+------+----------+--------------------+--------+---------------+--------------------+
|  null|        id|                date|   query|           user|                text|
|     4|1999177446|Mon Jun 01 19:46:...|NO_QUERY|       firewalk|oooo... KJ Parker...|
|     4|1999177528|Mon Jun 01 19:46:...|NO_QUERY|       TimB5150|@Ericatwitts  Ur ...|
|     4|1999177648|Mon Jun 01 19:46:...|NO_QUERY|       JesicaXD|I think im havin ...|
|     4|1999177836|Mon Jun 01 19:46:...|NO_QUERY|  MaggieMcNulty|Just made a yummy...|
|     4|1999177901|Mon Jun 01 19:46:...|NO_QUERY|       Exklusiv|@alice_verney no ...|
|     4|1999178123|Mon Jun 01 19:46:...|NO_QUERY|  BEAUTIFULL308|@UniqueZayas Oooh...|
|     4|1999178168|Mon Jun 01 19:46:...|NO_QUERY|   SaturnMoonie|Night my twits. L...|
|     4|1999178183|Mon Jun 01 19:46:...|NO_

                                                                                

In [9]:
data = data.dropna()
data.count()

                                                                                

1600000

## Train Test Split

In order to include all three dataframes in the training stage, the data is randomly mixed.

In [10]:
data = data.orderBy(rand(seed=43))

The data is divided in train and test. Being the train the 80% of the total data.

In [11]:
split_index = int(data.count() * 0.8)

df_train = data.limit(split_index)
df_test = data.subtract(df_train)

                                                                                

# Text Cleaning

Once the data analysis has been completed in Jupyter, the text cleaning stage is started.

Tweets require lots of cleaning but it is inefficient to clean every single tweet because that would consume too much time. A general approach must be implemented for cleaning.

* The most common type of words that require cleaning in oov have punctuations at the start or end. Those words doesn't have embeddings because of the trailing punctuations. Punctuations #, @, !, ?, +, &, -, $, =, <, >, |, {, }, ^, ', (, ),[, ], *, %, ..., ', ., :, ; are separated from words
* Special characters that are attached to words are removed completely
* Contractions are expanded
* Urls are removed
* Character entity references are replaced with their actual symbols
* Typos and slang are corrected, and informal abbreviations are written in their long forms
* Some words are replaced with their acronyms and some words are grouped into one
* Finally, hashtags and usernames contain lots of information about the context but they are written without spaces in between words so they don't have embeddings. Informational usernames and hashtags should be expanded but there are too many of them. Due to the project deadline, hashtags and usernames haven't been expanded in detail, a list of expanded usernames was taken in order to achive this.

In [12]:
replace_text = udf(lambda text: 
                    text.replace("å_", "")
                        .replace("fromåÊwounds", "from wounds")
                        .replace("åÊ", "")
                        .replace("åÈ", "")
                        .replace("JapÌ_n", "Japan")
                        .replace("Ì©", "e")
                        .replace("å¨", "")
                        .replace("SuruÌ¤", "Suruc")
                        .replace("åÇ", "")
                        .replace("å£3million", "3 million")
                        .replace("åÀ", "")
                        .replace("he's", "he is")
                        .replace("there's", "there is")
                        .replace("We're", "We are")
                        .replace("That's", "That is")
                        .replace("won't", "will not")
                        .replace("they're", "they are")
                        .replace("Can't", "Cannot")
                        .replace("wasn't", "was not")
                        .replace("aren't", "are not")
                        .replace("isn't", "is not")
                        .replace("What's", "What is")
                        .replace("haven't", "have not")
                        .replace("hasn't", "has not")
                        .replace("There's", "There is")
                        .replace("He's", "He is")
                        .replace("It's", "It is")
                        .replace("You're", "You are")
                        .replace("I'M", "I am")
                        .replace("shouldn't", "should not")
                        .replace("wouldn't", "would not")
                        .replace("i'm", "I am")
                        .replace("I'm", "I am")
                        .replace("Isn't", "is not")
                        .replace("Here's", "Here is")
                        .replace("you've", "you have")
                        .replace("we're", "we are")
                        .replace("what's", "what is")
                        .replace("couldn't", "could not")
                        .replace("we've", "we have")
                        .replace("who's", "who is")
                        .replace("y'all", "you all")
                        .replace("would've", "would have")
                        .replace("it'll", "it will")
                        .replace("we'll", "we will")
                        .replace("We've", "We have")
                        .replace("he'll", "he will")
                        .replace("Y'all", "You all")
                        .replace("Weren't", "Were not")
                        .replace("Didn't", "Did not")
                        .replace("they'll", "they will")
                        .replace("they'd", "they would")
                        .replace("DON'T", "DO NOT")
                        .replace("they've", "they have")
                        .replace("i'd", "I would")
                        .replace("should've", "should have")
                        .replace("where's", "where is")
                        .replace("we'd", "we would")
                        .replace("i'll", "I will")
                        .replace("weren't", "were not")
                        .replace("They're", "They are")
                        .replace("let's", "let us")
                        .replace("it's", "it is")
                        .replace("can't", "cannot")
                        .replace("don't", "do not")
                        .replace("you're", "you are")
                        .replace("i've", "I have")
                        .replace("that's", "that is")
                        .replace("i'll", "I will")
                        .replace("doesn't", "does not")
                        .replace("i'd", "I would")
                        .replace("didn't", "did not")
                        .replace("ain't", "am not")
                        .replace("you'll", "you will")
                        .replace("I've", "I have")
                        .replace("Don't", "do not")
                        .replace("I'll", "I will")
                        .replace("I'd", "I would")
                        .replace("Let's", "Let us")
                        .replace("you'd", "You would")
                        .replace("It's", "It is")
                        .replace("Ain't", "am not")
                        .replace("Haven't", "Have not")
                        .replace("Could've", "Could have")
                        .replace("youve", "you have")
                        .replace("donå«t", "do not")
                        .replace("@", " @ ")
                        .replace("#", " # ")
                        .replace("!", " ! ")
                        .replace("?", " ? ")
                        .replace("+", " + ")
                        .replace("&", " & ")
                        .replace("*", " * ")
                        .replace("[", " [ ")
                        .replace("]", " ] ")
                        .replace("-", " - ")
                        .replace("%", " % ")
                        .replace(".", " . ")
                        .replace(":", " : ")
                        .replace("/", " / ")
                        .replace("(", " ( ")
                        .replace(")", " ) ")
                        .replace(";", " ; ")
                        .replace("$", " $ ")
                        .replace("=", " = ")
                        .replace(">", " > ")
                        .replace("<", " < ")
                        .replace("|", " | ")
                        .replace("{", " { ")
                        .replace("}", " } ")
                        .replace("^", " ^ ")
                        .replace("'", " ' ")
                        .replace("`", " ` ")
                        .replace("...", " ... ")
                        .replace("..", " ... ") if text is not None else None, StringType())



df_train = df_train.withColumn("text", replace_text(df_train["text"]))
df_test = df_test.withColumn("text", replace_text(df_test["text"]))

# Mostrar el DataFrame transformado
df_train.show(5)
df_test.show(5)

                                                                                

+------+----------+--------------------+--------+------------+--------------------+
|target|        id|                date|   query|        user|                text|
+------+----------+--------------------+--------+------------+--------------------+
|     0|2264093836|Sun Jun 21 02:45:...|NO_QUERY|  darcyemily| `  then they han...|
|     0|1550967451|Sat Apr 18 07:46:...|NO_QUERY|serendipitie|Cannot remember t...|
|     4|1827780885|Sun May 17 11:26:...|NO_QUERY|  aliceokoye|watching televisi...|
|     0|2190184804|Tue Jun 16 02:00:...|NO_QUERY|   Rubybelle|Book club was goo...|
|     4|1982804907|Sun May 31 11:56:...|NO_QUERY|      bwg_uk|On the choo choo ...|
+------+----------+--------------------+--------+------------+--------------------+
only showing top 5 rows



[Stage 26:>                                                         (0 + 8) / 8]

+------+----------+--------------------+--------+------------+--------------------+
|target|        id|                date|   query|        user|                text|
+------+----------+--------------------+--------+------------+--------------------+
|     0|2048943655|Fri Jun 05 16:01:...|NO_QUERY|         rem| .  @ kswedberg  ...|
|     0|2261266560|Sat Jun 20 20:31:...|NO_QUERY|chelsiegreen|Mom forcing me to...|
|     0|2283032051|Mon Jun 22 12:06:...|NO_QUERY|  AlyxxDione| @ SexXyBlackines...|
|     4|1793395522|Thu May 14 02:31:...|NO_QUERY|    wendy_uk| @ aliflyby Hope ...|
|     4|2053133671|Sat Jun 06 03:24:...|NO_QUERY|SayuriYubari|I´m doing a cake !  |
+------+----------+--------------------+--------+------------+--------------------+
only showing top 5 rows



                                                                                

# Preprocess

## Tokenizer 

* **Tokenizer:** <br>
The Tokenizer is a feature transformer that takes an input text column and splits it into individual words or tokens. It is used to preprocess the text data before applying any machine learning algorithms. In this case, the input text column is "text_cleaned" which contains the preprocessed and cleaned text data. The Tokenizer transforms the "text_cleaned" column into a new column called "tokens" where each row contains an array of tokens (words).

In [13]:
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
raw_words_train = regex_tokenizer.transform(df_train)
raw_words_test = regex_tokenizer.transform(df_test)
raw_words_train.show(5)
raw_words_test.show(5)

                                                                                

+------+----------+--------------------+--------+------------+--------------------+--------------------+
|target|        id|                date|   query|        user|                text|               words|
+------+----------+--------------------+--------+------------+--------------------+--------------------+
|     0|2264093836|Sun Jun 21 02:45:...|NO_QUERY|  darcyemily| `  then they han...|[then, they, hand...|
|     0|1550967451|Sat Apr 18 07:46:...|NO_QUERY|serendipitie|Cannot remember t...|[cannot, remember...|
|     4|1827780885|Sun May 17 11:26:...|NO_QUERY|  aliceokoye|watching televisi...|[watching, televi...|
|     0|2190184804|Tue Jun 16 02:00:...|NO_QUERY|   Rubybelle|Book club was goo...|[book, club, was,...|
|     4|1982804907|Sun May 31 11:56:...|NO_QUERY|      bwg_uk|On the choo choo ...|[on, the, choo, c...|
+------+----------+--------------------+--------+------------+--------------------+--------------------+
only showing top 5 rows



[Stage 52:>                                                         (0 + 8) / 8]

+------+----------+--------------------+--------+------------+--------------------+--------------------+
|target|        id|                date|   query|        user|                text|               words|
+------+----------+--------------------+--------+------------+--------------------+--------------------+
|     0|2048943655|Fri Jun 05 16:01:...|NO_QUERY|         rem| .  @ kswedberg  ...|[kswedberg, amp, ...|
|     0|2261266560|Sat Jun 20 20:31:...|NO_QUERY|chelsiegreen|Mom forcing me to...|[mom, forcing, me...|
|     0|2283032051|Mon Jun 22 12:06:...|NO_QUERY|  AlyxxDione| @ SexXyBlackines...|[sexxyblackinese,...|
|     4|1793395522|Thu May 14 02:31:...|NO_QUERY|    wendy_uk| @ aliflyby Hope ...|[aliflyby, hope, ...|
|     4|2053133671|Sat Jun 06 03:24:...|NO_QUERY|SayuriYubari|I´m doing a cake !  |[i, m, doing, a, ...|
+------+----------+--------------------+--------+------------+--------------------+--------------------+
only showing top 5 rows



                                                                                

## Stop Words

* **StopWords:** <br>
Stop words are commonly used words in a language that typically do not carry much meaning or contribute significantly to the overall understanding of a text. Examples of stop words in English include "the", "is", "and", "a", and "an". These words are often filtered out or removed from text data during natural language processing tasks, such as text classification or sentiment analysis.

In [14]:
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
words_df_train = remover.transform(raw_words_train)
words_df_test = remover.transform(raw_words_test)
words_df_train.show(5)
words_df_test.show(5)

24/04/01 21:21:06 WARN PythonUDFRunner: Detected deadlock while completing task 0.0 in stage 62 (TID 193): Attempting to kill Python Worker
                                                                                

+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+
|target|        id|                date|   query|        user|                text|               words|            filtered|
+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+
|     0|2264093836|Sun Jun 21 02:45:...|NO_QUERY|  darcyemily| `  then they han...|[then, they, hand...|[handed, folded, ...|
|     0|1550967451|Sat Apr 18 07:46:...|NO_QUERY|serendipitie|Cannot remember t...|[cannot, remember...|[remember, websit...|
|     4|1827780885|Sun May 17 11:26:...|NO_QUERY|  aliceokoye|watching televisi...|[watching, televi...|[watching, televi...|
|     0|2190184804|Tue Jun 16 02:00:...|NO_QUERY|   Rubybelle|Book club was goo...|[book, club, was,...|[book, club, good...|
|     4|1982804907|Sun May 31 11:56:...|NO_QUERY|      bwg_uk|On the choo choo ...|[on, the, choo, c...|[choo, choo, l



+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+
|target|        id|                date|   query|        user|                text|               words|            filtered|
+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+
|     0|2048943655|Fri Jun 05 16:01:...|NO_QUERY|         rem| .  @ kswedberg  ...|[kswedberg, amp, ...|[kswedberg, amp, ...|
|     0|2261266560|Sat Jun 20 20:31:...|NO_QUERY|chelsiegreen|Mom forcing me to...|[mom, forcing, me...|[mom, forcing, ea...|
|     0|2283032051|Mon Jun 22 12:06:...|NO_QUERY|  AlyxxDione| @ SexXyBlackines...|[sexxyblackinese,...|[sexxyblackinese,...|
|     4|1793395522|Thu May 14 02:31:...|NO_QUERY|    wendy_uk| @ aliflyby Hope ...|[aliflyby, hope, ...|[aliflyby, hope, ...|
|     4|2053133671|Sat Jun 06 03:24:...|NO_QUERY|SayuriYubari|I´m doing a cake !  |[i, m, doing, a, ...|           [m,

                                                                                

## Count Vectorizer

* **CountVectorizer:** <br>
The CountVectorizer is a feature transformer that converts a collection of text documents into a matrix of token counts. It takes an input column of tokens and outputs a sparse vector representation of the token counts. In this case, the input column is "tokens" which contains the array of tokens generated by the Tokenizer. The CountVectorizer learns a vocabulary of distinct tokens from the training data and represents each document as a vector of token counts.

In [15]:
cv = CountVectorizer(inputCol="filtered", outputCol="features")

# train
model_train = cv.fit(words_df_train)
countVectorizer_train = model_train.transform(words_df_train)
countVectorizer_train = countVectorizer_train.withColumn("label",col('target'))
countVectorizer_train.show(5)

# test
model_test = cv.fit(words_df_test)
countVectorizer_test = model_test.transform(words_df_test)
countVectorizer_test= countVectorizer_test.withColumn("label",col('target'))
countVectorizer_test.show(5)

24/04/01 21:21:46 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
                                                                                

+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+--------------------+-----+
|target|        id|                date|   query|        user|                text|               words|            filtered|            features|label|
+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+--------------------+-----+
|     0|2264093836|Sun Jun 21 02:45:...|NO_QUERY|  darcyemily| `  then they han...|[then, they, hand...|[handed, folded, ...|(262144,[34,153,9...|    0|
|     0|1550967451|Sat Apr 18 07:46:...|NO_QUERY|serendipitie|Cannot remember t...|[cannot, remember...|[remember, websit...|(262144,[386,725,...|    0|
|     4|1827780885|Sun May 17 11:26:...|NO_QUERY|  aliceokoye|watching televisi...|[watching, televi...|[watching, televi...|(262144,[13,63,24...|    4|
|     0|2190184804|Tue Jun 16 02:00:...|NO_QUERY|   Rubybelle|Book club was goo...

24/04/01 21:22:21 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB


+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+--------------------+-----+
|target|        id|                date|   query|        user|                text|               words|            filtered|            features|label|
+------+----------+--------------------+--------+------------+--------------------+--------------------+--------------------+--------------------+-----+
|     0|2048943655|Fri Jun 05 16:01:...|NO_QUERY|         rem| .  @ kswedberg  ...|[kswedberg, amp, ...|[kswedberg, amp, ...|(223420,[21,503,7...|    0|
|     0|2261266560|Sat Jun 20 20:31:...|NO_QUERY|chelsiegreen|Mom forcing me to...|[mom, forcing, me...|[mom, forcing, ea...|(223420,[24,44,59...|    0|
|     0|2283032051|Mon Jun 22 12:06:...|NO_QUERY|  AlyxxDione| @ SexXyBlackines...|[sexxyblackinese,...|[sexxyblackinese,...|(223420,[13,33,28...|    0|
|     4|1793395522|Thu May 14 02:31:...|NO_QUERY|    wendy_uk| @ aliflyby Hope ...

                                                                                

# Models
As evaluation metrics ROC and Accuracy will be used. <br>

* **ROC curve** is a graphical representation that shows the performance of a binary classification model as the decision threshold is varied. On the x-axis, the false positive rate (FPR) is plotted, which is the proportion of negative instances incorrectly classified as positive. On the y-axis, the true positive rate (TPR) is plotted, which is the proportion of positive instances correctly classified as positive.



Separate train and validate data

In [16]:
(train, validate) = countVectorizer_train.randomSplit([0.8, 0.2],seed = 42)

In [17]:
trainData = countVectorizer_train
testData = countVectorizer_test

## Naive Bayes

In [18]:
nb = NaiveBayes(modelType="multinomial",labelCol="label", featuresCol="features")
nbModel = nb.fit(train)
nb_predictions = nbModel.transform(validate)

24/04/01 21:22:26 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:22:44 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:22:59 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
                                                                                

In [19]:
evaluator = BinaryClassificationEvaluator()
print('Test Area Under ROC', evaluator.evaluate(nb_predictions))

24/04/01 21:23:13 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:23:25 WARN DAGScheduler: Broadcasting large task binary with size 7.0 MiB
                                                                                

Test Area Under ROC 0.5084758366680321


In [20]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g"% (nb_accuracy))

24/04/01 21:23:40 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:23:57 WARN DAGScheduler: Broadcasting large task binary with size 7.0 MiB
24/04/01 21:24:03 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Accuracy of NaiveBayes is = 0.393824


                                                                                

## Decision Tree

In [21]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'target', maxDepth = 3)
dtModel = dt.fit(train)
dtPreds = dtModel.transform(validate)
dtPreds.show(5)

24/04/01 21:24:09 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:24:26 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:24:38 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:24:48 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:25:01 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:25:07 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 21:25:15 WARN DAGScheduler: Broadcasting large task binary with size 5.6 MiB
24/04/01 21:27:00 WARN DAGScheduler: Broadcasting large task binary with size 1033.7 KiB
24/04/01 21:27:01 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB
24/04/01 21:27:07 WARN MemoryStore: Not enough space to cache rdd_412_0 in memory! (computed 113.0 MiB so far)
24/04/01 21:27:07 WARN BlockManager: Persisting block rdd_412_0 to disk instead.
24/04/01 22:06:12 WARN MemorySt

+------+----------+--------------------+--------+----------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|target|        id|                date|   query|      user|                text|               words|            filtered|            features|label|       rawPrediction|         probability|prediction|
+------+----------+--------------------+--------+----------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|     0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|  mattycus| @ Kenichan I div...|[kenichan, i, div...|[kenichan, dived,...|(262144,[4,218,25...|    0|[472520.0,0.0,0.0...|[0.49273083332116...|       4.0|
|     0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|   mybirch|         Need a hug |      [need, a, hug]|         [need, hug]|(262144,[35,810],...|    0|[472520.0,0.0,0.0...|[0.4927308333

                                                                                

In [22]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
dt_accuracy = evaluator.evaluate(dtPreds)
print("Accuracy of Decision Trees is = %g"% (dt_accuracy))

24/04/01 23:17:30 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
24/04/01 23:17:42 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
[Stage 241:>                                                        (0 + 1) / 1]

Accuracy of Decision Trees is = 0.52991


                                                                                

## GBT Classifier

In [None]:
from pyspark.ml.classification import GBTClassifier

gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(train)
gbtPreds = gbtModel.transform(validate)
gbtPreds.show(5)

In [None]:
gbtEval = BinaryClassificationEvaluator()
gbtROC = gbtEval.evaluate(gbtPreds, {gbtEval.metricName: "areaUnderROC"})
print("Test Area Under ROC: " + str(gbtROC))

                                                                                

Test Area Under ROC: 0.7921680466831243


In [None]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
gb_accuracy = evaluator.evaluate(gbtPreds)
print("Accuracy of GBT is = %g"% (gb_accuracy))

[Stage 487:>                                                        (0 + 1) / 1]

Accuracy of GBT is = 0.759571


                                                                                

# Prediction

GBTClassifier is used to make predictions because it has the best evaluation metrics

In [None]:
gbt = GBTClassifier(maxIter=10)
gbtModel = gbt.fit(trainData)
gbtPreds = gbtModel.transform(testData)
predictions = gbtPreds.select('id','prediction')
predictions.show(5)

[Stage 649:>                                                        (0 + 1) / 1]