In [27]:
# run this to shorten the data import from the files
path_data = '/home/nero/Documents/Estudos/DataCamp/Python/Machine_Learning_with_PySpark/datasets/'

# start spark session

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('chapter_02').getOrCreate()

23/07/12 08:09:52 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/07/12 08:09:52 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [28]:
flights = spark.read.csv(path_data + 'flights.csv',
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Specify column names and types
schema = StructType([
    StructField("id", IntegerType()),
    StructField("text", StringType()),
    StructField("label", IntegerType())
])

# Load data from a delimited file
sms = spark.read.csv(path_data+'sms.csv', sep=';', header=False, schema=schema)

In [29]:
sms.createOrReplaceTempView('sms')
flights.createOrReplaceTempView('flights')

spark.catalog.listTables()

[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='sms', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [30]:
flights.show(5)

+---+---+---+-------+------+---+----+------+--------+-----+
|mon|dom|dow|carrier|flight|org|mile|depart|duration|delay|
+---+---+---+-------+------+---+----+------+--------+-----+
| 11| 20|  6|     US|    19|JFK|2153|  9.48|     351| null|
|  0| 22|  2|     UA|  1107|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|   226|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|   419|ORD|1236| 10.33|     195|   -5|
|  4|  2|  5|     AA|   325|ORD| 258|  8.92|      65| null|
+---+---+---+-------+------+---+----+------+--------+-----+
only showing top 5 rows



In [31]:
# exercise 01

"""
Removing columns and rows

You previously loaded airline flight data from a CSV file. You're going to develop a model which will predict whether or not a given flight will be delayed.

In this exercise you need to trim those data down by:

    removing an uninformative column and
    removing rows which do not have information about whether or not a flight was delayed.

The data are available as flights.

Note:: You might find it useful to revise the slides from the lessons in the Slides panel next to the IPython Shell.
"""

# Instructions

"""

    Remove the flight column.
    Find out how many records have missing values in the delay column.
    Remove records with missing values in the delay column.
    Remove records with missing values in any column and get the number of remaining rows.

"""

# solution

# Remove the 'flight' column
flights = flights.drop('flight')

# Number of records with missing 'delay' values
print(flights.filter('delay IS NULL').count())

# Remove records with missing 'delay' values
flights = flights.filter('delay IS NOT NULL')

# Remove records with missing values in any column and get the number of remaining rows
flights = flights.dropna()
print(flights.count())

#----------------------------------#

# Conclusion

"""
You've discarded the columns and rows which will certainly not contribute to a model.
"""

2978
47022


"\nYou've discarded the columns and rows which will certainly not contribute to a model.\n"

In [32]:
flights.show(5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [36]:
df = flights.toPandas()
print(df.head())
df.to_csv(path_data + 'clean_flights.csv',index=False)

   mon  dom  dow carrier  org  mile  depart  duration  delay
0    0   22    2      UA  ORD   316   16.33        82     30
1    2   20    4      UA  SFO   337    6.17        82     -8
2    9   13    1      AA  ORD  1236   10.33       195     -5
3    5    2    1      UA  SFO   550    7.98       102      2
4    7    2    6      AA  ORD   733   10.83       135     54


In [7]:
# exercise 02

"""
Column manipulation

The Federal Aviation Administration (FAA) considers a flight to be "delayed" when it arrives 15 minutes or more after its scheduled time.

The next step of preparing the flight data has two parts:

    convert the units of distance, replacing the mile column with a kmcolumn; and
    create a Boolean column indicating whether or not a flight was delayed.

"""

# Instructions

"""

    Import a function which will allow you to round a number to a specific number of decimal places.
    Derive a new km column from the mile column, rounding to zero decimal places. One mile is 1.60934 km.
    Remove the mile column.
    Create a label column with a value of 1 indicating the delay was 15 minutes or more and 0 otherwise. Think carefully about the logical condition.

"""

# solution

# Import the required function
from pyspark.sql.functions import round

# Convert 'mile' to 'km' and drop 'mile' column (1 mile is equivalent to 1.60934 km)
flights = flights.withColumn('km', round(flights.mile * 1.60934, 0)) \
                    .drop('mile')

# Create 'label' column indicating whether flight delayed (1) or not (0)
flights = flights.withColumn('label', (flights.delay >= 15).cast('integer'))

# Check first five records
flights.show(5)

#----------------------------------#

# Conclusion

"""
Fifteen minutes seems like quite a wide margin, but who are you to argue with the FAA?
"""

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



'\nFifteen minutes seems like quite a wide margin, but who are you to argue with the FAA?\n'

In [8]:
# exercise 03

"""
Categorical columns

In the flights data there are two columns, carrier and org, which hold categorical data. You need to transform those columns into indexed numerical values.
"""

# Instructions

"""

    Import the appropriate class and create an indexer object to transform the carrier column from a string to an numeric index.
    Prepare the indexer object on the flight data.
    Use the prepared indexer to create the numeric index column.
    Repeat the process for the org column.

"""

# solution

from pyspark.ml.feature import StringIndexer

# Create an indexer
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Indexer identifies categories in the data
indexer_model = indexer.fit(flights)

# Indexer creates a new column with numeric index values
flights = indexer_model.transform(flights)

# Repeat the process for the other categorical feature
flights = StringIndexer(inputCol='org', outputCol='org_idx').fit(flights).transform(flights)
flights.show(5)

#----------------------------------#

# Conclusion

"""
Our Machine Learning model needs numbers not strings, so these transformations are vital!
"""

                                                                                

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
only showing top 5 rows



'\nOur Machine Learning model needs numbers not strings, so these transformations are vital!\n'

In [9]:
# exercise 04

"""
Assembling columns

The final stage of data preparation is to consolidate all of the predictor columns into a single column.

An updated version of the flights data, which takes into account all of the changes from the previous few exercises, has the following predictor columns:

    mon, dom and dow
    carrier_idx (indexed value from carrier)
    org_idx (indexed value from org)
    km
    depart
    duration

"""

# Instructions

"""

    Import the class which will assemble the predictors.
    Create an assembler object that will allow you to merge the predictors columns into a single column.
    Use the assembler to generate a new consolidated column.

"""

# solution

# Import the necessary class
from pyspark.ml.feature import VectorAssembler

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    'mon','dom','dow','carrier_idx','org_idx','km','depart','duration'
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights_assembled.select('features', 'delay').show(5, truncate=False)

#----------------------------------#

# Conclusion

"""
The data are now ready for building our first Machine Learning model. You've worked hard to get this sorted: well done!
"""

+-----------------------------------------+-----+
|features                                 |delay|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |30   |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |-8   |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|-5   |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |2    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |54   |
+-----------------------------------------+-----+
only showing top 5 rows



"\nThe data are now ready for building our first Machine Learning model. You've worked hard to get this sorted: well done!\n"

In [10]:
flights.show(2)

+---+---+---+-------+---+------+--------+-----+-----+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|   km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+-----+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|542.0|    0|        0.0|    1.0|
+---+---+---+-------+---+------+--------+-----+-----+-----+-----------+-------+
only showing top 2 rows



In [11]:
# exercise 05

"""
Train/test split

To objectively assess a Machine Learning model you need to be able to test it on an independent set of data. You can't use the same data that you used to train the model: of course the model will perform (relatively) well on those data!

You will split the data into two components:

    training data (used to train the model) and
    testing data (used to test the model).

Note: From here on you'll be working with a smaller subset of the flights data, which just makes the exercises run more quickly.
"""

# Instructions

"""

    Randomly split the flights data into two sets with 80:20 proportions. For repeatability set a random number seed of 43 for the split.
    Check that the training data has roughly 80% of the records from the original data.

"""

# solution

# Split into training and testing sets in a 80:20 ratio
flights_train, flights_test = flights_assembled.randomSplit([0.80,0.20], seed=43)

# Check that training set has around 80% of records
training_ratio = flights_train.count() / flights_assembled.count()
print(training_ratio)

#----------------------------------#

# Conclusion

"""
The ratio looks as expected. You're ready to train and test a Decision Tree model!
"""

                                                                                

0.8025392369529156


"\nThe ratio looks as expected. You're ready to train and test a Decision Tree model!\n"

In [12]:
# exercise 06

"""
Build a Decision Tree

Now that you've split the flights data into training and testing sets, you can use the training set to fit a Decision Tree model.

The data are available as flights_train and flights_test.

NOTE: It will take a few seconds for the model to train… please be patient!
"""

# Instructions

"""

    Import the class for creating a Decision Tree classifier.
    Create a classifier object and fit it to the training data.
    Make predictions for the testing data and take a look at the predictions.

"""

# solution

# Import the Decision Tree Classifier class
from pyspark.ml.classification import DecisionTreeClassifier

# Create a classifier object and fit to the training data
tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
prediction = tree_model.transform(flights_test)
prediction.select('label', 'prediction', 'probability').show(5, False)

#----------------------------------#

# Conclusion

"""
Congratulations! You've built your first Machine Learning model with PySpark. Now to test!
"""

[Stage 46:>                                                         (0 + 1) / 1]

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |0.0       |[0.5293367346938775,0.47066326530612246]|
|1    |0.0       |[0.5293367346938775,0.47066326530612246]|
|0    |1.0       |[0.3564541013113911,0.6435458986886089] |
|1    |1.0       |[0.3564541013113911,0.6435458986886089] |
|1    |1.0       |[0.3564541013113911,0.6435458986886089] |
+-----+----------+----------------------------------------+
only showing top 5 rows



                                                                                

"\nCongratulations! You've built your first Machine Learning model with PySpark. Now to test!\n"

In [13]:
# exercise 07

"""
Evaluate the Decision Tree

You can assess the quality of your model by evaluating how well it performs on the testing data. Because the model was not trained on these data, this represents an objective assessment of the model.

A confusion matrix gives a useful breakdown of predictions versus known values. It has four cells which represent the counts of:

    True Negatives (TN) — model predicts negative outcome & known outcome is negative
    True Positives (TP) — model predicts positive outcome & known outcome is positive
    False Negatives (FN) — model predicts negative outcome but known outcome is positive
    False Positives (FP) — model predicts positive outcome but known outcome is negative.

These counts (TN, TP, FN and FP) should sum to the number of records in the testing data, which is only a subset of the flights data. You can compare to the number of records in the tests data, which is flights_test.count().

Note: These predictions are made on the testing data, so the counts are smaller than they would have been for predictions on the training data.
"""

# Instructions

"""

    Create a confusion matrix by counting the combinations of label and prediction. Display the result.
    Count the number of True Negatives, True Positives, False Negatives and False Positives.
    Calculate the accuracy.

"""

# solution

# Create a confusion matrix
prediction.groupBy('label', 'prediction').count().show()

# Calculate the elements of the confusion matrix
TN = prediction.filter('prediction = 0 AND label = prediction').count()
TP = prediction.filter('prediction = 1 AND label = prediction').count()
FN = prediction.filter('prediction = 0 AND label = 1').count()
FP = prediction.filter('prediction = 1 AND label = 0').count()

# Accuracy measures the proportion of correct predictions
accuracy = (TP + TN) / (TP + TN + FN + FP)
print(accuracy)

#----------------------------------#

# Conclusion

"""
The accuracy is decent but there are a lot of false predictions. We can make this model better!
"""

                                                                                

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1207|
|    0|       0.0| 2373|
|    1|       1.0| 3556|
|    0|       1.0| 2149|
+-----+----------+-----+



[Stage 59:>                                                         (0 + 1) / 1]

0.6385568120624664


                                                                                

'\nThe accuracy is decent but there are a lot of false predictions. We can make this model better!\n'

In [14]:
#other metrics
precision = TP / (TP + FP)
recall = TP / (TP + FN)

print(f'recall = {recall}\nprecision = {precision}')

recall = 0.7465882846945202
precision = 0.6233128834355828


In [15]:
# exercise 08

"""
Build a Logistic Regression model

You've already built a Decision Tree model using the flights data. Now you're going to create a Logistic Regression model on the same data.

The objective is to predict whether a flight is likely to be delayed by at least 15 minutes (label 1) or not (label 0).

Although you have a variety of predictors at your disposal, you'll only use the mon, depart and duration columns for the moment. These are numerical features which can immediately be used for a Logistic Regression model. You'll need to do a little more work before you can include categorical features. Stay tuned!

The data have been split into training and testing sets and are available as flights_train and flights_test.
"""

# Instructions

"""

    Import the class for creating a Logistic Regression classifier.
    Create a classifier object and train it on the training data.
    Make predictions for the testing data and create a confusion matrix.

"""

# solution

# Import the logistic regression class
from pyspark.ml.classification import LogisticRegression

# Create a classifier object and train on training data
logistic = LogisticRegression().fit(flights_train)

# Create predictions for the testing data and show confusion matrix
prediction = logistic.transform(flights_test)
prediction.groupBy('label', 'prediction').count().show()

#----------------------------------#

# Conclusion

"""
Now let's unpack that confusion matrix.
"""

[Stage 81:>                                                         (0 + 1) / 1]

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1705|
|    0|       0.0| 2561|
|    1|       1.0| 3058|
|    0|       1.0| 1961|
+-----+----------+-----+



                                                                                

"\nNow let's unpack that confusion matrix.\n"

In [16]:
# exercise 09

"""
Evaluate the Logistic Regression model

Accuracy is generally not a very reliable metric because it can be biased by the most common target class.

There are two other useful metrics:

    precision and
    recall.

Check the slides for this lesson to get the relevant expressions.

Precision is the proportion of positive predictions which are correct. For all flights which are predicted to be delayed, what proportion is actually delayed?

Recall is the proportion of positives outcomes which are correctly predicted. For all delayed flights, what proportion is correctly predicted by the model?

The precision and recall are generally formulated in terms of the positive target class. But it's also possible to calculate weighted versions of these metrics which look at both target classes.

The components of the confusion matrix are available as TN, TP, FN and FP, as well as the object prediction.
"""

# Instructions

"""

    Find the precision and recall.
    Create a multi-class evaluator and evaluate weighted precision.
    Create a binary evaluator and evaluate AUC using the "areaUnderROC" metric.

"""

# solution

from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# Calculate precision and recall
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

# Find weighted precision
multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})

# Find AUC
binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName : "areaUnderROC"})

#----------------------------------#
print(weighted_precision, auc)
# Conclusion

"""
The weighted precision indicates what proportion of predictions (positive and negative) are correct.
"""

precision = 0.62
recall    = 0.75


                                                                                

0.6049226844696598 0.6416634313426791


'\nThe weighted precision indicates what proportion of predictions (positive and negative) are correct.\n'

In [19]:
# exercise 10

"""
Punctuation, numbers and tokens

At the end of the previous chapter you loaded a dataset of SMS messages which had been labeled as either "spam" (label 1) or "ham" (label 0). You're now going to use those data to build a classifier model.

But first you'll need to prepare the SMS messages as follows:

    remove punctuation and numbers
    tokenize (split into individual words)
    remove stop words
    apply the hashing trick
    convert to TF-IDF representation.

In this exercise you'll remove punctuation and numbers, then tokenize the messages.

The SMS data are available as sms.
"""

# Instructions

"""

    Import the function to replace regular expressions and the feature to tokenize.
    Replace all punctuation characters from the text column with a space. Do the same for all numbers in the text column.
    Split the text column into tokens. Name the output column words.

"""

# solution

# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text,'[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

#----------------------------------#

# Conclusion

"""
Well done! Next you'll remove stop words and apply the hashing trick.
"""

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry I'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry I guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+------------------------------------------+
only showing top 4 rows



"\nWell done! Next you'll remove stop words and apply the hashing trick.\n"

In [20]:
# exercise 11

"""
Stop words and hashing

The next steps will be to remove stop words and then apply the hashing trick, converting the results into a TF-IDF.

A quick reminder about these concepts:

    The hashing trick provides a fast and space-efficient way to map a very large (possibly infinite) set of items (in this case, all words contained in the SMS messages) onto a smaller, finite number of values.
    The TF-IDF matrix reflects how important a word is to each document. It takes into account both the frequency of the word within each document but also the frequency of the word across all of the documents in the collection.

The tokenized SMS data are stored in sms in a column named words. You've cleaned up the handling of spaces in the data so that the tokenized text is neater.
"""

# Instructions

"""

    Import the StopWordsRemover, HashingTF and IDF classes.
    Create a StopWordsRemover object (input column words, output column terms). Apply to sms.
    Create a HashingTF object (input results from previous step, output column hash). Apply to wrangled.
    Create an IDF object (input results from previous step, output column features). Apply to wrangled.

"""

# solution

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(wrangled)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
      .transform(wrangled)

# Convert hashed symbols to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
      .fit(wrangled).transform(wrangled)
      
tf_idf.select('terms', 'features').show(4, truncate=False)

#----------------------------------#

# Conclusion

"""
Great! Now you're ready to build a spam classifier.
"""

                                                                                

+--------------------------------+----------------------------------------------------------------------------------------------------+
|terms                           |features                                                                                            |
+--------------------------------+----------------------------------------------------------------------------------------------------+
|[sorry, call, later, meeting]   |(1024,[138,384,577,996],[2.273418200008753,3.6288353225642043,3.5890949939146903,4.104259019279279])|
|[dont, worry, guess, busy]      |(1024,[215,233,276,329],[3.9913186080986836,3.3790235241678332,4.734227298217693,4.58299632849377]) |
|[call, freephone]               |(1024,[133,138],[5.367951058306837,2.273418200008753])                                              |
|[win, cash, prize, prize, worth]|(1024,[31,47,62,389],[3.6632029660684124,4.754846585420428,4.072170704727778,7.064594791043114])    |
+--------------------------------+--------------

"\nGreat! Now you're ready to build a spam classifier.\n"

In [22]:
from pyspark.ml.classification import LogisticRegression

In [24]:
tf_idf.show()

+---+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
| id|                text|label|               words|               terms|                hash|            features|
+---+--------------------+-----+--------------------+--------------------+--------------------+--------------------+
|  1|Sorry I'll call l...|    0|[sorry, i'll, cal...|[sorry, call, lat...|(1024,[138,384,57...|(1024,[138,384,57...|
|  2|Dont worry I gues...|    0|[dont, worry, i, ...|[dont, worry, gue...|(1024,[215,233,27...|(1024,[215,233,27...|
|  3| Call FREEPHONE now |    1|[call, freephone,...|   [call, freephone]|(1024,[133,138],[...|(1024,[133,138],[...|
|  4|Win a cash prize ...|    1|[win, a, cash, pr...|[win, cash, prize...|(1024,[31,47,62,3...|(1024,[31,47,62,3...|
|  5|Go until jurong p...|    0|[go, until, juron...|[go, jurong, poin...|(1024,[12,171,191...|(1024,[12,171,191...|
|  6|Ok lar Joking wif...|    0|[ok, lar, joking,...|[ok, lar, j

In [25]:
# exercise 12

"""
Training a spam classifier

The SMS data have now been prepared for building a classifier. Specifically, this is what you have done:

    removed numbers and punctuation
    split the messages into words (or "tokens")
    removed stop words
    applied the hashing trick and
    converted to a TF-IDF representation.

Next you'll need to split the TF-IDF data into training and testing sets. Then you'll use the training data to fit a Logistic Regression model and finally evaluate the performance of that model on the testing data.

The data are stored in sms and LogisticRegression has been imported for you.
"""

# Instructions

"""

    Split the data into training and testing sets in a 4:1 ratio. Set the random number seed to 13 to ensure repeatability.
    Create a LogisticRegression object and fit it to the training data.
    Generate predictions on the testing data.
    Use the predictions to form a confusion matrix.


"""

# solution

# Split the data into training and testing sets
sms_train, sms_test = tf_idf.randomSplit([0.8, 0.2], seed=13)

# Fit a Logistic Regression model to the training data
logistic = LogisticRegression(regParam=0.2).fit(sms_train)

# Make predictions on the testing data
prediction = logistic.transform(sms_test)

# Create a confusion matrix, comparing predictions to known labels
prediction.groupBy('label', 'prediction').count().show()

#----------------------------------#

# Conclusion

"""
Well played! Your classifier won't be fooled by spam SMS.
"""

[Stage 120:>                                                        (0 + 1) / 1]

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   41|
|    0|       0.0|  948|
|    1|       1.0|  105|
|    0|       1.0|    2|
+-----+----------+-----+



                                                                                

"\nWell played! Your classifier won't be fooled by spam SMS.\n"

In [26]:
spark.stop()