In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 72 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 70.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=ee6428de59f88731082dd90e26b0e9810bf8985fc1fb0edf81f9d8ccfa2d3696
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [None]:
import pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark=SparkSession.builder.appName('NLP').getOrCreate()

In [None]:
# Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [94]:
file_path = '/content/drive/MyDrive/'
data=spark.read.csv(file_path +'SMSSpamCollection',inferSchema=True,sep='\t')

In [95]:
data.show()

+----+--------------------+
| _c0|                 _c1|
+----+--------------------+
| ham|Go until jurong p...|
| ham|Ok lar... Joking ...|
|spam|Free entry in 2 a...|
| ham|U dun say so earl...|
| ham|Nah I don't think...|
|spam|FreeMsg Hey there...|
| ham|Even my brother i...|
| ham|As per your reque...|
|spam|WINNER!! As a val...|
|spam|Had your mobile 1...|
| ham|I'm gonna be home...|
|spam|SIX chances to wi...|
|spam|URGENT! You have ...|
| ham|I've been searchi...|
| ham|I HAVE A DATE ON ...|
|spam|XXXMobileMovieClu...|
| ham|Oh k...i'm watchi...|
| ham|Eh u remember how...|
| ham|Fine if thats th...|
|spam|England v Macedon...|
+----+--------------------+
only showing top 20 rows



In [96]:
data=data.withColumnRenamed('_c0','class').withColumnRenamed('_c1','text')

In [97]:
data.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|  ham|Go until jurong p...|
|  ham|Ok lar... Joking ...|
| spam|Free entry in 2 a...|
|  ham|U dun say so earl...|
|  ham|Nah I don't think...|
| spam|FreeMsg Hey there...|
|  ham|Even my brother i...|
|  ham|As per your reque...|
| spam|WINNER!! As a val...|
| spam|Had your mobile 1...|
|  ham|I'm gonna be home...|
| spam|SIX chances to wi...|
| spam|URGENT! You have ...|
|  ham|I've been searchi...|
|  ham|I HAVE A DATE ON ...|
| spam|XXXMobileMovieClu...|
|  ham|Oh k...i'm watchi...|
|  ham|Eh u remember how...|
|  ham|Fine if thats th...|
| spam|England v Macedon...|
+-----+--------------------+
only showing top 20 rows



In [98]:
from pyspark.sql.functions import length

In [99]:
data=data.withColumn('length',length(data['text']))

In [100]:
data.show()

+-----+--------------------+------+
|class|                text|length|
+-----+--------------------+------+
|  ham|Go until jurong p...|   111|
|  ham|Ok lar... Joking ...|    29|
| spam|Free entry in 2 a...|   155|
|  ham|U dun say so earl...|    49|
|  ham|Nah I don't think...|    61|
| spam|FreeMsg Hey there...|   147|
|  ham|Even my brother i...|    77|
|  ham|As per your reque...|   160|
| spam|WINNER!! As a val...|   157|
| spam|Had your mobile 1...|   154|
|  ham|I'm gonna be home...|   109|
| spam|SIX chances to wi...|   136|
| spam|URGENT! You have ...|   155|
|  ham|I've been searchi...|   196|
|  ham|I HAVE A DATE ON ...|    35|
| spam|XXXMobileMovieClu...|   149|
|  ham|Oh k...i'm watchi...|    26|
|  ham|Eh u remember how...|    81|
|  ham|Fine if thats th...|    56|
| spam|England v Macedon...|   155|
+-----+--------------------+------+
only showing top 20 rows



In [101]:
data.groupby('class').count().show()

+-----+-----+
|class|count|
+-----+-----+
|  ham| 4827|
| spam|  747|
+-----+-----+



In [102]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer

In [103]:
tok=Tokenizer(inputCol='text',outputCol='toktext')
stopw=StopWordsRemover(inputCol='toktext',outputCol='stopremovedtext')
countv=CountVectorizer(inputCol='stopremovedtext',outputCol='countvwords')
idf=IDF(inputCol='countvwords',outputCol='tfidf')
classnumeric=StringIndexer(inputCol='class',outputCol='label')

In [104]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['tfidf','length'],outputCol="features")

In [105]:
from pyspark.ml.classification import NaiveBayes

In [106]:
nb=NaiveBayes()

In [107]:
from pyspark.ml import Pipeline

In [108]:
data_pipe=Pipeline(stages=[classnumeric,tok,stopw,countv,idf,featureassembler])

In [109]:
mod=data_pipe.fit(data)

In [110]:
datan=mod.transform(data)

In [111]:
datan.show()

+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|class|                text|length|label|             toktext|     stopremovedtext|         countvwords|               tfidf|            features|
+-----+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  ham|Go until jurong p...|   111|  0.0|[go, until, juron...|[go, jurong, poin...|(13423,[7,11,31,6...|(13423,[7,11,31,6...|(13424,[7,11,31,6...|
|  ham|Ok lar... Joking ...|    29|  0.0|[ok, lar..., joki...|[ok, lar..., joki...|(13423,[0,24,297,...|(13423,[0,24,297,...|(13424,[0,24,297,...|
| spam|Free entry in 2 a...|   155|  1.0|[free, entry, in,...|[free, entry, 2, ...|(13423,[2,13,19,3...|(13423,[2,13,19,3...|(13424,[2,13,19,3...|
|  ham|U dun say so earl...|    49|  0.0|[u, dun, say, so,...|[u, dun, say, ear...|(13423,[0,70,80,1...|(13423,[0,70,8

In [112]:
datan=datan.select('label','features')

In [113]:
datan.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(13424,[7,11,31,6...|
|  0.0|(13424,[0,24,297,...|
|  1.0|(13424,[2,13,19,3...|
|  0.0|(13424,[0,70,80,1...|
|  0.0|(13424,[36,134,31...|
|  1.0|(13424,[10,60,139...|
|  0.0|(13424,[10,53,103...|
|  0.0|(13424,[125,184,4...|
|  1.0|(13424,[1,47,118,...|
|  1.0|(13424,[0,1,13,27...|
|  0.0|(13424,[18,43,120...|
|  1.0|(13424,[8,17,37,8...|
|  1.0|(13424,[13,30,47,...|
|  0.0|(13424,[39,96,217...|
|  0.0|(13424,[552,1697,...|
|  1.0|(13424,[30,109,11...|
|  0.0|(13424,[82,214,47...|
|  0.0|(13424,[0,2,49,13...|
|  0.0|(13424,[0,74,105,...|
|  1.0|(13424,[4,30,33,5...|
+-----+--------------------+
only showing top 20 rows



In [114]:
traindata,testdata = datan.randomSplit([0.7,0.3])

In [116]:
nbfit=nb.fit(traindata)

In [117]:
test=nbfit.transform(testdata)

In [118]:
test.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(13424,[0,1,3,9,1...|[-574.63929189276...|[0.99999999999937...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-882.98113532224...|[1.0,8.6285976001...|       0.0|
|  0.0|(13424,[0,1,7,8,1...|[-1184.2966068083...|[1.0,2.2348423865...|       0.0|
|  0.0|(13424,[0,1,11,32...|[-882.69724849378...|[1.0,3.4449373158...|       0.0|
|  0.0|(13424,[0,1,14,31...|[-214.77475920545...|[1.0,1.2726229582...|       0.0|
|  0.0|(13424,[0,1,14,78...|[-703.99929311244...|[1.0,5.7394041007...|       0.0|
|  0.0|(13424,[0,1,18,20...|[-835.16526882744...|[1.0,4.7415982228...|       0.0|
|  0.0|(13424,[0,1,21,27...|[-752.59341254885...|[1.0,6.5910461968...|       0.0|
|  0.0|(13424,[0,1,30,12...|[-614.24560582438...|[1.0,4.4344695311...|       0.0|
|  0.0|(13424,[0

In [119]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [120]:
acc=MulticlassClassificationEvaluator()

In [121]:
acc.evaluate(test)

0.9204908978593299