## Configure PySpark Setup

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"


import findspark
findspark.init()


import pyspark 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("App").getOrCreate()
spark

In [2]:
# check number of cores PySpark is using
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")

You are working with 1 core(s)


In [3]:
!cp /content/drive/MyDrive/Datasets.zip .
!unzip Datasets.zip

Archive:  Datasets.zip
   creating: Datasets/
  inflating: Datasets/fifa19.csv     
  inflating: Datasets/.DS_Store      
  inflating: Datasets/zomato.csv     
  inflating: Datasets/nyc_air_bnb.csv  
  inflating: Datasets/supermarket_sales.csv  
  inflating: Datasets/users3.parquet  
  inflating: Datasets/Toddler Autism dataset July 2018.csv  
   creating: Datasets/uw-madison-courses/
  inflating: Datasets/uw-madison-courses/course_offerings.csv  
  inflating: Datasets/uw-madison-courses/sections.csv  
  inflating: Datasets/uw-madison-courses/schedules.csv  
  inflating: Datasets/uw-madison-courses/database.sqlite3  
  inflating: Datasets/uw-madison-courses/rooms.csv  
  inflating: Datasets/uw-madison-courses/teachings.csv  
  inflating: Datasets/uw-madison-courses/subjects.csv  
  inflating: Datasets/uw-madison-courses/subject_memberships.csv  
  inflating: Datasets/uw-madison-courses/grade_distributions.csv  
  inflating: Datasets/uw-madison-courses/instructors.csv  
  inflating: Dat

# Load Libraries

In [4]:
from pyspark.ml.feature import *
from pyspark.sql.types import * 

from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.sql.functions import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics

import pandas as pd

# ML Model Building

In [5]:
path ="Datasets/"
df = spark.read.csv(path+'kickstarter.csv',inferSchema=True,header=True)

In [6]:
df.limit(4).toPandas()

Unnamed: 0,_c0,blurb,state
0,1,"Using their own character, users go on educati...",failed
1,2,"MicroFly is a quadcopter packed with WiFi, 6 s...",successful
2,3,"A small indie press, run as a collective for a...",failed
3,4,Zylor is a new baby cosplayer! Back this kicks...,failed


In [7]:
# Let's read a few full blurbs
df.show(4,False)

+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|_c0|blurb                                                                                                                              |state     |
+---+-----------------------------------------------------------------------------------------------------------------------------------+----------+
|1  |Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |failed    |
|2  |MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|successful|
|3  |A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |failed    |
|4  |Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share hi

In [8]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- blurb: string (nullable = true)
 |-- state: string (nullable = true)



In [9]:
df.count()

39853

In [10]:
def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows > 0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_counts.append(temp)
    return(null_columns_counts)

null_columns_calc_list = null_value_calc(df)
spark.createDataFrame(null_columns_calc_list, ['Column_Name', 'Null_Values_Count','Null_Value_Percent']).show()

+-----------+-----------------+------------------+
|Column_Name|Null_Values_Count|Null_Value_Percent|
+-----------+-----------------+------------------+
|      blurb|              207|0.5194088274408452|
|      state|             2273| 5.703460216294884|
+-----------+-----------------+------------------+



In [11]:
og_len = df.count()
drop_len = df.na.drop().count()
print("Total Rows that contain at least one null value:",og_len-drop_len)
print("Percentage of Rows that contain at least one null value:", (og_len-drop_len)/og_len)

Total Rows that contain at least one null value: 2273
Percentage of Rows that contain at least one null value: 0.05703460216294884


In [12]:
df = df.dropna()
df.count()

37580

In [13]:
# check target variable
df.groupBy("state").count().orderBy(col("count").desc()).show(truncate=False)

+------------------------------------------------------------------------------------------------------------+-----+
|state                                                                                                       |count|
+------------------------------------------------------------------------------------------------------------+-----+
|failed                                                                                                      |23064|
|successful                                                                                                  |13708|
| followed by a Q&A with cast & crew."                                                                       |2    |
| Inc.""."                                                                                                   |2    |
| with superhuman powers."                                                                                   |2    |
| ""Tomorrow Comes Today"" and more !"                          

In [14]:
# convert state into binary classification problem
df = df.filter("state IN('successful','failed')")
df.groupBy("state").count().orderBy(col("count").desc()).show(truncate=False)

+----------+-----+
|state     |count|
+----------+-----+
|failed    |23064|
|successful|13708|
+----------+-----+



In [15]:
df.select("blurb").show(10,False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills (ie Physics). |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|
|A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |
|Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world!    |
|Hatoful Boyfriend meet Skeletons! A comedy Dating Sim that pu

In [16]:
# Replace Slashes and parenthesis with spaces
df = df.withColumn("blurb",translate("blurb", "/()", "   ")) 
df.select("blurb").show(7,False)

+-----------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------+
|Using their own character, users go on educational quests around a virtual world leveling up subject-oriented skills  ie Physics . |
|MicroFly is a quadcopter packed with WiFi, 6 sensors, and 3 processors for ultimate stability -- and fits in the palm of your hand.|
|A small indie press, run as a collective for authors who want to self-publish, and a sexy, smart , hilarious novel!                |
|Zylor is a new baby cosplayer! Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world!    |
|Hatoful Boyfriend meet Skeletons! A comedy Dating Sim that pu

In [17]:
# Removing anything that is not a letter
df = df.withColumn("blurb",regexp_replace('blurb', '[^A-Za-z ]+', ''))
df.select("blurb").show(10,False)

+-------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                          |
+-------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills  ie Physics |
|MicroFly is a quadcopter packed with WiFi  sensors and  processors for ultimate stability  and fits in the palm of your hand   |
|A small indie press run as a collective for authors who want to selfpublish and a sexy smart  hilarious novel                  |
|Zylor is a new baby cosplayer Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world  |
|Hatoful Boyfriend meet Skeletons A comedy Dating Sim that puts you into a high school ful

In [18]:
# Remove multiple spaces
df = df.withColumn("blurb",regexp_replace('blurb', ' +', ' '))
df.select("blurb").show(4,False)

+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|Using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie Physics |
|MicroFly is a quadcopter packed with WiFi sensors and processors for ultimate stability and fits in the palm of your hand     |
|A small indie press run as a collective for authors who want to selfpublish and a sexy smart hilarious novel                  |
|Zylor is a new baby cosplayer Back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world |
+------------------------------------------------------------------------------------------------

In [19]:
# Lower case everything
df = df.withColumn("blurb",lower('blurb'))
df.select("blurb").show(4,False)

+------------------------------------------------------------------------------------------------------------------------------+
|blurb                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie physics |
|microfly is a quadcopter packed with wifi sensors and processors for ultimate stability and fits in the palm of your hand     |
|a small indie press run as a collective for authors who want to selfpublish and a sexy smart hilarious novel                  |
|zylor is a new baby cosplayer back this kickstarter to help fund new cosplay photoshoots to share his cuteness with the world |
+------------------------------------------------------------------------------------------------

In [20]:
# tokenize words
regex_tokenizer = RegexTokenizer(inputCol="blurb", outputCol="words", pattern="\\W")
raw_words = regex_tokenizer.transform(df)
raw_words.show(2,False)

+---+------------------------------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|_c0|blurb                                                                                                                         |state     |words                                                                                                                                            |
+---+------------------------------------------------------------------------------------------------------------------------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |using their own character users go on educational quests around a virtual world leveling up subjectoriented skills ie physics

In [21]:
# Remove Stopwords
# Define a list of stop words or use default list
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
stopwords = remover.getStopWords() 

# Display default list
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']

In [22]:
words_df = remover.transform(raw_words)
words_df.show(1,False)

+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|_c0|blurb                                                                                                                         |state |words                                                                                                                                            |filtered                                                                                                                  |
+---+------------------------------------------------------------------------------------------------------------------------------+------+-------------------------------------------

In [23]:
# converting state to string lables
# transform with string indexer
indexer = StringIndexer(inputCol="state", outputCol="label")  
words_df = indexer.fit(words_df).transform(words_df)
words_df.show(1)

+---+--------------------+------+--------------------+--------------------+-----+
|_c0|               blurb| state|               words|            filtered|label|
+---+--------------------+------+--------------------+--------------------+-----+
|  1|using their own c...|failed|[using, their, ow...|[using, character...|  0.0|
+---+--------------------+------+--------------------+--------------------+-----+
only showing top 1 row



In [24]:
# train test split
train = words_df.sampleBy("label", fractions={0: 0.7, 1: 0.7}, seed=10)
test = words_df.subtract(train)

In [25]:
train.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|16111|
|  1.0| 9538|
+-----+-----+



In [26]:
test.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6953|
|  1.0| 4170|
+-----+-----+



In [27]:
train.count(),test.count()

(25649, 11123)

In [28]:
def gbt_classifer(train_df,test_df):

    # Text Classification Model

    # Instaniate the classifier
    classifier = GBTClassifier(maxIter=5,maxDepth=5,maxBins=3)

    # Fit the model
    model = classifier.fit(train_df)

    # Generate predictions on test dataframe
    prediction = model.transform(test_df)

    #important: need to cast to float type, and order by prediction, else it won't work
    prediction = prediction.withColumn("label", prediction["label"].cast(FloatType())) 
    prediction = prediction.withColumn("prediction", prediction["prediction"].cast(FloatType())) 

    preds_and_labels = prediction.select(['prediction','label']).orderBy('prediction')
    metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

    print("====== confusion Matrix=======")
    print(metrics.confusionMatrix().toArray())

    # Overall statistics
    print(f"\n\nAccuracy {metrics.accuracy}")

    # Statistics by class
    labels = [0.0,1.0]
    for label in sorted(labels):
        print("Class %s precision = %s" % (label, metrics.precision(label)))
        print("Class %s recall = %s" % (label, metrics.recall(label)))
        print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))

    # Weighted stats
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)

    return

## Count Vectorizer



In [29]:
# Hashing (Count Vectroizer)
hashing = HashingTF(inputCol="filtered", outputCol="features", numFeatures=20)
train_hash = hashing.transform(train)
test_hash = hashing.transform(test)

In [30]:
train_hash.limit(3).toPandas()

Unnamed: 0,_c0,blurb,state,words,filtered,label,features
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",0.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, ..."
1,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",0.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
2,5,hatoful boyfriend meet skeletons a comedy dati...,failed,"[hatoful, boyfriend, meet, skeletons, a, comed...","[hatoful, boyfriend, meet, skeletons, comedy, ...",0.0,"(2.0, 0.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, ..."


In [31]:
gbt_classifer(train_hash,test_hash)

[[6719.  234.]
 [3935.  235.]]


Accuracy 0.6251910455812281
Class 0.0 precision = 0.6306551529941806
Class 0.0 recall = 0.9663454623903351
Class 0.0 F1 Measure = 0.7632191741920827
Class 1.0 precision = 0.5010660980810234
Class 1.0 recall = 0.05635491606714628
Class 1.0 F1 Measure = 0.10131493856434576
Weighted recall = 0.6251910455812281
Weighted precision = 0.5820723642692084
Weighted F(1) Score = 0.5150720320031352
Weighted F(0.5) Score = 0.49651649039387546
Weighted false positive rate = 0.6024906671237468


## TF-IDF Feature Extraction

In [32]:
# Hashing (Count Vectroizer)
hashing = HashingTF(inputCol="filtered", outputCol="raw_features", numFeatures=20)
train_hash = hashing.transform(train)
test_hash = hashing.transform(test)

In [33]:
# TF-IDF Transformation

idf = IDF(inputCol="raw_features", outputCol="features")
idf = idf.fit(train_hash)

train_tfidf = idf.transform(train_hash)
test_tfidf =  idf.transform(test_hash)

In [34]:
train_tfidf.limit(3).toPandas()

Unnamed: 0,_c0,blurb,state,words,filtered,label,raw_features,features
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",0.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 2.0, 0.0, ...","(2.233013653618827, 0.0, 0.885294631505191, 0...."
1,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",0.0,"(3.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...","(2.233013653618827, 0.0, 0.885294631505191, 0...."
2,5,hatoful boyfriend meet skeletons a comedy dati...,failed,"[hatoful, boyfriend, meet, skeletons, a, comed...","[hatoful, boyfriend, meet, skeletons, comedy, ...",0.0,"(2.0, 0.0, 1.0, 0.0, 1.0, 3.0, 0.0, 0.0, 0.0, ...","(1.488675769079218, 0.0, 0.885294631505191, 0...."


In [35]:
gbt_classifer(train_tfidf,test_tfidf)

[[6719.  234.]
 [3935.  235.]]


Accuracy 0.6251910455812281
Class 0.0 precision = 0.6306551529941806
Class 0.0 recall = 0.9663454623903351
Class 0.0 F1 Measure = 0.7632191741920827
Class 1.0 precision = 0.5010660980810234
Class 1.0 recall = 0.05635491606714628
Class 1.0 F1 Measure = 0.10131493856434576
Weighted recall = 0.6251910455812281
Weighted precision = 0.5820723642692084
Weighted F(1) Score = 0.5150720320031352
Weighted F(0.5) Score = 0.49651649039387546
Weighted false positive rate = 0.6024906671237468


## Word2Vec

In [36]:
# Word2Vec
word_vec = Word2Vec(vectorSize=10, minCount=1, inputCol="filtered", outputCol="features")
word_vec = word_vec.fit(train)

train_word_vec = word_vec.transform(train)
test_word_vec = word_vec.transform(test)

In [37]:
train_word_vec.limit(3).toPandas()

Unnamed: 0,_c0,blurb,state,words,filtered,label,features
0,1,using their own character users go on educatio...,failed,"[using, their, own, character, users, go, on, ...","[using, character, users, go, educational, que...",0.0,"[-0.020393035707197017, -0.008813660288329368,..."
1,3,a small indie press run as a collective for au...,failed,"[a, small, indie, press, run, as, a, collectiv...","[small, indie, press, run, collective, authors...",0.0,"[0.09818198370824878, -0.009876895851145187, 0..."
2,5,hatoful boyfriend meet skeletons a comedy dati...,failed,"[hatoful, boyfriend, meet, skeletons, a, comed...","[hatoful, boyfriend, meet, skeletons, comedy, ...",0.0,"[-0.0796588789422198, -0.08226457456060286, 0...."


In [38]:
gbt_classifer(train_word_vec,test_word_vec)

[[6091.  862.]
 [2200. 1970.]]


Accuracy 0.7247145554256945
Class 0.0 precision = 0.734652032324207
Class 0.0 recall = 0.8760247375233712
Class 0.0 F1 Measure = 0.7991340855418525
Class 1.0 precision = 0.6956214689265536
Class 1.0 recall = 0.4724220623501199
Class 1.0 F1 Measure = 0.56269637246501
Weighted recall = 0.7247145554256945
Weighted precision = 0.7200195186706769
Weighted F(1) Score = 0.7104938568687937
Weighted F(0.5) Score = 0.7128212504995067
Weighted false positive rate = 0.37626775555220343
