In [1]:
import findspark
findspark.init()

In [2]:

from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression ,RandomForestClassifier ,LinearSVC
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
import pandas as pd

from pyspark import SparkConf
from apyori import apriori

In [3]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()
sc=spark.sparkContext

23/05/05 16:38:53 WARN Utils: Your hostname, jimbo-G5-5587 resolves to a loopback address: 127.0.1.1; using 192.168.1.5 instead (on interface wlp0s20f3)
23/05/05 16:38:53 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/05 16:38:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Data Preprocessing

In [59]:
new_df=pd.read_csv('data/output.csv')

In [5]:
output=new_df.copy()

###  Feature Extraction

In [60]:
# convert labels to int so we can apply classification on them
new_df.loc[new_df['state'] == 'failed','state'] = 0
new_df.loc[new_df['state'] == 'successful','state'] = 1

new_df['state'] = new_df['state'].astype('int64')
print(new_df['state'].value_counts())

0    364650
1    246026
Name: state, dtype: int64


In [7]:
# remove useless features
data_model = new_df.drop(['ID','name','goal','category','deadline','launched'],axis=1)

In [8]:
print(data_model.columns.values)

['main_category' 'currency' 'pledged' 'state' 'backers' 'country'
 'usd pledged']


In [9]:
df = spark.createDataFrame(data_model)

In [10]:
df.printSchema()

root
 |-- main_category: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- pledged: double (nullable = true)
 |-- state: long (nullable = true)
 |-- backers: double (nullable = true)
 |-- country: string (nullable = true)
 |-- usd pledged: double (nullable = true)



In [11]:
df.show(2)

23/05/05 16:39:35 WARN TaskSetManager: Stage 0 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
[Stage 0:>                                                          (0 + 1) / 1]

+-------------+--------+-------+-----+-------+-------+-----------+
|main_category|currency|pledged|state|backers|country|usd pledged|
+-------------+--------+-------+-----+-------+-------+-----------+
|         Food|     USD|52375.0|    1|  224.0|     US|    52375.0|
|         Food|     USD| 1205.0|    1|   16.0|     US|     1205.0|
+-------------+--------+-------+-----+-------+-------+-----------+
only showing top 2 rows



                                                                                

In [12]:
def encoding_features(feature_input,feature_output,df):
    qualification_indexer = StringIndexer(inputCol=feature_input, outputCol=feature_output)#Fits a model to the input dataset with optional parameters.
    df = qualification_indexer.fit(df).transform(df)
    # df.show()
    return df

In [13]:
def one_hot_encoder(index,vec,df):
    onehotencoder_qualification_vector = OneHotEncoder(inputCol=index, outputCol=vec)
    df = onehotencoder_qualification_vector.fit(df).transform(df)
    return df 

In [14]:
df=encoding_features("main_category","main_category_index",df)
df =one_hot_encoder("main_category_index","main_category_vec",df)

23/05/05 16:39:36 WARN TaskSetManager: Stage 1 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [15]:
df=encoding_features("currency","currency_index",df)
df =one_hot_encoder("currency_index","currency_vec",df)

23/05/05 16:39:39 WARN TaskSetManager: Stage 4 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [17]:
#
# ! Not required
#df=encoding_features("pledged","pledged_index",df)
#df =one_hot_encoder("pledged_index","pledged_vec",df)

In [18]:
df=encoding_features("country","country_index",df)
df=one_hot_encoder("country_index","country_vec",df)

23/05/05 16:39:40 WARN TaskSetManager: Stage 7 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.


In [19]:
df.columns

['main_category',
 'currency',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'main_category_index',
 'main_category_vec',
 'currency_index',
 'currency_vec',
 'country_index',
 'country_vec']

In [20]:
assembler = VectorAssembler(inputCols = ['main_category_vec', 'currency_vec' ,'pledged', 'backers' ,'country_vec',
 'usd pledged'], outputCol='features')


In [21]:
output = assembler.transform(df)

### ML Models

In [22]:
train, val,test = output.randomSplit([0.7, 0.2,0.1])

### Logistic Regression

In [26]:
lr = LogisticRegression(labelCol="state",maxIter=10)
lrn = lr.fit(train)

23/05/05 16:43:58 WARN TaskSetManager: Stage 60 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:43:59 WARN TaskSetManager: Stage 62 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:44:01 WARN TaskSetManager: Stage 64 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:44:01 WARN TaskSetManager: Stage 66 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:44:01 WARN TaskSetManager: Stage 68 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:44:01 WARN TaskSetManager: Stage 70 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:44:01 WARN TaskSetManager: Stage 72 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/

In [27]:
predictions = lrn.transform(val)

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

23/05/05 16:44:05 WARN TaskSetManager: Stage 88 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

0.7698174512180909


In [28]:
predictions = lrn.transform(test)

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

23/05/05 16:44:08 WARN TaskSetManager: Stage 99 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

0.7702668233634103


### Linear SVM

In [29]:
lsvc = LinearSVC(maxIter=10, regParam=0.1,labelCol="state")
lsvcModel = lsvc.fit(train)

23/05/05 16:45:18 WARN TaskSetManager: Stage 110 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:20 WARN TaskSetManager: Stage 112 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:22 WARN TaskSetManager: Stage 114 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:22 WARN TaskSetManager: Stage 116 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:22 WARN TaskSetManager: Stage 118 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:22 WARN TaskSetManager: Stage 120 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
23/05/05 16:45:22 WARN TaskSetManager: Stage 122 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.

In [30]:
predictions = lsvcModel.transform(val)
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

23/05/05 16:45:45 WARN TaskSetManager: Stage 166 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
[Stage 166:>                                                      (0 + 12) / 12]

0.5842423322045236


                                                                                

In [31]:
predictions = lsvcModel.transform(test)
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

23/05/05 16:45:49 WARN TaskSetManager: Stage 177 contains a task of very large size (1882 KiB). The maximum recommended task size is 1000 KiB.
[Stage 177:>                                                      (0 + 12) / 12]

0.582565673494293


                                                                                

### Apriori  Algorithm

In [46]:
output.columns

['main_category',
 'currency',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'main_category_index',
 'main_category_vec',
 'currency_index',
 'currency_vec',
 'country_index',
 'country_vec',
 'features']

In [61]:
# Now for the apriori algorithm
# Will use the new_df dataframe

# First, some minor data cleaning
print(len(new_df['name'].unique()) / len(new_df['name']))
# There are duplicate rows in the dataframe, use the name column to remove them
new_df.drop_duplicates(subset ="name", keep = False, inplace = True)
print(len(new_df['name'].unique()) / len(new_df['name']))

0.5476275471772265
1.0


In [62]:
new_df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged
25,1000235643,HIIT Bottle�,Drinks,Food,USD,2015-04-27 19:33:08,15000.0,2015-03-13 18:33:08,124998.0,1,2784.0,US,124998.0
44,1000519001,Wordwright: re�de�fin�ing word play,Tabletop Games,Games,USD,2015-12-29 02:00:00,2000.0,2015-11-21 14:12:27,14794.0,1,624.0,US,14794.0
116,1001484182,Daweyu Hills � Crop to Cup,Drinks,Food,USD,2015-10-15 21:01:47,25000.0,2015-09-15 21:01:47,25816.0,1,172.0,US,25816.0
160,1002234028,�mp:dance / amiti perry + company,Dance,Dance,USD,2011-05-30 05:59:00,3500.0,2011-05-18 20:06:41,3520.0,1,30.0,US,3520.0
164,1002287054,Michale Graves �Vagabond Acoustic� Extremely L...,Rock,Music,USD,2013-05-27 01:54:34,2500.0,2013-05-06 01:54:34,12413.0,1,240.0,US,12413.0


In [49]:
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)

In [51]:
print(len(association_results))

4426


In [54]:
for item in association_rules:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================") 

RelationRecord(items=frozenset({' ', 's'}), support=0.07692307692307693, ordered_statistics=[OrderedStatistic(items_base=frozenset({' '}), items_add=frozenset({'s'}), confidence=1.0, lift=4.333333333333333), OrderedStatistic(items_base=frozenset({'s'}), items_add=frozenset({' '}), confidence=0.3333333333333333, lift=4.333333333333333)])