In [8]:
import findspark
findspark.init()

In [9]:

from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression ,RandomForestClassifier ,LinearSVC
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
from sklearn.model_selection import train_test_split
import pandas as pd

from pyspark import SparkConf
from apyori import apriori

In [10]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()
sc=spark.sparkContext

### Data Preprocessing

In [11]:
new_df=pd.read_csv('output.csv')

In [12]:
train_val, test = train_test_split(new_df, test_size=0.1, random_state=42)
train, val = train_test_split(train_val, test_size=0.22, random_state=42)

In [13]:
train.to_csv("train.csv")
val.to_csv("val.csv")
test.to_csv("test.csv")

In [14]:
new_df= train

In [15]:
output=new_df.copy()

###  Feature Extraction

In [16]:
# convert labels to int so we can apply classification on them
new_df.loc[new_df['state'] == 'failed','state'] = 0
new_df.loc[new_df['state'] == 'successful','state'] = 1

new_df['state'] = new_df['state'].astype('int64')

val.loc[val['state'] == 'failed','state'] = 0
val.loc[val['state'] == 'successful','state'] = 1

val['state'] = val['state'].astype('int64')


test.loc[test['state'] == 'failed','state'] = 0
test.loc[test['state'] == 'successful','state'] = 1

test['state'] = test['state'].astype('int64')

print(new_df['state'].value_counts())

0    139405
1     95360
Name: state, dtype: int64


In [17]:
# remove useless features
data_model = new_df.drop(['ID','name','goal','category','deadline','launched'],axis=1)
# remove useless features
val = val.drop(['ID','name','goal','category','deadline','launched'],axis=1)
# remove useless features
test = test.drop(['ID','name','goal','category','deadline','launched'],axis=1)

In [18]:
print(data_model.columns.values)

['main_category' 'currency' 'pledged' 'state' 'backers' 'country'
 'usd pledged']


In [19]:
df = spark.createDataFrame(data_model)
val = spark.createDataFrame(val)
test = spark.createDataFrame(test)

In [20]:
df.printSchema()

root
 |-- main_category: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- pledged: double (nullable = true)
 |-- state: long (nullable = true)
 |-- backers: double (nullable = true)
 |-- country: string (nullable = true)
 |-- usd pledged: double (nullable = true)



In [21]:
df.show(2)

+-------------+--------+-------+-----+-------+-------+-----------+
|main_category|currency|pledged|state|backers|country|usd pledged|
+-------------+--------+-------+-----+-------+-------+-----------+
|        Games|     GBP| 7529.0|    1|  206.0|     GB|   10798.12|
|          Art|     USD|  331.0|    0|    6.0|     US|      331.0|
+-------------+--------+-------+-----+-------+-------+-----------+
only showing top 2 rows



In [22]:
def encoding_features(feature_input,feature_output,df):
    qualification_indexer = StringIndexer(inputCol=feature_input, outputCol=feature_output)#Fits a model to the input dataset with optional parameters.
    df = qualification_indexer.fit(df).transform(df)
    # df.show()
    return df

In [23]:
def one_hot_encoder(index,vec,df):
    onehotencoder_qualification_vector = OneHotEncoder(inputCol=index, outputCol=vec)
    df = onehotencoder_qualification_vector.fit(df).transform(df)
    return df 

In [24]:
df=encoding_features("main_category","main_category_index",df)
df =one_hot_encoder("main_category_index","main_category_vec",df)
val=encoding_features("main_category","main_category_index",val)
val =one_hot_encoder("main_category_index","main_category_vec",val)
test=encoding_features("main_category","main_category_index",test)
test =one_hot_encoder("main_category_index","main_category_vec",test)

In [25]:
df=encoding_features("currency","currency_index",df)
df =one_hot_encoder("currency_index","currency_vec",df)
val=encoding_features("currency","currency_index",val)
val =one_hot_encoder("currency_index","currency_vec",val)
test=encoding_features("currency","currency_index",test)
test =one_hot_encoder("currency_index","currency_vec",test)

In [26]:
#
# ! Not required
#df=encoding_features("pledged","pledged_index",df)
#df =one_hot_encoder("pledged_index","pledged_vec",df)

In [27]:
df=encoding_features("country","country_index",df)
df=one_hot_encoder("country_index","country_vec",df)
val=encoding_features("country","country_index",val)
val=one_hot_encoder("country_index","country_vec",val)
test=encoding_features("country","country_index",test)
test=one_hot_encoder("country_index","country_vec",test)

In [28]:
df.columns

['main_category',
 'currency',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'main_category_index',
 'main_category_vec',
 'currency_index',
 'currency_vec',
 'country_index',
 'country_vec']

In [29]:
assembler = VectorAssembler(inputCols = ['main_category_vec', 'currency_vec' ,'pledged', 'backers' ,'country_vec',
 'usd pledged'], outputCol='features')


In [30]:
train = assembler.transform(df)
val = assembler.transform(val)
test = assembler.transform(test)

### ML Models

### Logistic Regression

In [33]:
lr = LogisticRegression(labelCol="state",maxIter=10)
lrn = lr.fit(train)

In [34]:
predictions = lrn.transform(val)

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.7805707424837259


### Linear SVM

In [48]:
lsvc = LinearSVC(maxIter=30, regParam=0.0001,labelCol="state")
lsvcModel = lsvc.fit(train)

In [49]:
predictions = lsvcModel.transform(val)
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.7360181163613487
