In [1]:
import findspark
findspark.init()

In [30]:

from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression ,RandomForestClassifier ,LinearSVC
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark import SparkConf

### Sparks Integration

In [3]:
spark=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()



In [4]:
sc=spark.sparkContext

In [5]:
%run data_preprovessing.ipynb

  {


###  Feature Extraction

In [6]:
# convert labels to int so we can apply classification on them
new_df.loc[new_df['state'] == 'failed','state'] = 0
new_df.loc[new_df['state'] == 'successful','state'] = 1

new_df['state'] = new_df['state'].astype('int64')
print(new_df['state'].value_counts())

0    365934
1    247037
Name: state, dtype: int64


In [7]:
# remove useless features
data_model = new_df.drop(['ID','name','goal','category','deadline','launched'],axis=1)

In [8]:
print(data_model.columns.values)

['main_category' 'currency' 'pledged' 'state' 'backers' 'country'
 'usd pledged']


In [9]:
data_model.head(10)

Unnamed: 0,main_category,currency,pledged,state,backers,country,usd pledged
4,Food,USD,52375.0,1,224.0,US,52375.0
5,Food,USD,1205.0,1,16.0,US,1205.0
10,Music,USD,12700.0,1,100.0,US,12700.0
16,Music,USD,250.0,1,7.0,US,250.0
18,Fashion,USD,34268.0,1,624.0,US,34268.0
21,Comics,USD,701.66,1,66.0,US,701.659973
22,Music,USD,15827.0,1,147.0,US,15827.0
24,Film & Video,CAD,48905.0,1,571.0,CA,43203.25
25,Comics,GBP,112.38,1,27.0,GB,167.700302
27,Design,USD,47266.0,1,549.0,US,11253.0


In [12]:
df = spark.createDataFrame(data_model)

In [13]:
df.printSchema()

root
 |-- main_category: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- pledged: string (nullable = true)
 |-- state: long (nullable = true)
 |-- backers: double (nullable = true)
 |-- country: string (nullable = true)
 |-- usd pledged: double (nullable = true)



In [14]:
df.show(2)

+-------------+--------+-------+-----+-------+-------+-----------+
|main_category|currency|pledged|state|backers|country|usd pledged|
+-------------+--------+-------+-----+-------+-------+-----------+
|         Food|     USD|  52375|    1|  224.0|     US|    52375.0|
|         Food|     USD|   1205|    1|   16.0|     US|     1205.0|
+-------------+--------+-------+-----+-------+-------+-----------+
only showing top 2 rows



In [15]:
def encoding_features(feature_input,feature_output,df):
    qualification_indexer = StringIndexer(inputCol=feature_input, outputCol=feature_output)#Fits a model to the input dataset with optional parameters.
    df = qualification_indexer.fit(df).transform(df)
    # df.show()
    return df

In [16]:
def one_hot_encoder(index,vec,df):
    onehotencoder_qualification_vector = OneHotEncoder(inputCol=index, outputCol=vec)
    df = onehotencoder_qualification_vector.fit(df).transform(df)
    return df 

In [17]:
df=encoding_features("main_category","main_category_index",df)
df =one_hot_encoder("main_category_index","main_category_vec",df)

In [18]:
df=encoding_features("currency","currency_index",df)
df =one_hot_encoder("currency_index","currency_vec",df)

In [19]:
df=encoding_features("pledged","pledged_index",df)
df =one_hot_encoder("pledged_index","pledged_vec",df)

In [20]:
df=encoding_features("country","country_index",df)
df=encoding_features("country_index","country_vec",df)

In [21]:
assembler = VectorAssembler(inputCols = ['main_category_vec', 'currency_vec' ,'pledged_vec', 'backers' ,'country_vec',
 'usd pledged'], outputCol='features')


In [22]:
output = assembler.transform(df)

### ML Models

In [23]:
train, test = output.randomSplit([0.7, 0.3])

### Logistic Regression

In [24]:
lr = LogisticRegression(labelCol="state",maxIter=10)
lrn = lr.fit(train)

In [25]:
predictions = lrn.transform(test)

In [26]:

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.777432319600384


### Linear SVM

In [32]:
lsvc = LinearSVC(maxIter=10, regParam=0.1,labelCol="state")
lsvcModel = lsvc.fit(train)

In [33]:
predictions = lsvcModel.transform(test)


In [34]:
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.7476256918850882
