In [18]:
import findspark
findspark.init()

In [44]:

from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression ,RandomForestClassifier ,LinearSVC
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer
import pandas as pd

from pyspark import SparkConf
from apyori import apriori

In [20]:

spark=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()
sc=spark.sparkContext

### Data Preprocessing

In [45]:
new_df=pd.read_csv('D:/Fourth_Year/Second_Term/BD/KickStarter-Success-Prediction/output.csv')

In [46]:
output=new_df.copy()

###  Feature Extraction

In [22]:
# convert labels to int so we can apply classification on them
new_df.loc[new_df['state'] == 'failed','state'] = 0
new_df.loc[new_df['state'] == 'successful','state'] = 1

new_df['state'] = new_df['state'].astype('int64')
print(new_df['state'].value_counts())

0    364650
1    246026
Name: state, dtype: int64


In [23]:
# remove useless features
data_model = new_df.drop(['ID','name','goal','category','deadline','launched'],axis=1)

In [24]:
print(data_model.columns.values)

['main_category' 'currency' 'pledged' 'state' 'backers' 'country'
 'usd pledged']


In [25]:
df = spark.createDataFrame(data_model)

In [26]:
df.printSchema()

root
 |-- main_category: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- pledged: double (nullable = true)
 |-- state: long (nullable = true)
 |-- backers: double (nullable = true)
 |-- country: string (nullable = true)
 |-- usd pledged: double (nullable = true)



In [27]:
df.show(2)

+-------------+--------+-------+-----+-------+-------+-----------+
|main_category|currency|pledged|state|backers|country|usd pledged|
+-------------+--------+-------+-----+-------+-------+-----------+
|         Food|     USD|52375.0|    1|  224.0|     US|    52375.0|
|         Food|     USD| 1205.0|    1|   16.0|     US|     1205.0|
+-------------+--------+-------+-----+-------+-------+-----------+
only showing top 2 rows



In [28]:
def encoding_features(feature_input,feature_output,df):
    qualification_indexer = StringIndexer(inputCol=feature_input, outputCol=feature_output)#Fits a model to the input dataset with optional parameters.
    df = qualification_indexer.fit(df).transform(df)
    # df.show()
    return df

In [29]:
def one_hot_encoder(index,vec,df):
    onehotencoder_qualification_vector = OneHotEncoder(inputCol=index, outputCol=vec)
    df = onehotencoder_qualification_vector.fit(df).transform(df)
    return df 

In [30]:
df=encoding_features("main_category","main_category_index",df)
df =one_hot_encoder("main_category_index","main_category_vec",df)

In [31]:
df=encoding_features("currency","currency_index",df)
df =one_hot_encoder("currency_index","currency_vec",df)

In [32]:
df=encoding_features("pledged","pledged_index",df)
df =one_hot_encoder("pledged_index","pledged_vec",df)

In [33]:
df=encoding_features("country","country_index",df)
df=encoding_features("country_index","country_vec",df)

In [34]:
assembler = VectorAssembler(inputCols = ['main_category_vec', 'currency_vec' ,'pledged_vec', 'backers' ,'country_vec',
 'usd pledged'], outputCol='features')


In [35]:
output = assembler.transform(df)

### ML Models

In [36]:
train, val,test = output.randomSplit([0.7, 0.2,0.1])

### Logistic Regression

In [38]:
lr = LogisticRegression(labelCol="state",maxIter=10)
lrn = lr.fit(train)

In [39]:
predictions = lrn.transform(val)

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.8232507581226749


In [40]:
predictions = lrn.transform(test)

eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.8249852193907392


### Linear SVM

In [41]:
lsvc = LinearSVC(maxIter=10, regParam=0.1,labelCol="state")
lsvcModel = lsvc.fit(train)

In [42]:
predictions = lsvcModel.transform(val)
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.7944182478187383


In [43]:
predictions = lsvcModel.transform(test)
eval = BinaryClassificationEvaluator(rawPredictionCol = "prediction", labelCol = "state")
auc = eval.evaluate(predictions)
print(auc)

0.7977061661418743


### Apriori  Algorithm

In [58]:
records = []
for i in range(0, output.shape[0]):
    records.append([str(output.values[i,j]) for j in range(0, output.shape[1])])

KeyboardInterrupt: 

In [49]:
association_rules = apriori(records, min_support=0.0045, min_confidence=0.2, min_lift=3, min_length=2)
association_results = list(association_rules)

In [51]:
print(len(association_results))

4426


In [54]:
for item in association_rules:

    # first index of the inner list
    # Contains base item and add item
    pair = item[0] 
    items = [x for x in pair]
    print("Rule: " + items[0] + " -> " + items[1])

    #second index of the inner list
    print("Support: " + str(item[1]))

    #third index of the list located at 0th
    #of the third index of the inner list

    print("Confidence: " + str(item[2][0][2]))
    print("Lift: " + str(item[2][0][3]))
    print("=====================================") 

RelationRecord(items=frozenset({' ', 's'}), support=0.07692307692307693, ordered_statistics=[OrderedStatistic(items_base=frozenset({' '}), items_add=frozenset({'s'}), confidence=1.0, lift=4.333333333333333), OrderedStatistic(items_base=frozenset({'s'}), items_add=frozenset({' '}), confidence=0.3333333333333333, lift=4.333333333333333)])