In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from pyspark.ml.classification import LogisticRegression ,RandomForestClassifier ,LinearSVC
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import BinaryClassificationEvaluator 
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StringIndexer

from pyspark import SparkConf
# from apyori import apriori

###  Sparks Integration

In [4]:
# from pyspark.sql import SparkSession
# run this file once
spark_data=SparkSession.builder\
    .master("local[*]")\
    .appName("WordCount")\
    .getOrCreate()
sc=spark_data.sparkContext




In [5]:
def read_data_set(file_path,encode='cp850'):
    df= spark_data.read.option('header','true').csv( file_path)
    return df

### Read Data Set

In [6]:
df_data_set_1=read_data_set("D:/Fourth_Year/Second_Term/BD/KickStarter-Success-Prediction/ks-projects-201612.csv")
df_data_set_2=read_data_set("D:/Fourth_Year/Second_Term/BD/KickStarter-Success-Prediction/ks-projects-201801.csv")

In [7]:
df_data_set_1.show(5)

+----------+--------------------+--------------+--------------+---------+-------------------+-----+-------------------+--------+----------+--------+--------+------------+----+----+----+----+
|       ID |               name |     category |main_category |currency |          deadline |goal |          launched |pledged |    state |backers |country |usd pledged |_c13|_c14|_c15|_c16|
+----------+--------------------+--------------+--------------+---------+-------------------+-----+-------------------+--------+----------+--------+--------+------------+----+----+----+----+
|1000002330|The Songs of Adel...|        Poetry|    Publishing|      GBP|2015-10-09 11:36:00| 1000|2015-08-11 12:12:28|       0|    failed|       0|      GB|           0|null|null|null|null|
|1000004038|      Where is Hank?|Narrative Film|  Film & Video|      USD|2013-02-26 00:20:50|45000|2013-01-12 00:20:50|     220|    failed|       3|      US|         220|null|null|null|null|
|1000007540|ToshiCapital Reko...|         Mus

In [8]:
df_data_set_2.show(5)

+----------+--------------------+--------------+-------------+--------+----------+--------+-------------------+-------+--------+-------+-------+-----------+----------------+-------------+
|        ID|                name|      category|main_category|currency|  deadline|    goal|           launched|pledged|   state|backers|country|usd pledged|usd_pledged_real|usd_goal_real|
+----------+--------------------+--------------+-------------+--------+----------+--------+-------------------+-------+--------+-------+-------+-----------+----------------+-------------+
|1000002330|The Songs of Adel...|        Poetry|   Publishing|     GBP|2015-10-09| 1000.00|2015-08-11 12:12:28|   0.00|  failed|      0|     GB|       0.00|            0.00|      1533.95|
|1000003930|Greeting From Ear...|Narrative Film| Film & Video|     USD|2017-11-01|30000.00|2017-09-02 04:43:57|2421.00|  failed|     15|     US|     100.00|         2421.00|     30000.00|
|1000004038|      Where is Hank?|Narrative Film| Film & Vide

### Remove Repeated Columns and rename them

In [9]:
def remove_unnamed_columns(df,column):
    df = df.drop(column)
    return df

In [10]:
def remove_repeated_columns(df,column):
    df=df.drop(column)
    return df

In [11]:
def rename_columns_df(df):
    columns=list(df_data_set_1.columns)
    for column in columns:
       df= df.withColumnRenamed(column,column[:-1])
    return df


In [12]:
df_data_set_1=remove_unnamed_columns(df_data_set_1,"_c13")
df_data_set_1=remove_unnamed_columns(df_data_set_1,"_c14")
df_data_set_1=remove_unnamed_columns(df_data_set_1,"_c15")
df_data_set_1=remove_unnamed_columns(df_data_set_1,"_c16")
df_data_set_1.show(10)


+----------+--------------------+--------------+--------------+---------+-------------------+------+-------------------+--------+----------+--------+--------+------------+
|       ID |               name |     category |main_category |currency |          deadline | goal |          launched |pledged |    state |backers |country |usd pledged |
+----------+--------------------+--------------+--------------+---------+-------------------+------+-------------------+--------+----------+--------+--------+------------+
|1000002330|The Songs of Adel...|        Poetry|    Publishing|      GBP|2015-10-09 11:36:00|  1000|2015-08-11 12:12:28|       0|    failed|       0|      GB|           0|
|1000004038|      Where is Hank?|Narrative Film|  Film & Video|      USD|2013-02-26 00:20:50| 45000|2013-01-12 00:20:50|     220|    failed|       3|      US|         220|
|1000007540|ToshiCapital Reko...|         Music|         Music|      USD|2012-04-16 04:24:11|  5000|2012-03-17 03:24:11|       1|    failed|

In [13]:
df_data_set_2=remove_repeated_columns(df_data_set_2,"usd_pledged_real")
df_data_set_2=remove_repeated_columns(df_data_set_2,"usd_goal_real")

In [14]:
df_data_set_2.show(5)

+----------+--------------------+--------------+-------------+--------+----------+--------+-------------------+-------+--------+-------+-------+-----------+
|        ID|                name|      category|main_category|currency|  deadline|    goal|           launched|pledged|   state|backers|country|usd pledged|
+----------+--------------------+--------------+-------------+--------+----------+--------+-------------------+-------+--------+-------+-------+-----------+
|1000002330|The Songs of Adel...|        Poetry|   Publishing|     GBP|2015-10-09| 1000.00|2015-08-11 12:12:28|   0.00|  failed|      0|     GB|       0.00|
|1000003930|Greeting From Ear...|Narrative Film| Film & Video|     USD|2017-11-01|30000.00|2017-09-02 04:43:57|2421.00|  failed|     15|     US|     100.00|
|1000004038|      Where is Hank?|Narrative Film| Film & Video|     USD|2013-02-26|45000.00|2013-01-12 00:20:50| 220.00|  failed|      3|     US|     220.00|
|1000007540|ToshiCapital Reko...|         Music|        Mu

In [15]:
df_data_set_1=rename_columns_df(df_data_set_1)

### Merge Data Set

In [16]:
df = df_data_set_1.union(df_data_set_2)
df .show(5)

+----------+--------------------+--------------+-------------+--------+-------------------+-----+-------------------+-------+----------+-------+-------+-----------+
|        ID|                name|      category|main_category|currency|           deadline| goal|           launched|pledged|     state|backers|country|usd pledged|
+----------+--------------------+--------------+-------------+--------+-------------------+-----+-------------------+-------+----------+-------+-------+-----------+
|1000002330|The Songs of Adel...|        Poetry|   Publishing|     GBP|2015-10-09 11:36:00| 1000|2015-08-11 12:12:28|      0|    failed|      0|     GB|          0|
|1000004038|      Where is Hank?|Narrative Film| Film & Video|     USD|2013-02-26 00:20:50|45000|2013-01-12 00:20:50|    220|    failed|      3|     US|        220|
|1000007540|ToshiCapital Reko...|         Music|        Music|     USD|2012-04-16 04:24:11| 5000|2012-03-17 03:24:11|      1|    failed|      1|     US|          1|
|100001104

### Preprocessing Data Set

In [17]:
# drop all nulls remaining in name & category
df = df.dropna( subset=['name', 'category','country'])

In [18]:
df.columns

['ID',
 'name',
 'category',
 'main_category',
 'currency',
 'deadline',
 'goal',
 'launched',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged']

In [19]:
df =df.toPandas()  

In [20]:
#get null percentage

df.isnull().sum()/df.shape[0]

ID               0.000000
name             0.000000
category         0.000000
main_category    0.000000
currency         0.000000
deadline         0.000000
goal             0.000000
launched         0.000000
pledged          0.000000
state            0.000000
backers          0.000000
country          0.000000
usd pledged      0.010782
dtype: float64

In [21]:
df.head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375


In [22]:
# convert usd pledged to float values
df.loc[:,'usd pledged'] = pd.to_numeric(df['usd pledged'], downcast='float', errors='coerce')
# convert goal to float
df['goal'] = pd.to_numeric(df.goal, downcast='float', errors='coerce')

In [23]:
# fill all nan with 0
df = df.fillna(value=0.0)

In [24]:
# check nan in country
df[~df.country.str.contains('^[A-Z]{2}$', case=False)].country.value_counts()

N,0""      3790
"N,""0"    3783
0           348
1           234
failed      198
           ... 
289           1
483           1
109           1
288           1
25.00         1
Name: country, Length: 248, dtype: int64

In [25]:
# Replace null countries with None
replace = df[~df.country.str.contains('^[A-Z]{2}$', case=False)].country.unique().tolist()
df.loc[:,'country'] = df.country.replace(to_replace=replace, value='None')

In [26]:
df.loc[:,'backers'] = pd.to_numeric(df.backers, errors='coerce', downcast='integer')

In [27]:
df = df.dropna(axis=0, subset=['backers'])

In [28]:
df.head(10)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000.0,2015-08-11 12:12:28,0.0,failed,0.0,GB,0.0
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000.0,2013-01-12 00:20:50,220.0,failed,3.0,US,220.0
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000.0,2012-03-17 03:24:11,1.0,failed,1.0,US,1.0
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14.0,US,1283.0
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000.0,2016-02-26 13:38:27,52375.0,successful,224.0,US,52375.0
5,1000023410,Support Solar Roasted Coffee & Green Energy! ...,Food,Food,USD,2014-12-21 18:30:44,1000.0,2014-12-01 18:30:44,1205.0,successful,16.0,US,1205.0
6,1000030581,Chaser Strips. Our Strips make Shots their B*tch!,Drinks,Food,USD,2016-03-17 19:05:12,25000.0,2016-02-01 20:05:12,453.0,failed,40.0,US,453.0
7,1000034518,SPIN - Premium Retractable In-Ear Headphones w...,Product Design,Design,USD,2014-05-29 18:14:43,125000.0,2014-04-24 18:14:43,8233.0,canceled,58.0,US,8233.0
8,100004195,STUDIO IN THE SKY - A Documentary Feature Film...,Documentary,Film & Video,USD,2014-08-10 21:55:48,65000.0,2014-07-11 21:55:48,6240.57,canceled,43.0,US,6240.569824
9,100004721,Of Jesus and Madmen,Nonfiction,Publishing,CAD,2013-10-09 18:19:37,2500.0,2013-09-09 18:19:37,0.0,failed,0.0,CA,0.0


In [29]:
df["state"].value_counts()

failed                 364650
successful             246026
canceled                70955
live                     7216
undefined                7105
                        ...  
2016-02-22 21:57:30         1
2015-01-28 13:53:56         1
2014-10-06 10:58:27         1
2015-10-10 01:00:00         1
2013-04-30                  1
Name: state, Length: 204, dtype: int64

In [30]:
failed=df.loc[df['state'] == "failed"  ]

In [31]:
successed =df.loc[df['state'] == "successful"]

In [32]:
new_frames = [successed,failed]
new_df = pd.concat(new_frames)

In [33]:
new_df["state"].value_counts()

failed        364650
successful    246026
Name: state, dtype: int64

In [34]:
new_df.to_csv('output.csv',index=False)