In [3]:
## load required libraries 
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from pdpbox import pdp, get_dataset, info_plots
from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import pandas as pd 
import numpy as np 
import eli5
import re

In [8]:
#Load the dataset
df = pd.read_csv("../input/ks-projects-201801.csv",  parse_dates = ["launched", "deadline"])
df_copy = df.copy()  #keep a copy of the data frame that remains intact
print ("Number of Campaigns: ", df.shape[0], "\nNumber of Columns: ", df.shape[1])
df.head()

Number of Campaigns:  378661 
Number of Columns:  15


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


As of every machine learning problem, we need to explore the data and look out for problematic cases. These steps can be listed below:
* Outliers
* Missing values
* Unwanted information/labels
* Encode categorical values to numerical
* Extract Features

Let's see if there are any missing values. We will deal with outliers by considering features relevant to mean and median of each category. 

In [199]:
df.isnull().sum().sort_values(ascending = False)

usd pledged         3797
name                   4
usd_goal_real          0
usd_pledged_real       0
country                0
backers                0
state                  0
pledged                0
launched               0
goal                   0
deadline               0
currency               0
main_category          0
category               0
ID                     0
dtype: int64

So as it shows, usd_pledged has around 1% of it's values missing and there are 4 campaigns with missing names. Let's take a further look.

In [32]:
display(df[df['usd pledged'].isnull()].shape)
df[df['usd pledged'].isnull()].head()

(0, 15)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real


It seems reasonable to keep these 3797 rows and substitute the NAN values with the corresponding usd_pledged_real and drop those 4 rows with no name. 
Also a glance at the country column shows that some country entries seem to be incorrect with the '*N,0"*' entry. We should also consider taking care of that. 
Since most of the campaigns are from the states (78.85%) we will use a indicator of being from the states or not. Hence these values will turn to 0 (not from the states, assuming that '*N,0"*' does not indicate that).

In [33]:
df['usd pledged'].fillna(df['usd_pledged_real'], inplace=True)
df.dropna(inplace=True)

In [34]:
#Let's make sure the null values have been taken care of:
df.isnull().sum().sort_values(ascending = False)

usd_goal_real       0
usd_pledged_real    0
usd pledged         0
country             0
backers             0
state               0
pledged             0
launched            0
goal                0
deadline            0
currency            0
main_category       0
category            0
name                0
ID                  0
dtype: int64

Great! So now that we have dealt with the missing values, let's delve into the feature extraction step. Below is a list of useful features that we will be using:
*Features focusing on the name of the campaign can be:*
* Number of words in the name
* Number of syllables
* Presence of particular punctuation (e.g. ? !)
* Number of characters
* Ratio between vowels and alphanumeric length of the name
* Whether the name is uppercase or not (Maybe names in uppercase can attract more attention to potential backers)

These features are useful because we are interested to know whether a specific range of words, characters or any sort of pattern is effective in the campaign's success.
Other important features could be those relevant to the time of launch and its durations. There may be a correlation between the time and day/month a campaign is launched.
Some features in this aspect could be:
* Launched hour
* Launched day
* Launched week
* Launched month
* Whether it was a weekend or not
* Duration of the campaign 

Another group of features which is obviously effective is based on the goal and pledged amount. We can extract new features by considering the mean goal in each category and main category to see if a specific campaign is an outlier compared to other campaignes of its field, etc. Below is a list of such features:
* Mean of the category's goal
* Median of the category's goal
* Count of the categoty (to see whether the campaign is relatively popular or not)
* Mean of the main category's goal
* Median of the main category's goal
* Count of the main category
* Difference between a campaign's goal and mean of its category
* Pledged amount as a percentage of the goal
* Pledged amount per backer 
* Average Pledged amount per category 
* Difference between pledged amount per backer for a campaign and the average received in that category

Another interesting feature could be the country where the campaign is launched.

The following blocks computes the features listed above.

In [35]:
 #Let's only keep the successful and failed labels and turn it into a binary classification:
df.query("state in ['failed', 'successful']", inplace=True)

In [36]:
df.shape[0]

331672

In [37]:
def syllable_count(word):
    """Count the syllables in a word"""
    word = word.lower()
    vowels = "aeiouy"
    count = 0
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

In [38]:
def countVowels2LettersRatio(word):
    '''Count ratio between vowels and letters'''
    word = str(word)
    count = 1  
    vowels = 0
    for i in word:
        if i.isalpha():
            count = count + 1
            if i in 'aeiou':
                vowels = vowels + 1
    return ((vowels * 1.0) / count)



In [39]:
## feature extraction for the campaign name

df["num_syllable"] = df["name"].apply(lambda x: syllable_count(x))
df["num_words"] = df["name"].apply(lambda x: len(x.split()))
df["num_chars"] = df["name"].apply(lambda x: len(x.replace(" ","")))
df['has_exclaimation_mark'] = int(len(re.findall(r'\!', str(df.name.str[:]))) != 0)  #presence of ! in the name
df['has_question_mark'] = int(len(re.findall(r'\?', str(df.name.str[:]))) != 0)  #presence of ? in the name
df['name_is_upper'] = df.name.str.isupper().astype(float)# if name is uppercase
df['name_vowel_ratio'] = df.name.apply(countVowels2LettersRatio)# for each name calculate vowels ratio




## feature extraction for the campaign length and timing
df["launched_month"] = df["launched"].dt.month
df["launched_week"] = df["launched"].dt.week
df["launched_day"] = df["launched"].dt.weekday
df["is_weekend"] = df["launched_day"].apply(lambda x: 1 if x > 4 else 0)
df["duration_in_days"] = df["deadline"] - df["launched"]
df["duration_in_days"] = df["duration_in_days"].apply(lambda x: int(str(x).split()[0]))

## label encoding the categorical features. This is necessary for the features related to goal and pledged amounts
#df = pd.concat([df, pd.get_dummies(df["main_category"])], axis = 1)
label_encoder = LabelEncoder()
for cat in ["category", "main_category"]:
    df[cat] = label_encoder.fit_transform(df[cat])
   
## feature extraction for goal and pledged values
df.head()
c1 = df.groupby("main_category").agg({"goal" : "mean", "category" : "sum"})
c2 = df.groupby("category").agg({"goal" : "mean", "main_category" : "sum"})
c1 = c1.reset_index().rename(columns={"goal" : "main_category_goal_mean", "category" : "main_category_count"})
c2 = c2.reset_index().rename(columns={"goal" : "category_goal_mean", "main_category" : "category_count"})
df = df.merge(c2, on = "category")
df = df.merge(c1, on = "main_category") 
df["diff_mean_category_goal"] = df["category_goal_mean"] - df["goal"]
df["diff_mean_category_goal"] = df["main_category_goal_mean"] - df["goal"]
df['goal_log'] = np.log1p(df.goal) # normalizing goal by applying log to prevent skewness
df.loc[:,'goal_reached'] = df['pledged'] / df['goal'] # Pledged amount as a percentage of goal.
df.loc[df['backers'] == 0, 'backers'] = 1 # In backers column, impute 0 with 1 to prevent undefined division.
df.loc[:,'pledge_per_backer'] = df['pledged'] / df['backers'] # Pledged amount per backer.

## create indicator variable for `country` variable
df['country_is_us'] = (df.country == 'US').astype(int)


In [40]:
df["state"] = df["state"].apply(lambda x: 1 if x=="successful" else 0)


In [41]:

df = df.drop(["launched", "deadline", "name", "currency", "country"], axis = 1) #drop the unnecassary columns

In [42]:
#Define the label and features and scale the data
label = df.state
features = [c for c in df.columns if c not in ["state"]]
#Let's normalise the data entries
df_scaled = pd.DataFrame(sklearn.preprocessing.normalize(df[features]))
#df_scaled.columns=list(df_scaled[features])

Now we are ready to split the data set to test and train and implement a model!

Resources: 
* https://www.kaggle.com/shivamb/an-insightful-story-of-crowdfunding-projects
* https://www.kaggle.com/srishti280992/kickstarter-project-classification-lgbm-70-3
* https://www.kaggle.com/dronqo/how-successful-is-your-kickstarter-project
* https://www.kaggle.com/kromel/kickstarter-successful-vs-failed
* https://www.kaggle.com/carlolepelaars/exploration-of-kickstarter-data-2010-2017

In [86]:
# Separate into training and testing set, use a simple midpoint separation
midpoint=int(df.shape[0]/2)
df_train = df.iloc[0:midpoint,:]
df_test = df.iloc[midpoint:int(df.shape[0]),:]
label_train=df.state.iloc[0:midpoint]
label_test=df.state.iloc[midpoint:int(df.shape[0])]

### Train the logistic regression model on training data

In [87]:
from sklearn.linear_model import LogisticRegression
import numpy as np

model_lr = LogisticRegression()

#fit and transform the training data
X = model_lr.fit(df_train,label_train)
# cv_test = model.transform(df_test)

### Making a Prediction and apply model to test data

In [88]:
predictions=model_lr.predict(df_test)

### Obtain an accuracy value

In [89]:
from sklearn.metrics import accuracy_score
score = accuracy_score(df_test.state,predictions)
print (score*100)

from sklearn.metrics import classification_report
print(classification_report(df_test.state,predictions))

from sklearn.metrics import confusion_matrix
print (confusion_matrix(df_test.state,predictions))

74.31438288429533
             precision    recall  f1-score   support

          0       0.71      0.98      0.82     99675
          1       0.94      0.38      0.54     66161

avg / total       0.80      0.74      0.71    165836

[[97966  1709]
 [40887 25274]]


### Train the Random Forest Classifier model on training data

In [102]:
from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier()

#fit and transform the training data
X = model_rfc.fit(df_train,label_train)

### Making a Prediction and apply model to test data

In [103]:
predictions=model_rfc.predict(df_test)

### Obtain an accuracy value

In [104]:
from sklearn.metrics import accuracy_score
score = accuracy_score(df_test.state,predictions)
print (score*100)

from sklearn.metrics import classification_report
print(classification_report(df_test.state,predictions))

from sklearn.metrics import confusion_matrix
print (confusion_matrix(df_test.state,predictions))

100.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     99675
          1       1.00      1.00      1.00     66161

avg / total       1.00      1.00      1.00    165836

[[99675     0]
 [    0 66161]]


### Train the Gradient Boost Classifier model on training data

In [105]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = GradientBoostingClassifier()

#fit and transform the training data
X = model_gbc.fit(df_train,label_train)

### Making a Prediction and apply model to test data

In [106]:
predictions=model_gbc.predict(df_test)

### Obtain an accuracy value

In [107]:
from sklearn.metrics import accuracy_score
score = accuracy_score(df_test.state,predictions)
print (score*100)

from sklearn.metrics import classification_report
print(classification_report(df_test.state,predictions))

from sklearn.metrics import confusion_matrix
print (confusion_matrix(df_test.state,predictions))

100.0
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     99675
          1       1.00      1.00      1.00     66161

avg / total       1.00      1.00      1.00    165836

[[99675     0]
 [    0 66161]]
