## Import Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data_small.csv')
print(df.shape)
df.head()

(289551, 17)


Unnamed: 0,id,campaignName,description,categories,duration,monetaryGoal,country,city,state,date_created,deadline_date,launched_date,state_changed_at,backers_count,usd_pledged,url,target
0,1123889576,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...,Space Exploration,27,1000,US,Boston,MA,2015-06-24,2015-10-23,2015-09-26,2015-10-23,53,1884,https://www.kickstarter.com/projects/cassandra...,successful
1,1724173143,vulcan-i-rocket-powered-by-3d-printed-engine,Team of undergraduates racing to be the first ...,Space Exploration,30,15000,US,San Diego,CA,2014-05-06,2015-05-21,2015-04-21,2015-05-21,465,21882,https://www.kickstarter.com/projects/105499101...,successful
2,707260502,starscraper-the-next-generation-of-suborbital-...,What if we built a rocket that is better than ...,Space Exploration,31,10000,US,Boston,MA,2014-11-29,2015-01-09,2014-12-09,2015-01-09,294,17176,https://www.kickstarter.com/projects/burpg/sta...,successful
3,497637964,students-building-a-near-space-balloon-with-li...,A group of high school students are building a...,Space Exploration,30,150,US,Mountain View,CA,2014-11-19,2015-11-26,2015-10-27,2015-11-26,45,970,https://www.kickstarter.com/projects/136362214...,successful
4,1546008758,earth-360,Re-inventing the way we look at our planet by ...,Space Exploration,30,7500,US,Fairfield,CT,2012-04-11,2012-09-21,2012-08-22,2012-09-21,28,7576,https://www.kickstarter.com/projects/211370922...,successful


## Encode strings

In [3]:
def create_dictionary(data):
    temp_dict = {}
    for key in data.iteritems():
        if key[1] not in temp_dict.keys():
            temp_dict[key[1]] = len(temp_dict)
    return temp_dict

In [4]:
# use clean df with no duplicates
df = df.copy()

# create dictionaries for string variables
city_dict = create_dictionary(df['city'])
state_dict = create_dictionary(df['state'])
country_dict = create_dictionary(df['country'])
cat_dict = create_dictionary(df['categories'])

# map dictionaries to dataframe
df['categories'] = df['categories'].map(cat_dict)
df['country'] = df['country'].map(country_dict)
df['city'] = df['city'].map(city_dict)
df['state'] = df['state'].map(state_dict)
df['target'] = df['target'].map({'failed':0, 'successful':1})

# join name and description

df['name_description'] = df['campaignName'].apply(lambda x: ' '.join(x.split('-'))) + ' ' + df.description

## NLP

In [None]:
!pip install spacy

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score, train_test_split

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [7]:
# pick features
target = 'target'
features = ['name_description', 'categories', 'duration'
            , 'monetaryGoal', 'country']

# train/test split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=22)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231640, 5), (57911, 5), (231640,), (57911,))

In [8]:
# combine train features with nlp features

def tfidf_features(train,test):
    
    X_train = train.copy()
    X_test = test.copy()
    
    # vectorizer
    tfidf_vector = TfidfVectorizer(stop_words='english', max_features=3000)

    # tokenize name
    train_name = X_train['name_description']
    tfidf_fitted = tfidf_vector.fit(train_name)
    train_name = tfidf_vector.transform(train_name)
    test_name = X_test['name_description']
    test_name = tfidf_vector.transform(test_name) 
    
    # create name matrix
    train_name_matrix = train_name.todense()
    test_name_matrix = test_name.todense()
    
    # create df from name matrix
    train_name_matrix_df = pd.DataFrame(train_name_matrix, columns=tfidf_vector.get_feature_names())
    test_name_matrix_df = pd.DataFrame(test_name_matrix, columns=tfidf_vector.get_feature_names())
    
    #get lengths of train and test
    train_len = len(X_train)
    test_len = len(X_test)

    # add id for merge
    X_train['id'] = list(range(train_len))
    train_name_matrix_df['id'] = list(range(train_len))

    X_test['id'] = list(range(test_len))
    test_name_matrix_df['id'] = list(range(test_len))


    # merge train with name matrix
    X_train = X_train.merge(train_name_matrix_df, on='id', how='inner')
    X_test = X_test.merge(test_name_matrix_df, on='id', how='inner')
    
    # drop unnecessary columns
    X_train = X_train.drop(columns=['name_description', 'id'])
    X_test = X_test.drop(columns=['name_description', 'id'])
    
    assert len(X_train) == len(train)
    assert len(X_test) == len(test)
    
    return X_train, X_test, tfidf_fitted

In [9]:
X_train_nlp, X_test_nlp, tfidf_fitted = tfidf_features(X_train, X_test)
X_train_nlp.shape, X_test_nlp.shape

((231640, 3004), (57911, 3004))

## Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pickle

In [12]:
model = RandomForestClassifier(n_jobs = -1, n_estimators=1000, max_depth=300)

model.fit(X_train_nlp, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=300, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
print('model score:', model.score(X_train_nlp, y_train))

#make predictions
y_preds = model.predict(X_test_nlp)

# accuracy
score = accuracy_score(y_preds, y_test)
print('test accuracy:', score)

model score: 0.9997539285097565
test accuracy: 0.7531902401961631


In [15]:
pickle.dump(model, open('model_rf_tfidf_75.pkl', 'wb'))
pickle.dump(tfidf_fitted, open('tfidf.pkl', 'wb'))

In [18]:
# try with joblib to reduce pkl size
filename = 'joblib_nlp_rf_75.sav'
joblib.dump(model, open(filename, 'wb'))

## Functions for flask app

In [None]:
def nlp_flask(data_df):
    
    df = data_df.copy()
    
    df['name_description'] = df['campaignName'].apply(lambda x: ' '.join(x.split('-'))) + ' ' + df.description
    
    features = ['name_description', 'categories', 'duration'
            , 'monetaryGoal', 'country']
    
    df = df[features]
    
    # vectorizer
    tfidf_vector = pickle.load(open('tfidf.pkl', 'rb'))

    # tokenize name
    train_text = df['name_description']
    train_text = tfidf_vector.transform(train_text)
    
    # create name matrix
    train_text_matrix = train_text.todense()
    
    # create df from name matrix
    train_text_matrix_df = pd.DataFrame(train_text_matrix, columns=tfidf_vector.get_feature_names())

    # add id for merge
    df['id'] = list(range(len(df)))
    train_text_matrix_df['id'] = list(range(len(df)))

    # merge train with name matrix
    df_merged = df.merge(train_text_matrix_df, on='id', how='inner')
    
    # drop unnecessary columns
    df_merged = df.drop(columns=['name_description', 'id'])
    
    assert len(df_merged) == len(data_df)
    
    return df_merged