## Import Data

In [1]:
import pandas as pd

In [2]:
# Load the campaign table; the relative path is resolved against the
# kernel's working directory.
df = pd.read_csv('data_small.csv')
# (rows, columns) sanity check before previewing the first rows
print(df.shape)
df.head()

(289551, 17)


Unnamed: 0,id,campaignName,description,categories,duration,monetaryGoal,country,city,state,date_created,deadline_date,launched_date,state_changed_at,backers_count,usd_pledged,url,target
0,1123889576,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...,Space Exploration,27,1000,US,Boston,MA,2015-06-24,2015-10-23,2015-09-26,2015-10-23,53,1884,https://www.kickstarter.com/projects/cassandra...,successful
1,1724173143,vulcan-i-rocket-powered-by-3d-printed-engine,Team of undergraduates racing to be the first ...,Space Exploration,30,15000,US,San Diego,CA,2014-05-06,2015-05-21,2015-04-21,2015-05-21,465,21882,https://www.kickstarter.com/projects/105499101...,successful
2,707260502,starscraper-the-next-generation-of-suborbital-...,What if we built a rocket that is better than ...,Space Exploration,31,10000,US,Boston,MA,2014-11-29,2015-01-09,2014-12-09,2015-01-09,294,17176,https://www.kickstarter.com/projects/burpg/sta...,successful
3,497637964,students-building-a-near-space-balloon-with-li...,A group of high school students are building a...,Space Exploration,30,150,US,Mountain View,CA,2014-11-19,2015-11-26,2015-10-27,2015-11-26,45,970,https://www.kickstarter.com/projects/136362214...,successful
4,1546008758,earth-360,Re-inventing the way we look at our planet by ...,Space Exploration,30,7500,US,Fairfield,CT,2012-04-11,2012-09-21,2012-08-22,2012-09-21,28,7576,https://www.kickstarter.com/projects/211370922...,successful


## Encode strings

In [3]:
def create_dictionary(data):
    """Build a value -> integer-code lookup for a column of labels.

    Codes are assigned in order of first appearance, starting at 0, so the
    first distinct value maps to 0, the second to 1, and so on.

    Parameters
    ----------
    data : pandas.Series or any iterable of hashable values
        The values to encode. Iterating a Series yields its values.

    Returns
    -------
    dict
        Mapping from each distinct value to its integer code. Empty input
        yields an empty dict.
    """
    temp_dict = {}
    # Iterate the values directly: Series.iteritems() was removed in
    # pandas 2.0, and only the value (never the index label) was used.
    for value in data:
        # setdefault leaves already-seen values untouched; len() is evaluated
        # before insertion, so a new value gets the next unused code.
        temp_dict.setdefault(value, len(temp_dict))
    return temp_dict

In [4]:
# create dictionaries for string variables
# (each maps a distinct value to an integer code via create_dictionary)
city_dict = create_dictionary(df['city'])
state_dict = create_dictionary(df['state'])
country_dict = create_dictionary(df['country'])
cat_dict = create_dictionary(df['categories'])

# map dictionaries to dataframe
# NOTE: the columns are overwritten in place, so this cell is not idempotent:
# re-running it without reloading df turns the already-encoded target into NaN.
df['categories'] = df['categories'].map(cat_dict)
df['country'] = df['country'].map(country_dict)
df['city'] = df['city'].map(city_dict)
df['state'] = df['state'].map(state_dict)
# any target value other than 'failed' / 'successful' becomes NaN here
df['target'] = df['target'].map({'failed':0, 'successful':1})

# join name and description
# campaignName is a hyphen-separated slug; the vectorised .str accessor
# swaps the hyphens for spaces without a per-row Python lambda.
df['name_description'] = df['campaignName'].str.replace('-', ' ', regex=False) + ' ' + df.description

## NLP

In [43]:
# only need to run once for AWS Sagemaker
# lg_url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.0/en_core_web_lg-2.1.0.tar.gz"

# r = requests.get(lg_url, allow_redirects=True)
# open('lg.zip', 'wb').write(r.content)

# tar = tarfile.open('lg.zip', "r:gz")
# tar.extractall('down_lg')

In [None]:
!pip install spacy

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score, train_test_split

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Load the spaCy pipeline from a local directory. The path is relative to the
# working directory and points inside 'down_lg', the folder the commented-out
# download cell above extracts the model archive into.
nlp = spacy.load("./down_lg/en_core_web_lg-2.1.0/en_core_web_lg/en_core_web_lg-2.1.0")

In [9]:
# Label column and the model inputs (one text column plus four encoded/numeric ones)
target = 'target'
features = [
    'name_description',
    'categories',
    'duration',
    'monetaryGoal',
    'country',
]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=22,
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231640, 5), (57911, 5), (231640,), (57911,))

In [10]:
# word embedding function

def _embed_frame(frame):
    """Replace ``name_description`` in ``frame`` with its document-vector columns."""
    # nlp.pipe processes the texts in batches instead of invoking the whole
    # pipeline once per row.
    vectors = [doc.vector for doc in nlp.pipe(frame.name_description)]

    # one row per document, one integer-named column per vector dimension
    vector_df = pd.DataFrame(vectors)

    # Rows are joined by position, so the original index is discarded. An
    # incoming 'id' column is dropped as well, matching the earlier
    # merge-on-'id' implementation which overwrote and then removed it.
    base = frame.drop(columns=['name_description', 'id'], errors='ignore')
    base = base.reset_index(drop=True)

    embedded = pd.concat([base, vector_df], axis=1)
    assert len(embedded) == len(frame)
    return embedded


def nlp_word_embedding(train,test):
    """Turn the ``name_description`` text of both splits into vector features.

    Each text is passed through the module-level spaCy pipeline ``nlp`` and
    its ``Doc.vector`` is appended as numeric columns (named 0, 1, 2, ...)
    after the remaining feature columns.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing a ``name_description`` text column. Neither input
        is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The embedded train and test frames, in the original row order, each
        with a fresh 0..n-1 index and without ``name_description``.
    """
    X_train = _embed_frame(train)
    X_test = _embed_frame(test)

    assert len(X_train) == len(train)
    assert len(X_test) == len(test)

    return X_train, X_test

In [14]:
# Embed only the first 100,000 training rows. X_test has 57,911 rows (see the
# split shapes above), so its [:100000] slice keeps every test row.
X_train_nlp, X_test_nlp = nlp_word_embedding(X_train[:100000], X_test[:100000])

In [15]:
# The 304 columns are the 4 non-text feature columns plus the vector columns
# added by nlp_word_embedding.
X_train_nlp.shape, X_test_nlp.shape

((100000, 304), (57911, 304))

## Model

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [19]:
# Random forest with the library's default hyperparameters.
# NOTE(review): no random_state is set, so results vary between runs.
model = RandomForestClassifier()

# fit model
# X_train_nlp is the embedded frame produced above (the previous name,
# X_train_nlp2, is never defined in this notebook). It covers the first
# 100,000 training rows, so the labels are sliced to match.
model.fit(X_train_nlp, y_train[:100000])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
# accuracy

# Score on the same 100,000 rows the model was fitted on, i.e. training accuracy.
print("model accuracy:", model.score(X_train_nlp, y_train[:100000]))
# Held-out accuracy on the first 20,000 test rows.
y_preds = model.predict(X_test_nlp[:20000])
# accuracy_score takes (y_true, y_pred) in that order
score = accuracy_score(y_test[:20000], y_preds)
print("test accuracy:", score)

model accuracy: 0.98957
test accuracy: 0.63015


In [None]:
import pickle

# Persist the fitted model. The context manager closes the file handle once
# the dump finishes, even if pickling raises.
with open('model_rf_nlp_wed.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Functions for flask app with spacy

In [None]:
# this is a function for the flask app.py file
def nlp_word_embedding(df):
    """Turn the ``name_description`` text of ``df`` into vector features.

    Single-frame variant intended for the Flask ``app.py``. When defined in
    this notebook it replaces the two-argument function of the same name.

    Each text is passed through the module-level spaCy pipeline ``nlp`` and
    its ``Doc.vector`` is appended as numeric columns (named 0, 1, 2, ...)
    after the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``name_description`` text column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` in their original order, with a fresh 0..n-1
        index and without ``name_description``.
    """
    # get vectors from the frame that was passed in (not from a notebook
    # global); nlp.pipe processes the texts in batches
    vectors = [doc.vector for doc in nlp.pipe(df.name_description)]

    # create df from name matrix: one row per document
    vector_df = pd.DataFrame(vectors)

    # Rows are joined by position, so the original index is discarded. An
    # incoming 'id' column is dropped as well, matching the earlier
    # merge-on-'id' approach which overwrote and then removed it.
    base = df.drop(columns=['name_description', 'id'], errors='ignore')
    base = base.reset_index(drop=True)

    df_merged = pd.concat([base, vector_df], axis=1)

    assert len(df) == len(df_merged)

    return df_merged

In [None]:
# TODO: the Heroku deployment needs a build-script step that installs spaCy