## Import Data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data_small.csv')
print(df.shape)
df.head()

(289551, 17)


Unnamed: 0,id,campaignName,description,categories,duration,monetaryGoal,country,city,state,date_created,deadline_date,launched_date,state_changed_at,backers_count,usd_pledged,url,target
0,1123889576,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...,Space Exploration,27,1000,US,Boston,MA,2015-06-24,2015-10-23,2015-09-26,2015-10-23,53,1884,https://www.kickstarter.com/projects/cassandra...,successful
1,1724173143,vulcan-i-rocket-powered-by-3d-printed-engine,Team of undergraduates racing to be the first ...,Space Exploration,30,15000,US,San Diego,CA,2014-05-06,2015-05-21,2015-04-21,2015-05-21,465,21882,https://www.kickstarter.com/projects/105499101...,successful
2,707260502,starscraper-the-next-generation-of-suborbital-...,What if we built a rocket that is better than ...,Space Exploration,31,10000,US,Boston,MA,2014-11-29,2015-01-09,2014-12-09,2015-01-09,294,17176,https://www.kickstarter.com/projects/burpg/sta...,successful
3,497637964,students-building-a-near-space-balloon-with-li...,A group of high school students are building a...,Space Exploration,30,150,US,Mountain View,CA,2014-11-19,2015-11-26,2015-10-27,2015-11-26,45,970,https://www.kickstarter.com/projects/136362214...,successful
4,1546008758,earth-360,Re-inventing the way we look at our planet by ...,Space Exploration,30,7500,US,Fairfield,CT,2012-04-11,2012-09-21,2012-08-22,2012-09-21,28,7576,https://www.kickstarter.com/projects/211370922...,successful


## Encode strings

In [3]:
def create_dictionary(data):
    temp_dict = {}
    for key in data.iteritems():
        if key[1] not in temp_dict.keys():
            temp_dict[key[1]] = len(temp_dict)
    return temp_dict

In [4]:
# use clean df with no duplicates
df = df.copy()

# create dictionaries for string variables
city_dict = create_dictionary(df['city'])
state_dict = create_dictionary(df['state'])
country_dict = create_dictionary(df['country'])
cat_dict = create_dictionary(df['categories'])

# map dictionaries to dataframe
df['categories'] = df['categories'].map(cat_dict)
df['country'] = df['country'].map(country_dict)
df['city'] = df['city'].map(city_dict)
df['state'] = df['state'].map(state_dict)
df['target'] = df['target'].map({'failed':0, 'successful':1})

# join name and description
df['name_description'] = df['campaignName'].apply(lambda x: ' '.join(x.split('-'))) + ' ' + df.description

## NLP

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.model_selection import cross_val_score, train_test_split

import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

nlp = spacy.load('en_core_web_md')

In [6]:
# pick features
target = 'target'
features = ['name_description', 'categories', 'duration'
            , 'monetaryGoal', 'country']

# train/test split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=22)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((231640, 5), (57911, 5), (231640,), (57911,))

In [19]:
X_train_small = X_train[:100]
X_test_small = X_test[:100]
y_train_small = y_train[:10]
y_test_small = y_test[:10]

In [20]:
X_train_small.head()

Unnamed: 0,name_description,categories,duration,monetaryGoal,country
132205,social media cleanser What SMC offers YOU: Pea...,3,20,3000,0
271533,east west food drive to better serve artists W...,61,30,2500,0
260566,shoot omalley twice in san francisco We need $...,151,32,3500,0
201906,one mans trash is another mans calendar I'm ma...,110,30,1000,0
230423,the dance and the dawn a theatrical experience...,124,30,2000,0


In [24]:
def tokenizer(df):
    name_description = df.name_description
    doc = [nlp(text) for text in name_description]
    token_list = []
    for item in doc:
        tokens = [token.lemma_ for token in item if (token.is_stop != True) and (token.is_punct != True)]
#         tokens = [' '.join(token) for token in tokens]
        token_list.append(tokens)
    token_list = [' '.join(token) for token in token_list]
    return token_list

In [25]:
name_desc_tokens = tokenizer(X_train_small)

In [26]:
name_desc_tokens[1]

'east west food drive to better serve artist -PRON- be raise $ 2500 to buy healthy food\\r \n to feed east west artist admin \\r \n a homecooked dinner every nite \\r \n for -PRON- entire 6 months in berlin'

In [27]:
tfidf_vector = TfidfVectorizer(stop_words='english', max_features=10000)

tfidf_vector.fit(name_desc_tokens)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [29]:
import pickle

In [33]:
pickle.dump(tfidf_vector, open('tfidf.pkl', 'wb'))

In [None]:
x = tfidf_vector.transform(name_desc_tokens)

In [None]:
train_desc_matrix = x.todense()

In [None]:
train_desc_matrix_df = pd.DataFrame(train_desc_matrix, columns=tfidf_vector.get_feature_names())

In [None]:
train_desc_matrix_df.shape

In [None]:
# do everything on test

x_test_vector = tfidf_vector.transform(name_desc_tokens)

# turn into matrix
train_desc_matrix = x.todense()

In [None]:
print(train_desc_matrix_df.shape)
train_desc_matrix_df.head()

In [12]:
# combine train features with nlp features

def nlp_test(train,test):
    
    X_train = train.copy()
    X_test = test.copy()
    
    # vectorizer
    tfidf_vector = TfidfVectorizer(stop_words='english', max_features=8500)

    # tokenize name
    train_name = X_train['name_description']
    train_name = tfidf_vector.fit_transform(train_name)
    test_name = X_test['name_description']
    test_name = tfidf_vector.transform(test_name) 
    
    # create name matrix
    train_name_matrix = train_name.todense()
    test_name_matrix = test_name.todense()
    
    # create df from name matrix
    train_name_matrix_df = pd.DataFrame(train_name_matrix, columns=tfidf_vector.get_feature_names())
    test_name_matrix_df = pd.DataFrame(test_name_matrix, columns=tfidf_vector.get_feature_names())
    
    #get lengths of train and test
    train_len = len(X_train)
    test_len = len(X_test)

    # add id for merge
    X_train['id'] = list(range(train_len))
    train_name_matrix_df['id'] = list(range(train_len))

    X_test['id'] = list(range(test_len))
    test_name_matrix_df['id'] = list(range(test_len))


    # merge train with name matrix
    X_train = X_train.merge(train_name_matrix_df, on='id', how='inner')
    X_test = X_test.merge(test_name_matrix_df, on='id', how='inner')
    
    
    # drop unnecessary columns
    X_train = X_train.drop(columns=['name_description', 'id'])
    X_test = X_test.drop(columns=['name_description', 'id'])
    
    assert len(X_train) == len(train)
    assert len(X_test) == len(test)
    
    return X_train, X_test

In [13]:
X_train_nlp, y_train_nlp = nlp_test(X_train, X_test)

In [14]:
X_train_nlp.head()

Unnamed: 0,categories,duration,monetaryGoal,country_x,00,000,01,02,10,100,...,zodiac,zoe,zombie,zombies,zone,zoo,zu,zum,är,über
0,3,20,3000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,61,30,2500,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,151,32,3500,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,110,30,1000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,124,30,2000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [20]:
model = RandomForestClassifier()

model.fit(X_train_nlp, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
model.score(X_train_nlp, y_train)

0.9875669141771715

In [23]:
y_preds = model.predict(y_train_nlp)

In [24]:
score = accuracy_score(y_preds, y_test)

In [25]:
score

0.7162542522146051

In [26]:
import pickle
pickle.dump(model, open('model_rf_tues.pkl', 'wb'))

## Functions for flask app

In [41]:
new_feat = features + ['campaignName', 'description']
new_feat = new_feat[1:]

In [42]:
input = df[new_feat][:1]

In [43]:
input

Unnamed: 0,categories,duration,monetaryGoal,country,campaignName,description
0,0,27,1000,0,mars-on-earth-an-art-residency,Help a fine art photographer continue her proj...


In [65]:
def nlp_john(data_df):
    
    df = data_df.copy()
    
    df['name_description'] = df['campaignName'].apply(lambda x: ' '.join(x.split('-'))) + ' ' + df.description
    
    features = ['name_description', 'categories', 'duration'
            , 'monetaryGoal', 'country']
    
    df = df[features]
    
    # vectorizer
    tfidf_vector = pickle.load(open('tfidf.pkl', 'rb'))

    # tokenize name
    train_text = df['name_description']
    train_text = tfidf_vector.transform(train_text)
    
    # create name matrix
    train_text_matrix = train_text.todense()
    
    # create df from name matrix
    train_text_matrix_df = pd.DataFrame(train_text_matrix, columns=tfidf_vector.get_feature_names())

    # add id for merge
    df['id'] = list(range(len(df)))
    train_text_matrix_df['id'] = list(range(len(df)))

    # merge train with name matrix
    df_merged = df.merge(train_text_matrix_df, on='id', how='inner')
    
    # drop unnecessary columns
    df_merged = df.drop(columns=['name_description', 'id'])
    
    assert len(df_merged) == len(data_df)
    
    return df_merged

In [66]:
nlp_john(input)

Unnamed: 0,categories,duration,monetaryGoal,country
0,0,27,1000,0


In [78]:
df = input.copy()
    
df['name_description'] = df['campaignName'].apply(lambda x: ' '.join(x.split('-'))) + ' ' + df.description
    
features = ['name_description', 'categories', 'duration'
            , 'monetaryGoal', 'country']
    
df = df[features]

    
# # vectorizer
tfidf_vector = pickle.load(open('tfidf.pkl', 'rb'))

# tokenize name
train_text = df['name_description']
train_text = tfidf_vector.transform(train_text)

# # create name matrix
train_text_matrix = train_text.todense()

# create df from name matrix
train_text_matrix_df = pd.DataFrame(train_text_matrix, columns=tfidf_vector.get_feature_names())

# # add id for merge
df['id'] = list(range(len(df)))
train_text_matrix_df['id'] = list(range(len(df)))

# merge train with name matrix
df_merged = df.merge(train_text_matrix_df, on='id', how='inner')
df_merged

# drop unnecessary columns
df_merged = df_merged.drop(columns=['name_description', 'id'])

# assert len(df_merged) == len(input)



In [79]:
df_merged

Unnamed: 0,categories,duration,monetaryGoal,country,100,11,11th,12,13th,1987,...,world,write,writer,yearn,yo,yoga,young,youth,zealand,ëcke
0,0,27,1000,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
