# Importing Libraries 

In [12]:
import pandas as pd
import numpy as np
import sklearn

# to split the dataset into test and train 
from sklearn.model_selection import train_test_split

# Transformers 
from sklearn.preprocessing import StandardScaler

# language Processing Libraries
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import spacy

# Neural network libraries
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# Loading in the Dataset 
source: https://www.kaggle.com/kemical/kickstarter-projects/notebooks?sortBy=dateRun&group=upvoted&pageSize=20&datasetId=4104

In [13]:
# To view all the columns 
pd.set_option('display.max_columns', None)

df = pd.read_csv('kickstarter.csv', 
                parse_dates=['deadline', 'launched'])
df.head(5)

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


Ideas:
- delete ID
- feature engineering (deadline and launched)
- Consider top five countries
- Only consider failed or successful and make it a binary classifictaion
- Will keep all the 15 main categories
- For category we could pass it to the ordinal encoding 

# EDA

In [14]:
total_nan = df.isna().sum().sort_values(ascending=False)
percentage_nan = (total_nan / df.shape[0]) * 100
tabel = pd.concat([total_nan, percentage_nan], axis=1, keys=['Total NAN', 'Percentage of NAN'])
tabel

Unnamed: 0,Total NAN,Percentage of NAN
usd pledged,3797,1.002744
name,4,0.001056
usd_goal_real,0,0.0
usd_pledged_real,0,0.0
country,0,0.0
backers,0,0.0
state,0,0.0
pledged,0,0.0
launched,0,0.0
goal,0,0.0


In [15]:
df.state.value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

In [16]:
# filtering out the dataset for binary target variable - failed / successful
df = df.loc[(df['state'] == 'failed') | (df['state'] == 'successful')]
df.state.value_counts()

failed        197719
successful    133956
Name: state, dtype: int64

In [17]:
# Imbalance classification but not worrisome
successful = 131490/(192871+131490)
successful

0.4053816580908309

In [18]:
# changing the target variable to 0 and 1 
df['state'] = df['state'].map({'failed':0, 'successful':1})
df['state'].value_counts()

0    197719
1    133956
Name: state, dtype: int64

# Deleting Nan values 

In [19]:
df.dropna(axis=0, inplace=True)

In [20]:
total_nan = df.isna().sum().sort_values(ascending=False)
percentage_nan = (total_nan / df.shape[0]) * 100
tabel = pd.concat([total_nan, percentage_nan], axis=1, keys=['Total NAN', 'Percentage of NAN'])
tabel

Unnamed: 0,Total NAN,Percentage of NAN
usd_goal_real,0,0.0
usd_pledged_real,0,0.0
usd pledged,0,0.0
country,0,0.0
backers,0,0.0
state,0,0.0
pledged,0,0.0
launched,0,0.0
goal,0,0.0
deadline,0,0.0


In [31]:
# Reseting the dataset 
df.reset_index(drop=True, inplace=True)

# Using NLTK Library to tokenize
- Tokenize the name column

In [49]:
# Instantiating
lemm = WordNetLemmatizer()
corpus=[]

for i in range(0, len(df['name'])):
#     print(i)
    doc = re.sub('[^a-zA-Z!?]', " ", df['name'][i])
    doc = doc.lower()
    doc = doc.split()
    doc = [lemm.lemmatize(word) for word in doc if not word in set(stopwords.words('english'))]
    doc = " ".join(doc)
    corpus.append(doc)

In [50]:
corpus

['song adelaide abullah',
 'greeting earth zgac art capsule et',
 'hank?',
 'toshicapital rekordz need help complete album',
 'monarch espresso bar',
 'support solar roasted coffee green energy! solarcoffee co',
 'chaser strip strip make shot b tch!',
 'jesus madman',
 'lisa lim new cd!',
 'cottage market',
 'g spot place gamers connect eachother go pro!',
 'tombstone old west tabletop game miniature mm',
 'survival ring',
 'beard',
 'note london',
 'mike corey darkness light album',
 'boco tea',
 'cmuk shoe take life foot first',
 'mikeyj clothing brand fundraiser',
 'alice wonderland g minor',
 'mountain brew quest alcohol sustainability',
 'book zoo mini comic',
 'matt cavenaugh jenny power make st album!',
 'superhero teddy bear',
 'permaculture skill',
 'rebel army origin heroic story major gripe',
 'moon animated short film',
 'daily brew coffee',
 'ledr workbook one tough journal!',
 'feather cast furled fly fishing leader',
 'bb',
 'chris eger band new nashville record!',
 'squ

# Using Spacy Library to tokenize

In [35]:
# instantiate the spacy object 
nlp = spacy.load("en_core_web_lg")

In [42]:
tokens=[]

for doc in nlp.pipe(df['name']):
    
    doc_tokens = []
    
    for token in doc:
        
        if (token.is_stop == False) & (token.is_punct ==False) & (token.pos_ != 'PRON'):
            doc_tokens.append(token.lemma_.lower())
            
    tokens.append(doc_tokens)
    

In [43]:
# Now we need to extract list from a list 
tokens

[['songs', 'adelaide', 'abullah'],
 ['greet', 'earth', 'zgac', 'arts', 'capsule', 'et'],
 ['hank'],
 ['toshicapital', 'rekordz', 'need', 'help', 'complete', 'album'],
 ['monarch', 'espresso', 'bar'],
 ['support',
  'solar',
  'roasted',
  'coffee',
  'green',
  'energy',
  ' ',
  'solarcoffee.co'],
 ['chaser', 'strips', 'strips', 'shot', 'b*tch'],
 ['jesus', 'madmen'],
 ['lisa', 'lim', 'new', 'cd'],
 ['cottage', 'market'],
 ['g', 'spot', 'place', 'gamer', 'connect', 'eachother', 'pro'],
 ['tombstone', 'old', 'west', 'tabletop', 'game', 'miniature', '32', 'mm'],
 ['survival', 'rings'],
 ['beard'],
 ['note', 'london'],
 ['mike', 'corey', 'darkness', 'light', 'album'],
 ['boco', 'tea'],
 ['cmuk', 'shoe', 'life', 'feet'],
 ['mikeyj', 'clothing', 'brand', 'fundraiser'],
 ['alice', 'wonderland', 'g', 'minor'],
 ['mountain', 'brew', 'quest', 'alcohol', 'sustainability'],
 ['book', 'zoo', 'mini', 'comic'],
 ['matt', 'cavenaugh', 'jenny', 'powers', '1st', 'album'],
 ['superhero', 'teddy', 'bear

In [44]:
corpus_spacy = []

for i in range(0, len(tokens)):
    
    token = " ".join(tokens[i])
    corpus_spacy.append(token)

In [45]:
corpus_spacy

['songs adelaide abullah',
 'greet earth zgac arts capsule et',
 'hank',
 'toshicapital rekordz need help complete album',
 'monarch espresso bar',
 'support solar roasted coffee green energy   solarcoffee.co',
 'chaser strips strips shot b*tch',
 'jesus madmen',
 'lisa lim new cd',
 'cottage market',
 'g spot place gamer connect eachother pro',
 'tombstone old west tabletop game miniature 32 mm',
 'survival rings',
 'beard',
 'note london',
 'mike corey darkness light album',
 'boco tea',
 'cmuk shoe life feet',
 'mikeyj clothing brand fundraiser',
 'alice wonderland g minor',
 'mountain brew quest alcohol sustainability',
 'book zoo mini comic',
 'matt cavenaugh jenny powers 1st album',
 'superhero teddy bear',
 'permaculture skill',
 'rebel army origins heroic story major gripe',
 'moon animated short film',
 'daily brew coffee',
 'ledr workbook tough journal',
 'feather cast furled fly fishing leader',
 'bb130a',
 'chris eger band new nashville record',
 'squatch watchers',
 'arrow

# Using NLTK and LSTM

 Approach to the problem:
 
 - We will one_hot to get the indices of words 
 - We will be using padding to make input all the same length
 - We will passing the input to to Embedded layer than to LSTM


In [47]:
voc_size = 10000
sent_length = 12

In [52]:
# One hot rep
one_hot_rep = [one_hot(words, voc_size) for words in corpus]
one_hot_rep

[[910, 9489, 9986],
 [1344, 7672, 5775, 8602, 4814, 9293],
 [5983],
 [5858, 7125, 6447, 4687, 6199, 4483],
 [1073, 4828, 5642],
 [8431, 1707, 4987, 3655, 4253, 1147, 8250, 1306],
 [952, 7591, 7591, 4515, 404, 7519, 7651],
 [5370, 3004],
 [564, 4272, 5493, 7307],
 [5270, 6153],
 [1550, 1420, 4132, 2749, 9761, 2086, 4013, 3679],
 [2823, 7263, 3825, 6175, 3267, 9699, 4796],
 [894, 5366],
 [3042],
 [2174, 4967],
 [3938, 6889, 8340, 8133, 4483],
 [3151, 4074],
 [4738, 2374, 4519, 2406, 4797, 3231],
 [3842, 39, 9765, 5238],
 [6111, 7793, 1550, 4608],
 [3115, 7091, 7364, 5432, 5197],
 [769, 5316, 3871, 6480],
 [948, 6609, 5251, 9824, 4515, 9772, 4483],
 [9691, 1609, 8502],
 [8301, 1310],
 [662, 8879, 8065, 8512, 6047, 4398, 3363],
 [9202, 1561, 4504, 3947],
 [2593, 7091, 3655],
 [3127, 8490, 7794, 7644, 7919],
 [4075, 4194, 8572, 9487, 1713, 1651],
 [150],
 [5986, 778, 1650, 5493, 4241, 7781],
 [4808, 4191],
 [5974, 5899, 599, 4483],
 [7280, 26, 3947, 7280, 6462, 9200, 26],
 [4388, 6996, 5501

In [57]:
# Padding the sentences to be the same length
embedded_docs = pad_sequences(one_hot_rep,
                            padding= 'pre',
                            maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ...  910 9489 9986]
 [   0    0    0 ... 8602 4814 9293]
 [   0    0    0 ...    0    0 5983]
 ...
 [   0    0    0 ... 6785 5096 5315]
 [   0    0    0 ...  384 8007 2916]
 [   0    0    0 ... 7416 2828 2968]]


In [58]:
embedded_docs.shape

(331462, 12)

In [66]:
X_final = embedded_docs
y_final = np.array(df['state'])

In [68]:
print(type(np.array(df['state'])))
type(embedded_docs)

<class 'numpy.ndarray'>


numpy.ndarray

In [69]:

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size =0.25, random_state=105)

In [79]:
# Creating Model
embedded_vector_features = 100
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=5)

model = Sequential([
    (Embedding(voc_size, embedded_vector_features, input_length=sent_length)),
#     (LSTM(120, dropout=0.25, recurrent_dropout=0.25, return_sequences=True)),
    (Bidirectional(LSTM(120, dropout=0.35, recurrent_dropout=0.35, kernel_regularizer=l2(0.25), recurrent_regularizer=l2(0.325)))),
    (Dense(1, kernel_initializer = 'glorot_uniform', activation='sigmoid'))
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [80]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=21, batch_size = 80, callbacks=early_stopping)

Epoch 1/21
Epoch 2/21
Epoch 3/21
Epoch 4/21
Epoch 5/21
Epoch 6/21
Epoch 7/21
Epoch 8/21


<tensorflow.python.keras.callbacks.History at 0x7faa2f85db50>

In [81]:
model.evaluate(X_test, y_test)



[0.6399770379066467, 0.6439191102981567]