In [12]:
import pandas as pd
import numpy as np
import re

In [13]:
pd.set_option('display.max_colwidth', -1)

# Text mining
## Feature engineering
In this work I'm going to analyse the project names of several KickStarter projects.
You can find the data set in Kaggle: https://www.kaggle.com/kemical/kickstarter-projects/data

In [14]:
#read data
df = pd.read_csv('data_textmining.csv', index_col=0, encoding='utf-8-sig')

In [15]:
# create safe copy
dfname = df

In [16]:
dfname.head()

Unnamed: 0,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,date_diff,successful,rtdate,rate,goal_usd,pledged_usd
0,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 00:00:00,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,58,0,2015-08-01 00:00:00,1.527486,1527.485876,0.0
1,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01 00:00:00,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,59,0,2017-09-01 00:00:00,1.0,30000.0,2421.0
2,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:00:00,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,44,0,2013-01-01 00:00:00,1.0,45000.0,220.0
3,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 00:00:00,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,29,0,2012-03-01 00:00:00,1.0,5000.0,1.0
4,Community Film Project: The Art of Neighborhood Filmmaking,Film & Video,Film & Video,USD,2015-08-29 00:00:00,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,55,0,2015-07-01 00:00:00,1.0,19500.0,1283.0


In [17]:
# make sure all names are string
dfname['name'] = dfname.name.astype(str)

In [18]:
# there are 23105  projects with (canceled) in the name
pattern = '\(canceled\)'
dfname['name'] = dfname.name.str.replace(pattern, '', n=1, case=False)

In [19]:
# there are 1060  projects with (suspended) in the name
pattern = '\(suspended\)'
dfname['name'] = dfname.name.str.replace(pattern, '', n=1, case=False)

In [20]:
#remove rare characters 
pattern = '[^A-Za-z0-9!? ]'
dfname['name'] = dfname.name.str.replace(pattern, '')

In [21]:
pattern = '\(\)'
dfname['name'] = dfname.name.str.replace(pattern, '')

In [22]:
#remove excess whitespace
for i in range(8):
    pattern = '  '
    dfname['name'] = dfname.name.str.replace(pattern, ' ')

In [23]:
dfname.shape

(378654, 17)

In [24]:
pattern = '  '
dfname.name.str.find(pattern).sum()

-378654

In [25]:
# create new feature: name length
dfname['name_len'] = dfname.name.str.len()

In [26]:
# convert all to lowercase before split
dfname['name_low'] = dfname.name.str.lower()

In [27]:
# get rid of ? ! before split
pattern = '[!?]'
dfname['name_low'] = dfname.name_low.str.replace(pattern, '')

In [28]:
# drop empty names
dfname = dfname.drop(dfname[dfname.name_len == 0].index, axis=0)
dfname = dfname.drop(dfname[dfname.name == ' '].index, axis=0) 

In [29]:
# split each name by whitespace creating a list with words
dfname['name_words']  = dfname.name_low.str.strip().str.split('[\s]')

In [30]:
# create new feature: number of words in each name
dfname['number_words']  =  dfname.name_words.str.len()

In [31]:
dfname.name_words.isnull().sum()

0

## Word count
** I want to know how many times each word is used in successful or failed projects.**

In [32]:
# create a safe copy
table2 = dfname

In [33]:
rows= list()
for row in table2[['successful', 'name_words']].iterrows():
    r = row[1]
    for word in r.name_words:
        rows.append((r.successful, word))
        
words = pd.DataFrame(rows, columns=['successful', 'word'])
words.head()

Unnamed: 0,successful,word
0,0,the
1,0,songs
2,0,of
3,0,adelaide
4,0,abullah


In [34]:
# drop empty words
words = words.drop(words[words.word == ''].index, axis=0)

In [35]:
# pivot table
count_words = words.pivot_table(index = 'word', columns='successful',aggfunc=np.size)

In [42]:
count_words = count_words.sort_values(1, ascending=False)
count_words.head()

successful,0,1,prop_successful,prop_failed,diff
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,127362.0,82720.0,0.603611,0.527143,0.076467
a,59404.0,42984.0,0.313656,0.245869,0.067786
of,46576.0,28572.0,0.208491,0.192775,0.015716
and,32742.0,18362.0,0.133988,0.135517,-0.001529
for,31512.0,16790.0,0.122517,0.130426,-0.007909


In [38]:
# save to csv for further analysis
count_words.to_csv('count_words.csv', encoding='utf-8-sig')

In [39]:
# create new features
n_successful = len(dfname[dfname.successful == 1])

n_failed = len(dfname[dfname.successful == 0])

count_words['prop_successful'] = count_words[1] / n_successful

count_words['prop_failed'] = count_words[0] / n_failed

In [40]:
count_words['diff'] = count_words['prop_successful'] - count_words['prop_failed']

In [43]:
# which words are more used in successful projects than in failed projects?
sorted_bydiff = count_words.sort_values(by='diff', ascending=False)
sorted_bydiff[:30]

successful,0,1,prop_successful,prop_failed,diff
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,127362.0,82720.0,0.603611,0.527143,0.076467
album,11628.0,16426.0,0.119861,0.048128,0.071734
a,59404.0,42984.0,0.313656,0.245869,0.067786
new,13696.0,14036.0,0.102421,0.056687,0.045734
film,11618.0,11644.0,0.084967,0.048086,0.03688
debut,3840.0,6138.0,0.044789,0.015894,0.028896
by,10274.0,9764.0,0.071248,0.042523,0.028725
short,4620.0,6236.0,0.045504,0.019122,0.026382
ep,4056.0,5692.0,0.041535,0.016788,0.024747
record,2028.0,3420.0,0.024956,0.008394,0.016562
