# Learning the difference between boys' and girls' names, city names, etc.

## Text preprocessing

Goal: convert a text string into an array of letters, represented by numbers.

First, load the file of line-separated names. Strip out spaces.

In [54]:
import numpy as np 
from keras.utils.np_utils import to_categorical


In [55]:
#Clean up the file by stripping out extra spaces
def cleanFile(inputFile, outputFile):

    namesFile = open(inputFile, "rw")
    fixedNamesFile = open(outputFile, "w")
    names = namesFile.readlines()   

    for i in range(len(names)):
        names[i] = names[i].strip()

    #print(names)
    for name in names:
        fixedNamesFile.write(name)    
        fixedNamesFile.write('\n')
    namesFile.close()
    fixedNamesFile.close()

#cleanFile("USGirlsNames.txt", "USGirlsNamesStripped.txt")

#Clean up a file by removing duplicate entries
def removeDuplicates(inputFile, outputFile):
    
    namesFile = open(inputFile, "rw")
    fixedNamesFile = open(outputFile, "w")
    names = namesFile.readlines()   

    for i in range(len(names)):
        names[i] = names[i].strip()
    names = set(names)
    
    #print(names)
    for name in names:
        fixedNamesFile.write(name)    
        fixedNamesFile.write('\n')
    namesFile.close()
    fixedNamesFile.close()
    
#removeDuplicates("cityNamesWithDuplicates.csv", "cityNamesStripped.txt")

#Create a subsample of a dataset (Useful for making the dataset sizes match)
#input: a list of lists.
def subsample(input, size):
    #pick some random indices i to access
    i = np.random.permutation(len(input))
    #convert the list of lists into a numpy array:
    input = np.concatenate((input, input), axis = 0)
    #subsample the array at i:
    subsample = input[i[0:size]]
    return subsample

f = [[1,2,3,4,5], [1,2], [23]]
print(type(f))
print(type(f[0]))
print(len(f))
fs = subsample(f, 2)
print(fs)
print(type(fs))
print(type(fs[0]))


<type 'list'>
<type 'list'>
3
[[1, 2, 3, 4, 5] [23]]
<type 'numpy.ndarray'>
<type 'list'>


In [56]:
maxlen = 15 #The maximum name length that will be considered. Others will be padded or truncated.

In [57]:
def pad_name(n):
    return ("               "+n)[-maxlen:]


print( pad_name('fox') )

            fox


In [58]:
#now convert one string to an array of ascii codes

def convert_to_ascii(s):
   s=pad_name(s.upper().strip())
   return [ ord(c) for c in s ]
    

In [59]:
def convert_to_ascii_lists(names):
    return [convert_to_ascii(name) for name in names]



In [7]:
namesFile = open("USGirlsNamesStripped.txt", "rw")
names = namesFile.read().splitlines()   
asciiGirlNames = convert_to_ascii_lists(names)
namesFile.close()
#asciiGirlNames
print(len(asciiGirlNames))

1000


In [8]:
namesFile = open("USBoysNamesStripped.txt", "rw")
names = namesFile.read().splitlines()
asciiBoyNames = convert_to_ascii_lists(names)
namesFile.close()

#print(asciiBoyNames)
print(type(asciiBoyNames))
print(asciiBoyNames[0])
print(type(asciiBoyNames[0]))
print(asciiBoyNames[0:1])
print(len(asciiBoyNames))


<type 'list'>
[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 74, 65, 67, 79, 66]
<type 'list'>
[[32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 74, 65, 67, 79, 66]]
1000


In [60]:
#The top 10,000 surnames in the US Census
namesFile = open("USCensusSurnames10K.txt", "r")
names = namesFile.read().splitlines()
asciiSurnames = convert_to_ascii_lists(names)
namesFile.close()

#print(asciiBoyNames)
print(asciiSurnames[2])


[32, 32, 32, 32, 32, 32, 32, 87, 73, 76, 76, 73, 65, 77, 83]


In [10]:
#These are the top 5000 cities by population:
namesFile = open("Top5000PopulationStripped.csv", "rw")
names = namesFile.read().splitlines()
asciiCityNames = convert_to_ascii_lists(names)
namesFile.close()
#asciiCityNames

In [61]:
#These are the unique place names found by zip codes
namesFile = open("cityNamesStripped.txt", "rw")
names = namesFile.read().splitlines()
asciiZipNames = convert_to_ascii_lists(names)
namesFile.close()
#asciiCityNames

## Building training and test sets

In [12]:
#Build training and test sets
#def build 

In [15]:
#To test all names vs city names, run this section: 
peopleY = np.ones(len(asciiBoyNames)+len(asciiGirlNames), dtype = np.int)
cityY = np.zeros(len(asciiZipNames), dtype = np.int)

ys = np.concatenate((peopleY, cityY), axis=0)
xs= np.concatenate((asciiBoyNames, asciiGirlNames, asciiZipNames), axis=0)

##print(type(ys))

boysY = np.zeros(len(asciiBoyNames), dtype=np.int)
girlsY = np.ones(len(asciiGirlNames), dtype=np.int)
ys = np.concatenate((boysY, girlsY), axis=0)
xs = np.concatenate((asciiBoyNames, asciiGirlNames), axis = 0)

In [42]:
#Boy names vs places
groupA = asciiBoyNames
groupB = asciiZipNames

In [17]:
#Boy names vs Girl names
groupA = asciiBoyNames
groupB = asciiGirlNames

In [18]:
#Girl names vs places
groupA = asciiGirlNames
groupB = asciiZipNames

In [19]:
#first names vs city names
#CAUTION: asciiBoyNames is getting modified!
print(len(asciiBoyNames))
groupA = asciiBoyNames
groupA.extend(asciiGirlNames)
groupB = asciiZipNames

print(len(groupA))
print(len(asciiBoyNames))

1000
2000
2000


In [20]:
#First names vs surnames
groupA = asciiBoyNames
groupA.extend(asciiGirlNames)
groupB = asciiSurnames

print(len(groupA))
print(len(asciiBoyNames))

3000
3000


In [62]:
#Test surnames vs. city names
#Since the datasets are pretty different in size, we'll take a random subsample of the 
#larger one to match the smaller one. 

groupA = asciiSurnames
groupB = asciiZipNames

print(len(groupA))
print(len(groupB))

10000
19252


In [10]:
#Make the group sizes match by randomly subsampling the larger group:

subsampleSize = min(len(groupA),len(groupB))
print(subsampleSize)
print(type(groupA))
subA = subsample(groupA, subsampleSize)
subB = subsample(groupB, subsampleSize)

#target is a single number.
aY = np.ones(subsampleSize, dtype = np.int)
bY = np.zeros(subsampleSize, dtype = np.int)

#concatenate the two groups into one
ys = np.concatenate((aY, bY), axis = 0)
xs = np.concatenate((subA, subB), axis = 0)

10000
<type 'list'>


In [83]:
#Make the group sizes match by randomly subsampling the larger group:
subsampleSize = min(len(groupA),len(groupB))
print(subsampleSize)
print(type(groupA))
subA = subsample(groupA, subsampleSize)
subB = subsample(groupB, subsampleSize)

#Make the targets a one-hot vector for each group.
aY = np.ones(subsampleSize, dtype = np.int)
bY = np.zeros(subsampleSize, dtype = np.int)

#Concatenate the two groups into one
ys = np.concatenate((aY, bY), axis = 0)
type(ys)
ys = to_categorical(np.asarray(ys))
xs = np.concatenate((subA, subB), axis = 0)

10000
<type 'list'>


In [15]:
# Now we need to build training and test sets. 

# To separate them into training and test, create a permutation of integers 1:the length of y. 
# Then grab the top 200 testPercent indexed by the permutation for the x and y test sets, and put the rest
# into the test sets. Everything else goes in the training sets

In [84]:
len(ys)

20000

In [85]:
testPercent = 0.1
i = np.random.permutation(len(ys))
testSize = int(testPercent*len(ys))

y_test = ys[i[0:testSize]]
y_train = ys[i[testSize:len(ys)]]
X_test = xs[i[0:testSize]]
X_train = xs[i[testSize:len(ys)]]
print(X_train)
print(X_test[0])

#Yay! We have training and test sets!

[[32 32 32 ..., 69 82 44]
 [32 32 32 ..., 69 82 44]
 [32 32 32 ..., 68 65 89]
 ..., 
 [32 32 32 ..., 79 78 44]
 [32 32 32 ..., 78 79 44]
 [32 32 32 ..., 68 79 82]]
[32 32 32 32 32 32 32 32 68 85 71 71 69 82 44]


In [86]:
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
#print(X_train)

(18000, 'train sequences')
(2000, 'test sequences')
Pad sequences (samples x time)
('X_train shape:', (18000, 15))
('X_test shape:', (2000, 15))
('y_train shape:', (18000, 2))
('y_test shape:', (2000, 2))


In [87]:
print(y_train[0:10])
print(type(X_train))
print(type(y_test))
print(type(X_test[0]))
print(X_train)

[[ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 0.  1.]
 [ 0.  1.]]
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
<type 'numpy.ndarray'>
[[32 32 32 ..., 69 82 44]
 [32 32 32 ..., 69 82 44]
 [32 32 32 ..., 68 65 89]
 ..., 
 [32 32 32 ..., 79 78 44]
 [32 32 32 ..., 78 79 44]
 [32 32 32 ..., 68 79 82]]


# Text classification

Now we begin the text classification using Convolution1D,
following the example of CNN_for_text_classification that ran on IMDB to classify sentiment
http://localhost:8888/notebooks/Documents/CNN_for_text_classification_example.ipynb

In [88]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM

In [89]:
# set parameters:
max_features = 100   #5000 #26 #Number of distinct letters? Not sure...
batch_size = 32
embedding_dims = 50 #reduced from 50
nb_filter = 250 #reduced from 250. "Dimensionality of the output" 
                # -- but they are not making it binary so I don't know what this means...
filter_length = 3
hidden_dims = 250 #reduced from 250
nb_epoch = 2
lstm_blocks = 4 #number of memory cells. I have no idea what is reasonable here...
look_back = 3 #Look 3 steps back.

#I don't know what nb_filter is.

In [90]:
#from keras.datasets import imdb
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
#maxlen=400


In [94]:
print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))

# we add an LSTM model:
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
#default sigmoid activiation on LSTM.
#2 (below) is number of classes we are dividing on (surname vs cityname, etc.)
model.add(Dense(2, activation='softmax'))




Build model...


In [24]:
#Original model (no longer compiles):

#print('Build model...')
#model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
#model.add(Embedding(max_features,
#                    embedding_dims,
#                    input_length=maxlen,
#                    dropout=0.2))

# we add an LSTM model:
#model.add(LSTM(lstm_blocks, input_dim = look_back))
#default sigmoid activiation on LSTM.

# We add a vanilla hidden layer:
#model.add(Dense(hidden_dims))
#model.add(Dropout(0.2))
#model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
#model.add(Dense(1))
#model.add(Activation('sigmoid'))



In [95]:
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=['accuracy'])

In [96]:
f = model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
validation_data=(X_test, y_test))

Train on 18000 samples, validate on 2000 samples
Epoch 1/2
Epoch 2/2


In [33]:
f

<keras.callbacks.History at 0x7fbea13cb8d0>

In [34]:
print(f.history)

{'acc': [0.98077777777777775, 1.0, 1.0], 'loss': [0.014909499947118699, 4.5523736130740465e-06, 2.4451105286971142e-06], 'val_acc': [1.0, 1.0, 1.0], 'val_loss': [6.224240314622875e-07, 8.2409586184439838e-08, 3.6908517245137772e-08]}


In [107]:
#Get the confusion matrix to figure out if we're getting *any* of the classes cleanly.

y_pred = model.predict(X_test)
print("ypredictions: ")
#print(y_pred[0:5]) 
#Each of these is an array containing a single value between 0 and 1.
#I'd expected it to be a one-hot vector...
#And it doesn't make sense that it thinks it's getting 100% when in fact it's only getting random.


y_pred = np.argmax(y_pred, axis=1)
#This seems to be pushing the values to 0 or 1:

print(y_pred[0:5])


yt=np.argmax(y_test, axis=1)

#Actual values: each prediction is one point, 0 or 1. Not a one-hot vector.
#Need to fix that for multi-dimensional spaces.
print("y_actual: ")
print(yt[0:5])
#print(labelSet)

#from sklearn.metrics import confusion_matrix
#print(confusion_matrix(yt, y_pred))

import pandas as pd
y_actu = pd.Series(yt, name='Actual')
#print("y_actu: ")
#print(y_actu)
yp = pd.Series(y_pred, name='Predicted')
#print("y_pred:")
#print(yp)
df_confusion = pd.crosstab(y_actu, yp)
print(df_confusion)

ypredictions: 
[0 0 1 1 0]
y_actual: 
[0 0 1 1 0]
Predicted    0     1
Actual              
0          992     0
1            0  1008
