# Preprocessing

In [1]:
import re
import time
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [2]:
def clean_num_punct(text):
    """Blank out non-letters, delete capital ``X`` characters, and lowercase.

    Every character that is not an ASCII letter or a space (digits,
    punctuation, newlines, tabs, ...) is replaced by one space. Runs of
    spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw text of a single sample.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    # The lazy quantifier matches one character at a time, so each removed
    # character becomes exactly one space. Newlines are covered here as well,
    # which is why no separate '\n' replacement is needed afterwards.
    text = re.sub(r'([^a-zA-Z ]+?)', ' ', text)
    # NOTE: this deletes *every* capital X, including the first letter of
    # ordinary words ("Xerox" -> "erox"); presumably it is aimed at placeholder
    # runs such as "XXXX" -- confirm against the raw data.
    text = text.replace('X', '')
    return text.lower()


def transform_text(series, save_as):
    """Fit a TF-IDF vectorizer on ``series``, pickle it, and return the matrix.

    Parameters
    ----------
    series : iterable of str
        Cleaned documents used both to fit the vectorizer and to build the
        returned matrix.
    save_as : str
        Path the fitted vectorizer is pickled to (opened in ``'wb'`` mode, so
        an existing file is overwritten).

    Returns
    -------
    sparse matrix
        TF-IDF representation of ``series``, one row per document.
    """
    vectorizer = TfidfVectorizer(norm='l2',
                                 lowercase=True,
                                 use_idf=True,
                                 sublinear_tf=True)
    # fit_transform learns vocabulary/IDF and vectorizes in one pass instead of
    # tokenizing the whole corpus twice with fit() followed by transform().
    vec_train = vectorizer.fit_transform(series)
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(save_as, 'wb') as handle:
        pickle.dump(vectorizer, handle)
    return vec_train


## Get dataset

In [3]:
# Path of the pickled dataset; later cells read its 'X_train', 'X_val' and 'y_val' entries.
dataset_file = 'dataset_product.pickle'

In [4]:
# Read the pickled dataset back into memory.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open(dataset_file, mode='rb') as pickle_in:
    dataset = pickle.load(pickle_in)


In [5]:
# Work on a copy of the training texts so the loaded dataset stays untouched,
# and give it a fresh 0..n-1 index so positional lookups line up with matrix rows.
X_train = dataset['X_train'].copy()
X_train = X_train.reset_index(drop=True)
type(X_train)

pandas.core.series.Series

Let's follow some text along the process

In [6]:
#X_train[5]

### Clean training data

#### 1) Remove numbers, punctuation, capitalized Xs

In [7]:
# Apply the character-level cleaning to every training text.
# (The bare `type(X_train)` that used to precede this line was not the cell's
# last expression, so it displayed nothing and has been removed.)
X_train = X_train.apply(clean_num_punct)

#### 2) Choose a language and filter

The texts contain proper nouns that may be helpful for classification but would not be found in a language's dictionary, so we do not filter by language here. The texts also appear to be written in American English, judging by the location (state/zip) provided with each sample.

#### 3) Remove stopwords

In [9]:
stop = stopwords.words('english')
# Set copy for constant-time membership tests; `stop` itself stays a list for later cells.
stop_set = set(stop)

# Drop stopwords and single-character tokens from every sample with a list
# comprehension applied through pandas.Series.apply.
X_train = X_train.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set and len(word) > 1]))


In [10]:
# Before cleaning
#X_train_before = dataset['X_train'].reset_index(drop=True)
#X_train_before[5]

In [11]:
# After cleaning
#X_train[5]

#### 4) Other techniques

### Transform text to vector

In [12]:
# Fit the vectorizer on the cleaned training texts and time the whole step.
start = time.time()
train_matrix = transform_text(X_train, 'tfidf_vec_product.pkl')
end = time.time()
# The original line lacked the '+' between the label and the number (SyntaxError).
print('Finished in: ' + str(round(end - start, 2)) + ' s')

In [13]:
# Sanity check: the matrix should have exactly one row per training sample.
train_matrix.shape, X_train.shape

((109963, 57940), (109963,))

In [14]:
# Our example from above: the number of filled columns in its row should equal
# the number of distinct words in its cleaned text.
filled_columns = train_matrix[5].nonzero()[1]
distinct_words = set(X_train[5].split(' '))
print(len(filled_columns) == len(distinct_words))
#train_matrix[5].nonzero()[1] # Indices in sparse matrix

True


array([45839, 45584, 45371, 33333, 31565, 26351, 24627, 23696, 22352,
       20655, 20603, 16293, 12522,  9531,  8352,  7943,  7544,  7361,
        3538], dtype=int32)

In [15]:
# Does every row of the sparse matrix hold exactly one filled column per
# distinct cleaned word of its sample? Report the samples where it does not.
for row in range(X_train.shape[0]):
    n_filled = len(train_matrix[row].nonzero()[1])
    n_words = len(set(X_train[row].split(" ")))
    if n_filled != n_words:
        print('Error at index: ' + str(row))
print('End')

Error at index: 72576
End


Not quite yet, but it's a lot better than it was. Investigating the indices reported by this check revealed some of this data's peculiarities, for example newlines and single-character words (often typos or remainders of contractions). Removing these may not be ideal, but for now it helps massively with dimensionality reduction. Ultimately, it's a trade-off.

### Remove empty samples from training data

In [16]:
#print(X_train[72576]) # Before
print(X_train.shape, train_matrix.shape)
# Cleaning can reduce a sample to an empty string (index 72576 in the check above).
# Select such samples by content instead of by a hard-coded position, so that
# re-running this cell -- or running it on other data -- never drops a valid sample.
is_empty = X_train.str.len() == 0
# Report the positions: the matching labels in y_train have to be removed as well.
print('Dropped empty samples at: ' + str(X_train.index[is_empty].tolist()))
X_train = X_train[~is_empty].reset_index(drop=True)
# Refit on the filtered texts; this overwrites the previously pickled vectorizer.
train_matrix = transform_text(X_train, 'tfidf_vec_product.pkl')
#print(X_train[72576]) # After
print(X_train.shape, train_matrix.shape)

(109963,) (109963, 57940)
(109962,) (109962, 57940)


**Note:** The corresponding element still needs to be removed from `y_train`. This is currently handled just before training; a better solution is needed.

In [17]:
# After removing the empty samples, no row of the training matrix should be all zero.
for i in range(train_matrix.shape[0]):
    # A row is empty when it stores no entries. The previous comparison against
    # train_matrix.shape[1] only matched rows in which *every* column is filled,
    # so an all-zero row could never be reported.
    if train_matrix[i].nnz == 0:
        print('Error at: ' + str(i))
print('End')

End


## Save sparse matrix for training

In [18]:
#file_name= 'X_train_product.npz'
#sparse.save_npz(file_name, train_matrix)

-----------

## Processing for validation / check function for test data

In [19]:
stop = stopwords.words('english')
# Reload the vectorizer that was pickled after fitting on the training texts.
# The context manager closes the file handle, which a bare open() call leaves dangling.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('tfidf_vec_product.pkl', 'rb') as handle:
    tf = pickle.load(handle)

In [20]:
# Repeat the training-time cleaning on the validation texts: fresh index,
# character cleaning, then removal of stopwords and single-character tokens.
series = (
    dataset['X_val']
    .copy()
    .reset_index(drop=True)
    .apply(clean_num_punct)
    .apply(lambda text: ' '.join([tok for tok in text.split() if tok not in (stop) and len(tok) > 1]))
)
# Vectorize with the vectorizer loaded from disk; nothing is refitted here.
val_matrix = tf.transform(series)

In [21]:
# Check to see if any validation rows are all zero (no stored entry at all).
for i in range(val_matrix.shape[0]):
    # The previous comparison against val_matrix.shape[1] only matched rows in
    # which every column is filled -- the opposite of an all-zero row.
    if val_matrix[i].nnz == 0:
        print('Error at: ' + str(i))
print('End')

End


In [22]:
def clean_raw_text(text_as_series, tf_file, stopwords):
    """Clean raw texts and vectorize them with a pickled vectorizer.

    The cleaning replaces every non-letter character by a space, deletes
    capital ``X`` characters, lowercases, and then drops stopwords and
    single-character tokens.

    Parameters
    ----------
    text_as_series : pandas.Series
        Raw texts, one string per element. The input is not modified; a copy
        with a reset index is processed.
    tf_file : str
        Path of the pickled vectorizer. The loaded object must provide a
        ``transform`` method.
    stopwords : iterable of str
        Words to remove. Inside this function the name shadows the
        ``nltk.corpus.stopwords`` module imported by the notebook.

    Returns
    -------
    The result of ``transform`` on the cleaned texts, one entry/row per input text.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the handle that a bare open() would leak.
    with open(tf_file, 'rb') as handle:
        tf = pickle.load(handle)

    # Use the argument (not a notebook-level global) and convert it to a set
    # once, so every membership test is constant time.
    stop_set = set(stopwords)

    def clean_num_punct(text):
        # One space per non-letter character (newlines included), then drop
        # capital X characters and lowercase.
        text = re.sub(r'([^a-zA-Z ]+?)', ' ', text)
        text = text.replace('X', '')
        return text.lower()

    def remove_stopwords(text):
        # Keep tokens that are neither stopwords nor single characters.
        kept = [word for word in text.split() if word not in stop_set and len(word) > 1]
        return ' '.join(kept)

    series = text_as_series.reset_index(drop=True)
    series = series.apply(clean_num_punct).apply(remove_stopwords)
    return tf.transform(series)

In [23]:
# Run the validation texts through the packaged function so that its result
# can be compared with val_matrix, which was built step by step.
mytest_mtx = clean_raw_text(dataset['X_val'].copy(), 'tfidf_vec_product.pkl', stop)

In [24]:
# Both validation matrices must have the same shape, and as many rows as there are labels.
print(mytest_mtx.shape, val_matrix.shape)
print(val_matrix.shape, dataset['y_val'].shape)
# Element-wise inequality: zero stored entries means the two matrices are identical.
differing = mytest_mtx != val_matrix
print(differing.nnz == 0)

(17874, 57940) (17874, 57940)
(17874, 57940) (17874,)
True


In [25]:
# Save for training validation (Test data will be processed in predict.py)
#file_name= 'X_val_product.npz'
#sparse.save_npz(file_name, val_matrix)