In [12]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
np.random.seed(2012)

# Configure visual settings:
%matplotlib inline 
plt.rcParams['figure.figsize'] = (10.0, 8.0) 
plt.style.use(['bmh'])

# Load the dataframe

data = pd.read_pickle(r'assets/NLP_data.p')

# Import lists of variable names
with open('var_names.p', 'rb') as f:
    target_variables, predictor_variables, categorical_variables, numerical_variables, text_variables, ordinal_variables = pickle.load(f)

In [13]:
# Some rows list neither boxes nor furniture. Keep only the rows that have
# either a non-empty furniture list or a non-null box count.
has_furniture = data['furniture'] != ''
has_box_count = data['boxes'].notnull()

# Take an explicit copy: a boolean-mask selection is a slice of the original
# frame, and modifying such a slice in place later on can trigger pandas'
# SettingWithCopyWarning. Copying makes `data` an independent frame.
data = data[has_furniture | has_box_count].copy()

In [14]:
# Handle null values once, in one place, so it is easy to come back and swap
# in more advanced imputation later.

# est_hours and num_movers are important target variables; rows without them
# cannot be used, so drop them. Reassigning (rather than inplace=True) leaves
# `data` bound to a fresh, independent frame.
data = data.dropna(subset=['est_hours', 'num_movers'])

# `rate` is sometimes missing. Are these free moves? If they were charity or
# non-residential jobs they should have been weeded out sooner. `rate` is left
# un-imputed because it is not being used for the time being.

# Columns where a missing value is taken to mean zero:
#   travel_fee - if blank, assume no travel fee was charged.
#   boxes      - if no boxes are listed, assume a furniture-only move.
fill_values = {'travel_fee': 0, 'boxes': 0}

# Some variables are simply median-imputed for now. They are the ones that can
# be imputed more artfully later on:
median_imputation_variables = ['loc1.sqFt','loc2.sqFt','loc1.lengthOfWalkOptID','loc2.lengthOfWalkOptID']
fill_values.update({col: data[col].median() for col in median_imputation_variables})

# Fill every column with one DataFrame-level call. The pattern
# `data[col].fillna(value, inplace=True)` is chained assignment on a temporary
# Series: recent pandas versions warn about it, and with Copy-on-Write enabled
# it no longer updates `data` at all.
data = data.fillna(fill_values)

# Columns with the most remaining nulls.
data.isnull().sum().sort_values(ascending=False)[:5]

rate                       152
num_noun_phrases             0
loc1.stairs                  0
loc1.elevatorType_Large      0
loc2.rooms                   0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for testing, stratified on truck_type so that both
# splits keep the same class proportions.
# random_state pins the shuffle to the same seed value passed to
# np.random.seed at the top of the notebook, so re-running this cell on its
# own reproduces the split instead of depending on how much of the global
# NumPy random stream has already been consumed.
training_data, testing_data = train_test_split(
    data,
    test_size=.2,
    stratify=data['truck_type'],
    random_state=2012,
)

In [19]:
# Do count vectorizing:

# Define tokenizer:

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
    
# Keep alphabetic tokens only, then lemmatize them:
tokenizer = RegexpTokenizer('[a-zA-Z]+')
lemmatizer = WordNetLemmatizer()


def my_tokenizer(string):
    """Split `string` into lemmatized alphabetic tokens longer than one character.

    Runs of ASCII letters are extracted with the module-level `tokenizer`,
    each one is passed through the module-level `lemmatizer`, and any lemma
    whose length is not greater than 1 is discarded.

    Returns a list of the remaining lemmas in their original order.
    """
    lemmatized = (lemmatizer.lemmatize(word) for word in tokenizer.tokenize(string))
    return [lemma for lemma in lemmatized if len(lemma) > 1]

# Do count vectorizer:

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the furniture text, limited to 100 vocabulary terms.
# The vocabulary is fitted on the training split only.
cvt = CountVectorizer(stop_words='english', max_features = 100, tokenizer=my_tokenizer)
X = cvt.fit_transform(training_data['furniture'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(cvt, 'get_feature_names_out'):
    feature_names = cvt.get_feature_names_out()
else:
    feature_names = cvt.get_feature_names()

# .toarray() densifies both SciPy sparse matrices and sparse arrays, whereas
# the `.A` shorthand is not available on sparse arrays.
cvt_training_data = pd.DataFrame(X.toarray(), columns=feature_names)

# There must be exactly one row of counts per training example.
assert cvt_training_data.shape[0] == training_data.shape[0]

# reset_index() gives training_data a fresh 0..n-1 index that lines up with
# cvt_training_data, so the join pairs rows positionally. rsuffix='_' is
# appended to any count column whose name collides with an existing column.
training_data = training_data.reset_index().join(cvt_training_data, rsuffix='_')

# Ten most frequent vocabulary terms in the training split.
cvt_training_data.sum().sort_values(ascending=False)[:10]

table      20363
chair      13735
small      10528
bed         9435
dresser     7270
tv          7264
large       5981
desk        5896
queen       5481
dining      5138
dtype: int64

In [21]:
# Apply the already-fitted vectorizer to the testing split (transform only, so
# the vocabulary is not re-learned from the test rows).
X = cvt.transform(testing_data['furniture'])

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(cvt, 'get_feature_names_out'):
    feature_names = cvt.get_feature_names_out()
else:
    feature_names = cvt.get_feature_names()

# .toarray() densifies both SciPy sparse matrices and sparse arrays, whereas
# the `.A` shorthand is not available on sparse arrays.
cvt_testing_data = pd.DataFrame(X.toarray(), columns=feature_names)

# There must be exactly one row of counts per testing example.
assert cvt_testing_data.shape[0] == testing_data.shape[0]

# reset_index() gives testing_data a fresh 0..n-1 index that lines up with
# cvt_testing_data, so the join pairs rows positionally. rsuffix='_' is
# appended to any count column whose name collides with an existing column.
testing_data = testing_data.reset_index().join(cvt_testing_data, rsuffix='_')

# Ten most frequent vocabulary terms in the testing split.
cvt_testing_data.sum().sort_values(ascending=False)[:10]

table      5057
chair      3517
small      2563
bed        2362
dresser    1868
tv         1862
large      1620
desk       1553
queen      1368
dining     1308
dtype: int64

In [22]:
# Number of rows per truck_type value in the training split.
train_truck_types = training_data['truck_type']
train_truck_types.value_counts()

Little    6490
Big       2944
Name: truck_type, dtype: int64

In [23]:
# Number of rows per truck_type value in the testing split.
test_truck_types = testing_data['truck_type']
test_truck_types.value_counts()

Little    1623
Big        736
Name: truck_type, dtype: int64

In [24]:
# Persist both splits (now including the count-vectorizer columns) as pickles.
for split_frame, pickle_path in (
    (training_data, 'assets/training_data.p'),
    (testing_data, 'assets/testing_data.p'),
):
    split_frame.to_pickle(pickle_path)