## 1. Import data and dependencies

In [1]:
import pickle 
import scipy
from scipy import sparse
import numpy as np
import pandas as pd
from ast import literal_eval

In [2]:
# read every object that was pickled back-to-back into the pkl file
objects = []
with open("home_project.pkl", "rb") as pkl_file:
    try:
        # keep unpickling until the end of the file is reached
        while True:
            objects.append(pickle.load(pkl_file))
    except EOFError:
        # no more pickled objects in the file
        pass


In [3]:
# the first pickled object is a dict holding the four datasets; bind each to a name
payload = objects[0]
project_tf_idf_mat = payload['project_tf_idf_mat']
project_df = payload['project_df']
holdout_tf_idf_mat = payload['holdout_tf_idf_mat']
holdout_df = payload['holdout_df']

## 2. Inspect and preprocess the data
- text_features is a dictionary, it needs to be split to columns of feature
- country has some missing values

In [4]:
# preview the first five rows of the labelled data
project_df.head(n=5)

Unnamed: 0,datapoint_id,invoice_arrival_date,country,rel_doc,text_features
0,14240,2022-01-03 18:09:53.421000+00:00,AU,True,"{'num_of_rows': 98, 'num_of_punc_in_text_words..."
1,35837,2021-01-18 13:07:49.108000+00:00,AU,True,"{'num_of_rows': 19, 'num_of_punc_in_text_words..."
2,32165,2021-11-05 00:06:48.725000+00:00,AU,True,"{'num_of_rows': 47, 'num_of_punc_in_text_words..."
3,56670,2021-04-05 19:08:41.746000+00:00,AU,True,"{'num_of_rows': 9, 'num_of_punc_in_text_words'..."
4,38372,2021-02-02 13:39:24.751000+00:00,AU,True,"{'num_of_rows': 76, 'num_of_punc_in_text_words..."


The features are in a dictionary form, they have to be split to columns in order to understand what is going on here. The dictionaries are all strings so they have to be converted to dictionaries and then they can be converted to Pandas Series and used as columns of data. The function literal_eval will convert the text to dictionaries and the function pd.Series will convert the dictionaries to columns. 

In [None]:
# parse every text_features string into a dict
# NOTE: this fails on the full column -- two rows contain bare `nan` / `inf`
# tokens, which literal_eval does not accept as literals
project_df["text_features"].apply(literal_eval)

It's not working; let's check whether we can locate the problematic strings.

In [5]:
# try to parse each row in turn and print the index of every row that fails
text_column = project_df.text_features
for row in range(len(text_column)):
    try:
        literal_eval(text_column[row])
    except Exception:
        print(row)
        

22645
23209


Only two strings are causing problems; let's take a closer look at them.

In [6]:
# raw feature string of the first row that could not be parsed
project_df.loc[22645, "text_features"]

"{'num_of_rows': 1, 'num_of_punc_in_text_words': 0, 'num_of_punc_in_text_chars': 0, 'lines_made_of_symbols': 0, 'empty _spaces': 1, 'characters_in_raw_invoice': 2, 'words_raw_invoice_by_split': 0, 'ascii_characters_in_invoice': 0, 'words_capital_first': 0, 'words_all_uppercase': 0, 'alphanumeric_words': 0, 'words_repeated_characters': 0, 'web_adresses': 0, 'email_adresses': 0, 'num_of_digits': 0, 'solo_numbers': 0, 'float_point_numbers': 0, 'numbers_line_delimited': 0, 'total_number_of_numbers_in_invoice': 0, 'punc_prop': nan, 'lines_symbols_prop': nan, 'num_of_chrs_prop': inf, 'words_num_prop': nan, 'ascii_characters_to_prop': nan, 'words_capital_first_prop': nan, 'words_all_uppercase_prop': nan, 'alphanumeric_words_prop': nan, 'words_repeated_characters_prop': nan, 'digits_in_invoice_prop': nan, 'solo_numbers_prop': nan, 'float_point_numbers_prop': nan}"

In [7]:
# raw feature string of the second row that could not be parsed
project_df.loc[23209, "text_features"]

"{'num_of_rows': 1, 'num_of_punc_in_text_words': 0, 'num_of_punc_in_text_chars': 0, 'lines_made_of_symbols': 0, 'empty _spaces': 0, 'characters_in_raw_invoice': 1, 'words_raw_invoice_by_split': 0, 'ascii_characters_in_invoice': 0, 'words_capital_first': 0, 'words_all_uppercase': 0, 'alphanumeric_words': 0, 'words_repeated_characters': 0, 'web_adresses': 0, 'email_adresses': 0, 'num_of_digits': 0, 'solo_numbers': 0, 'float_point_numbers': 0, 'numbers_line_delimited': 0, 'total_number_of_numbers_in_invoice': 0, 'punc_prop': nan, 'lines_symbols_prop': nan, 'num_of_chrs_prop': inf, 'words_num_prop': nan, 'ascii_characters_to_prop': nan, 'words_capital_first_prop': nan, 'words_all_uppercase_prop': nan, 'alphanumeric_words_prop': nan, 'words_repeated_characters_prop': nan, 'digits_in_invoice_prop': nan, 'solo_numbers_prop': nan, 'float_point_numbers_prop': nan}"

Both of these look like empty documents. I think it's safe to just remove these rows from the data.

In [8]:
# remove the two rows whose text_features could not be parsed (by index label)
project_df.drop(index=[22645, 23209], inplace=True)

In [9]:
# parse each text_features string into a dict, expand the dicts into one column
# per key, and place the expanded columns next to the remaining original columns
parsed_features = project_df.text_features.apply(literal_eval)
feature_columns = parsed_features.apply(pd.Series)
project_df = pd.concat(
    [project_df.drop(columns=['text_features']), feature_columns],
    axis=1,
)

In [10]:
# summarise column dtypes and non-null counts to spot missing values
project_df.info()

# in the summary printed by this cell, only country has missing values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49998 entries, 0 to 49999
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   datapoint_id                        49998 non-null  int64              
 1   invoice_arrival_date                49998 non-null  datetime64[ns, UTC]
 2   country                             48155 non-null  object             
 3   rel_doc                             49998 non-null  bool               
 4   num_of_rows                         49998 non-null  float64            
 5   num_of_punc_in_text_words           49998 non-null  float64            
 6   num_of_punc_in_text_chars           49998 non-null  float64            
 7   lines_made_of_symbols               49998 non-null  float64            
 8   empty _spaces                       49998 non-null  float64            
 9   characters_in_raw_invoice           499

The only variable with missing values is 'country', I will replace the missing values with 'missing'

In [11]:
# fill missing countries with 'missing'
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form operates on an intermediate object, triggers a
# FutureWarning in pandas 2.x and leaves the frame unchanged under copy-on-write.
project_df['country'] = project_df['country'].fillna('missing')

In [109]:
# work on an independent deep copy so project_df stays available unchanged
df_copy = project_df.copy(deep=True)

In [110]:
# collapse every country with fewer than 50 rows into a single 'other' category
country_count = df_copy["country"].value_counts()
under_50 = country_count[country_count < 50]
is_rare_country = df_copy["country"].isin(under_50.index.tolist())
df_copy.loc[is_rare_country, 'country'] = "other"

In [15]:
# row counts per country after the rare ones were collapsed
df_copy["country"].value_counts()

AU         39670
DK          5321
missing     1843
DE           731
US           689
other        335
GB           238
FR           232
PL           221
HU           133
NZ           118
AT           101
NL            97
LU            59
IT            58
CH            51
NO            51
ES            50
Name: country, dtype: int64

I need to remove the two problematic rows from the sparse csr matrix as well. I found the following function online, I didn't implement it myself.

In [17]:
def delete_rows_csr(mat, indices):
    """
    Remove the rows denoted by ``indices`` from the CSR sparse matrix ``mat``.

    Parameters
    ----------
    mat : scipy.sparse container in CSR format
        Either a ``csr_matrix`` or a ``csr_array``. It is not modified.
    indices : iterable of int
        Positions of the rows to remove.

    Returns
    -------
    A new CSR container holding the remaining rows in their original order.

    Raises
    ------
    ValueError
        If ``mat`` is not a scipy sparse container stored in CSR format.
    """
    # Check the storage format rather than the concrete class so that the
    # ``csr_array`` container is accepted alongside ``csr_matrix``.
    if not (scipy.sparse.issparse(mat) and mat.format == "csr"):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    indices = list(indices)
    # Boolean row mask: True for rows to keep, False for rows to drop.
    mask = np.ones(mat.shape[0], dtype=bool)
    mask[indices] = False
    return mat[mask]

In [18]:
# remove the same two rows from the tf-idf matrix that were dropped from project_df
bad_rows = [22645, 23209]
project_tf_idf_mat = delete_rows_csr(project_tf_idf_mat, bad_rows)

Now I want to get rid of the id and date columns, I don't think they will be helpful:

In [19]:
# the id and arrival-date columns are not used as model features
df_copy.drop(columns=['datapoint_id', 'invoice_arrival_date'], inplace=True)

Now convert country to dummies:

In [106]:
# one-hot encode the country column into country_<code> indicator columns
df_copy = pd.get_dummies(data=df_copy)

In [23]:
# preview the first five rows of the encoded feature table
df_copy.head(n=5)

Unnamed: 0,rel_doc,num_of_rows,num_of_punc_in_text_words,num_of_punc_in_text_chars,lines_made_of_symbols,empty _spaces,characters_in_raw_invoice,words_raw_invoice_by_split,ascii_characters_in_invoice,words_capital_first,...,country_HU,country_IT,country_LU,country_NL,country_NO,country_NZ,country_PL,country_US,country_missing,country_other
0,True,98.0,0.0,0.0,0.0,349.0,2407.0,400.0,1954.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,True,19.0,0.0,0.0,0.0,16.0,172.0,30.0,131.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,True,47.0,0.0,0.0,0.0,184.0,1187.0,196.0,951.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,True,9.0,0.0,0.0,0.0,20.0,119.0,21.0,88.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,True,76.0,0.0,0.0,0.0,107.0,937.0,133.0,749.0,0.0,...,0,0,0,0,0,0,0,0,0,0


We have prepared the data, time to model

## 3. build models
 - Feature and target values: X,y
 - combine training data with tf-idf matrix
 - Train test split
 - train a few algorithms
 - Deal with imbalanced classes
 - I will train two models: Random Forest and Gradient Boosting

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

### Reduce the dimension of the tf-idf matrix using truncated SVD
 The tf-idf matrix is way too big to work with, I will combine it with the text features and reduce the dimension using truncated SVD. This is common for sparse matrices

In [27]:
# Reduce the combined feature matrix to 150 components.
# TruncatedSVD's default solver is randomized, so fix the seed (1, as for the
# train/test split and the classifiers) to make the projection reproducible.
truncatedSVD = TruncatedSVD(n_components=150, random_state=1)

In [28]:
# the target is the rel_doc column; every remaining column is a feature
y = df_copy['rel_doc']
X = df_copy.drop(columns=['rel_doc'])

Combine the dataframe with the tf-idf matrix and save the whole thing as a sparse matrix.

In [29]:
# place the tf-idf columns and the tabular feature columns side by side in one
# sparse matrix (tf-idf columns first, then the columns of X)
# NOTE(review): assumes the rows of project_tf_idf_mat and X are in the same
# order after the two bad rows were removed from both -- confirm
data = sparse.hstack([project_tf_idf_mat, X])

In [30]:
# hold out 30% of the rows for testing; stratify on y so both partitions keep
# the same class proportions, and fix the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [31]:
# learn the SVD components from the training partition only
truncatedSVD.fit(X_train)


TruncatedSVD(n_components=150)

In [38]:
# project both partitions onto the components learned from the training data
truncated_X_train, truncated_X_test = (
    truncatedSVD.transform(partition) for partition in (X_train, X_test)
)

Check for imbalance

In [35]:
y.value_counts(normalize=True) 
# share of each class in the target: pretty imbalanced...

True     0.928897
False    0.071103
Name: rel_doc, dtype: float64

### Setup ML pipelines

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

I will train a Random Forest classifier and a Gradient Boosting classifier. Obviously, many more classifiers can be considered. 

In [41]:
# one pipeline per algorithm: standard scaling followed by the classifier
pipelines = {
    name: make_pipeline(StandardScaler(), classifier)
    for name, classifier in (
        ('rf', RandomForestClassifier(random_state=1)),
        ('gb', GradientBoostingClassifier(random_state=1)),
    )
}

I will use grid search to optimize the n_estimators parameter for both models. Obviously, this can be done for all the parameters of the models, depending on how much time we want to invest.

In [42]:
# candidate n_estimators values to search over, keyed by pipeline name;
# the parameter keys use the step names that make_pipeline generates
grid = {
    'rf': {'randomforestclassifier__n_estimators': [100, 200, 300]},
    'gb': {'gradientboostingclassifier__n_estimators': [100, 200, 300]},
}

In [44]:
# tune each pipeline with a 10-fold cross-validated grid search and keep the
# fitted search object under the algorithm's name
fit_models = {}
for algo in pipelines:
    print(f'training the {algo} model')
    search = GridSearchCV(
        estimator=pipelines[algo],
        param_grid=grid[algo],
        n_jobs=-1,
        cv=10,
    )
    # fit() returns the fitted search object itself
    fit_models[algo] = search.fit(truncated_X_train, y_train)

training the rf model
training the gb model


## 4. Evaluate performance on test partition

In [60]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from matplotlib import pyplot as plt

In [47]:
# project the test partition with the SVD fitted on the training partition
# (the same assignment is already made right after the SVD is fitted)
truncated_X_test = truncatedSVD.transform(X_test)

For lack of time I will use a threshold of 0.5 for both models. This is not ideal since the data is very imbalanced; with more time I would try to optimize this threshold.

In [67]:
# print the test-set confusion matrix of every fitted model

for algo in fit_models:
    test_predictions = fit_models[algo].predict(truncated_X_test)
    matrix = confusion_matrix(y_test, test_predictions)
    print(f' Confusion matrix for {algo}:')
    print(matrix)

 Confusion matrix for rf:
[[  572   495]
 [  125 13808]]
 Confusion matrix for gb:
[[  601   466]
 [  170 13763]]


Both models are performing very well on the majority class, the performance on the minority class is not bad but it can probably be improved. 

In [68]:
# report accuracy, precision and recall on the test partition, rounded to 3 decimals

for algo, model in fit_models.items():
    yhat = model.predict(truncated_X_test)
    accuracy, precision, recall = (
        np.round(metric(y_test, yhat), 3)
        for metric in (accuracy_score, precision_score, recall_score)
    )
    print(f'metrics for {algo}: accuracy = {accuracy}, precision = {precision}, recall = {recall}')
    

metrics for rf: accuracy = 0.959, precision = 0.965, recall = 0.991
metrics for gb: accuracy = 0.958, precision = 0.967, recall = 0.988


### Save best model

In [69]:
# persist the fitted random-forest grid search
# (only this object is written; the fitted truncatedSVD is not part of it)
with open('RandomForestModel.pkl', 'wb') as model_file:
    pickle.dump(fit_models['rf'], model_file)

## 5. Make predictions

In [71]:
# parse each holdout text_features string into a dict, expand the dicts into one
# column per key, and place them next to the remaining holdout columns
holdout_parsed = holdout_df.text_features.apply(literal_eval)
holdout_feature_columns = holdout_parsed.apply(pd.Series)
holdout_df = pd.concat(
    [holdout_df.drop(columns=['text_features']), holdout_feature_columns],
    axis=1,
)

In [78]:
# fill missing countries with 'missing'
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form operates on an intermediate object, triggers a
# FutureWarning in pandas 2.x and leaves the frame unchanged under copy-on-write.
holdout_df['country'] = holdout_df['country'].fillna('missing')

In [111]:
# work on an independent deep copy so holdout_df stays available unchanged
holdout_copy = holdout_df.copy(deep=True)

In [112]:
# countries that had fewer than 50 rows in the training data become 'other' here too
rare_in_training = holdout_copy["country"].isin(under_50.index.tolist())
holdout_copy.loc[rare_in_training, 'country'] = "other"

In [113]:
# Map every holdout country that has no indicator column in the training
# features to 'other'.
# A frequency threshold computed on the holdout set does not do this reliably:
# it can collapse a country the model was trained on (when it happens to be rare
# here) and keep an unseen country (when it happens to be frequent here).
# The training categories are recovered from the country_<code> columns of X.
country_prefix = "country_"
train_countries = {
    column[len(country_prefix):]
    for column in X.columns
    if column.startswith(country_prefix)
}
unseen_country = ~holdout_copy["country"].isin(train_countries)
holdout_copy.loc[unseen_country, 'country'] = "other"

In [119]:
# the id and arrival-date columns are not used as model features
holdout_copy.drop(columns=['datapoint_id', 'invoice_arrival_date'], inplace=True)

In [120]:
# one-hot encode the holdout country column into country_<code> indicator columns
holdout_copy = pd.get_dummies(data=holdout_copy)

In [122]:
# Align the holdout feature columns with the training feature columns before
# stacking: get_dummies only creates indicators for the countries present in the
# holdout set, so without this step the column set or order can differ from what
# the SVD was fitted on. Indicators missing from the holdout set are added as 0,
# columns unknown to the training data are dropped, and the order follows X.
holdout_features = holdout_copy.reindex(columns=X.columns, fill_value=0)
data = sparse.hstack([holdout_tf_idf_mat, holdout_features])

In [123]:
# project the holdout features with the SVD that was fitted on the training partition
truncated_data = truncatedSVD.transform(data)

In [124]:
# predict holdout labels with the grid-search result stored under 'rf'
prediction = fit_models['rf'].predict(truncated_data)

In [138]:
# pair each holdout datapoint id with its predicted label; the ids come from
# holdout_df because holdout_copy no longer has the datapoint_id column
# NOTE(review): the two inputs are stacked as rows and then transposed, so the
# resulting column labels and dtypes are whatever pandas infers -- confirm they
# match the expected submission schema
prediction_submission = pd.DataFrame([holdout_df.datapoint_id, prediction]).T

In [143]:
# write the submission table to a parquet file in the working directory
prediction_submission.to_parquet('submission.parquet')

## If I had more time
- Better preprocessing for text features: I could look at the relationships between these variables, and maybe come up with some insights that can improve the modelling
- More classifiers : I tried only two algorithms - Random Forest and Gradient Boosting, many more classifiers can be tried out, like Naive Bayes, Neural Network etc.
- More hyperparameter tuning: I only tuned one hyperparameter in the models using cross validation over three values. Any one of the other hyperparameters can also be tuned and improved.
- Find optimal number of components for truncated SVD: I used 150 components without any strong justification, the correct thing to do is to try out different numbers and look at the variance in the data for each one of these numbers. Then we can choose a value that doesn't lose too much information.
- More attention to imbalance: The data is very imbalanced and I didn't really take this into account in the modelling, with more time I would consider resampling, class weights in the model fit and choosing a better threshold than 0.5