In [None]:
%load_ext autoreload
%autoreload 2

import csv, ast
import os
from collections import Counter, defaultdict
from functools import reduce
from glob import glob
from os.path import basename

import cssutils
import pandas as pd
import pymongo
from nltk.classify import MaxentClassifier 
from pymongo import MongoClient
from pymongo import TEXT

In [None]:
# Connect to the local MongoDB instance and grab the faresheet collection.
client = MongoClient('localhost:27017')
db = client['raxdb']
faresheet = 'CXfaresheets_new'
fs = db[faresheet]

### 0. Collect Data

In [None]:
### 0.1 Find manually created classifications
# Every document that carries a Commission classification, with only the
# fields needed to build the label table.
docs = list(fs.find({'classifications.Commission': {"$exists": True}},
                    {'classifications': 1, 'filename': 1, 'country': 1}))

In [None]:
def docs_convert(docs, target_class):
    """
    Flatten MongoDB classification documents into a pandas DataFrame.

    Each document must carry a 'classifications' sub-document (a dict mapping
    classification name -> label). The sub-document is expanded into columns
    and only `target_class` is kept next to the other top-level fields.

    @param docs: a list of MongoDB documents (dicts) with a 'classifications' field
    @param target_class: name of the classification column to keep, e.g. 'Commission'
    @return: a DataFrame with the top-level fields followed by `target_class`.
             If the documents contain nothing but 'classifications', all
             classification keys of the first document are kept instead.
             An empty DataFrame is returned when `docs` is empty.
    """
    # nothing to convert: return an empty frame instead of failing on the
    # missing 'classifications' column
    if not docs:
        return pd.DataFrame()

    df = pd.DataFrame(docs)
    cols = list((df.columns))
    cols.remove('classifications')
    if cols == []:  
        cols += list(docs[0]['classifications'].keys())
    else:
        cols.append(target_class)
        
    # expand the nested classifications dict into one column per classification
    df = pd.concat([df.drop(['classifications'], axis=1), df['classifications'].apply(pd.Series)], axis=1)
    df = df[cols]
    # drop rows whose label is the literal string 'Commission'
    # NOTE(review): presumably these are placeholder/header entries; 'undefined'
    # labels are NOT removed here -- confirm this is the intended filter
    df = df[df[target_class]!='Commission']    
    return df
dat = docs_convert(docs, 'Commission')
dat.head()

In [None]:
dat.describe()

In [None]:
# clean undefined data
dat = dat.loc[dat['Commission']!='undefined']
print(len(dat))
dat.describe()

In [None]:
# tag training data
# One bulk update instead of one round trip per row: every labelled document
# in `dat` is marked as part of the commission-classification training set.
training_ids = dat['_id'].tolist()
fs.update_many({'_id': {'$in': training_ids}},
               {"$set": {'cases.commission_classification': 'training'}})

### Some options to consider for dealing with the small number of labelled samples:
- leave-one-out cross validation
- bagging and bootstrapping

### construct data for modelling:

In [None]:
#docs = [doc for doc in fs.find({'cases.commission_classification': "training"}, 
docs = [doc for doc in fs.find({'$or': [ { 'classifications.Commission': 'yes' }, { 'classifications.Commission': 'no' }]}, 
                               {'tc_features':1, 'topword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
len(docs)

In [None]:
def data_construct(target_docs, feature_list, mode, class_n="", labels=None):
    """
    Build model inputs (feature dicts, labels, file identifiers) from MongoDB documents.

    @param target_docs: MongoDB documents (dicts) containing _id, filename, country,
                        the feature fields and, for training, 'classifications'
    @param feature_list: a list of document fields to include; each field holds a dict
                         of features and the dicts are merged into one feature dict per
                         document (later fields overwrite earlier ones on key clashes)
    @param mode: either 'training' or 'prediction'
    @param class_n: name of the classification task (key in doc['classifications']);
                    only used in 'training' mode
    @param labels: a list of valid labelling values; only used in 'training' mode.
                   Documents whose label is missing or not in this list are skipped

    @return X_dat: a list of dictionaries, features data
    @return Y_dat: a list of labels (returned in 'training' mode only)
    @return files_index: a list of (_id, filename, country) tuples identifying the files
    For an invalid mode a message is printed and None is returned.
    """
    if mode not in ['training', 'prediction']:
        print('Invalid mode for this function')
        return None
    training = mode == 'training'
    valid_labels = list(labels) if labels is not None else []

    X_dat = []
    Y_dat = []
    files_index = []
    for doc in target_docs:
        classifications = doc.get('classifications') or {}
        # in training mode keep only documents that carry a valid label;
        # a missing label is treated as invalid instead of raising KeyError
        if training:
            if class_n not in classifications or classifications[class_n] not in valid_labels:
                continue

        # merge all requested feature groups into a single dict
        features = {}
        for feature in feature_list:
            try:
                features.update(doc[feature])
            except KeyError:
                print("File: ", doc.get('filename'), "can't find feature", feature)

        # skip entries that ended up with no features at all
        if not features:
            continue

        X_dat.append(features)
        if training:
            Y_dat.append(classifications[class_n])
        files_index.append((doc['_id'], doc.get('filename'), doc.get('country')))

    if training:
        return X_dat, Y_dat, files_index
    return X_dat, files_index
        

# Build the benchmark training data; the following cells read X_dat / Y_dat,
# so this call must run for the notebook to work on a fresh kernel.
X_dat, Y_dat, files_index = data_construct(docs, ['tc_features', 'topword_features'], 
                                           mode = 'training', class_n='Commission', labels = ['yes', 'no'])

In [None]:
len(X_dat)

In [None]:
from sklearn.feature_extraction import DictVectorizer
def train_data_transform(X_train, Y_train, Y_map):
    """
    Turn feature dicts and raw labels into the form expected by the classifiers.

    @param X_train: a list of dictionaries (features); must not contain nan values
    @param Y_train: a list of raw labels
    @param Y_map: mapping from raw label to encoded label, e.g. {'yes': 1, 'no': 0}
    @return: (dense feature matrix, encoded labels, feature names, feature name -> column index)
    """
    vectorizer = DictVectorizer(sparse=False)
    # features: one dense column per feature name seen in the data
    feature_matrix = vectorizer.fit_transform(X_train)
    # labels: look each raw label up in the encoding map
    encoded_labels = list(map(Y_map.__getitem__, Y_train))
    return feature_matrix, encoded_labels, vectorizer.feature_names_, vectorizer.vocabulary_
X_dat, Y_dat, feature_names, feature_index = train_data_transform(X_dat, Y_dat, Y_map={'yes':1, 'no':0})   

In [None]:
print("Num of data entries: "+str(len(X_dat)))
print("Num of features: "+str(len(X_dat[0])))

### 1. Feature Engineering

In [None]:
def get_NB_feature_importances(clf_nb, feature_names):
    """
    Rank features per class by the log-probabilities stored on a fitted Naive Bayes model.

    @param clf_nb: fitted classifier exposing `feature_log_prob_` (one row per class)
    @param feature_names: feature names in the same order as the model's columns
    @return: one list per class of (feature name, log-probability) pairs,
             sorted from highest to lowest log-probability
    """
    def rank_features(class_pos):
        # name -> log-probability for this class, then order by value descending
        scores = {name: clf_nb.feature_log_prob_[class_pos][pos]
                  for pos, name in enumerate(feature_names)}
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    return [rank_features(class_pos) for class_pos in range(len(clf_nb.feature_log_prob_))]

### 1.0 Benchmark model
Before feature engineering, the following functions in PopulateCXfaresheet_new.py were run in sequence:
- update_CXfaresheets_new
- find_all_tourcodes
- encode_tourcodes
- Besides, the topwords features have already been created

The benchmark model uses tc_features and topword_features

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneOut
def train_with_loo(X_dat, Y_dat, Clf, rs=None, balanced=False):
    """
    Evaluate a classifier with leave-one-out cross validation.

    For every sample a fresh `Clf` is trained on all other samples and used to
    predict the held-out one.

    @param X_dat: feature matrix; must support indexing with an index array
                  (e.g. a numpy array)
    @param Y_dat: sequence of labels aligned with the rows of X_dat
    @param Clf: classifier class (not an instance), e.g. RandomForestClassifier
    @param rs: random_state passed to the classifier; None means "do not pass one"
               (0 is a valid seed and is passed through)
    @param balanced: if True the classifier is built with class_weight="balanced"
    @return golds: the true label of each held-out sample
    @return preds: the predicted label of each held-out sample
    """
    # build the constructor arguments once instead of branching per fold
    clf_kwargs = {}
    if rs is not None:
        clf_kwargs['random_state'] = rs
    if balanced:
        clf_kwargs['class_weight'] = "balanced"

    golds = [] 
    preds = []
    for cnt, (train_index, test_index) in enumerate(LeaveOneOut().split(X_dat), start=1):
        print("Training model: ", cnt)
        X_train, X_test = X_dat[train_index], X_dat[test_index]
        y_train = [Y_dat[i] for i in train_index]
        # leave-one-out: the test fold holds exactly one sample
        y_test = Y_dat[test_index[0]]
        clf = Clf(**clf_kwargs)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        preds.append(pred[0])
        golds.append(y_test)
    return golds, preds

In [None]:
from sklearn.metrics import classification_report
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=DecisionTreeClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

Random Forest

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB
golds, preds = train_with_loo(X_dat, Y_dat, BernoulliNB)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

### Random Forest performs the best of the 3 models. Since Random Forest is an ensemble of Decision Trees, this suggests the model has potential to improve with more samples and proper feature selection

### 1.0 Removing features with low variance

In [None]:
from sklearn.feature_selection import VarianceThreshold
# remove all features that are either one or zero (on or off) in more than 90% of the samples.
sel = VarianceThreshold(threshold=(.9 * (1 - .9)))
X_dat_new = sel.fit_transform(X_dat)
mask = sel.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=DecisionTreeClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

### Removing features with low variance decreases the model performance, so it should not be used

### 1.1 Feature selection with feature importances

In [None]:
# get feature importances from Decision Tree
def get_feature_importances(clf1, feature_names):
    """
    Pair each feature name with the importance reported by a fitted model.

    @param clf1: fitted estimator exposing `feature_importances_`
    @param feature_names: feature names in the same column order the model was fitted on
    @return: list of (feature name, importance) pairs, most important first
    """
    importance_by_name = {name: clf1.feature_importances_[pos]
                          for pos, name in enumerate(feature_names)}
    return sorted(importance_by_name.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# build classifier to get feature importances
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=25)
clf.fit(X_dat, Y_dat)

In [None]:
# clf was fitted on the full X_dat, so its importances line up with
# feature_names (not with the reduced feature_names_new list).
fim_map = get_feature_importances(clf, feature_names)
fim_map[:50]

In [None]:
len(X_dat)

In [None]:
# select with important features
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf, threshold=0.0001)
model.fit(X_dat, Y_dat)
X_dat_new = model.transform(X_dat)
mask = model.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
X_dat_new.shape

In [None]:
feature_names_new 

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
from sklearn.metrics import classification_report
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=DecisionTreeClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

### Feature selection with feature importances improves the performance of a single Decision Tree but not of Random Forest

### 1.2 Feature creation with rules
Rules for feature engineering:
1. If BCODE appears in the tourcodes, the sheet is very likely not a commission sheet
2. If all the tourcodes start with a number, the sheet is very likely not a commission sheet
3. Under an incentive agreement, the term 'travel' is in the corporate name

In [None]:
def _all_start_with_digit(tourcodes):
    """Return True when every tour code starts with a decimal digit (True for an empty list)."""
    # [:1] instead of [0] so an empty tour code counts as "not a digit" rather than raising
    return all(str(tc)[:1].isdecimal() for tc in tourcodes)


def encode_tc_firstdigit_is_num(db, faresheet, search_dict=None):
    """
    This function creates the feature tc_features.tc_firstdigit_isNum.
    If all the tourcodes of a document start with a number, then
    tc_features.tc_firstdigit_isNum is updated as True, otherwise False.

    @param db: database handle; db[faresheet] must give the collection to update
    @param faresheet: name of the faresheet collection
    @param search_dict: MongoDB filter selecting the documents to encode (default: all)

    Documents with an empty or missing 'tourcodes' field are stored as True,
    because "all of nothing" holds.
    NOTE(review): confirm that True is the wanted value for sheets without tourcodes.
    """
    fs = db[faresheet]
    query = {} if search_dict is None else search_dict
    docs = list(fs.find(query, {'tourcodes':1}))
    for cnt, doc in enumerate(docs, start=1):
        # check whether the first character of every tourcode is numerical
        fd_is_num = _all_start_with_digit(doc.get('tourcodes') or [])
        # record it to the collection
        fs.update_one({'_id': doc['_id']},
                      {'$set': {'tc_features.tc_firstdigit_isNum': fd_is_num}}) 
        print("Updated: ", cnt)        

In [None]:
encode_tc_firstdigit_is_num(db, faresheet, search_dict={})

In [None]:
def encode_tc_has_BCODE(db, faresheet, search_dict=None):
    """
    This function creates the feature tc_features.tc_hasBCODE.
    If any of the tourcodes of a document contains 'BCODE', then
    tc_features.tc_hasBCODE is updated as True, otherwise False.

    @param db: database handle; db[faresheet] must give the collection to update
    @param faresheet: name of the faresheet collection
    @param search_dict: MongoDB filter selecting the documents to encode (default: all)

    Documents with an empty or missing 'tourcodes' field are stored as False.
    """
    fs = db[faresheet]
    query = {} if search_dict is None else search_dict
    docs = list(fs.find(query, {'tourcodes':1}))
    for cnt, doc in enumerate(docs, start=1):
        # check whether any tourcode contains the BCODE marker
        tourcodes = doc.get('tourcodes') or []
        hasBCODE = any('BCODE' in tc for tc in tourcodes)
        # record it to the collection
        fs.update_one({'_id': doc['_id']},
                      {'$set': {'tc_features.tc_hasBCODE': hasBCODE}}) 
        print("Updated: ", cnt)   

In [None]:
encode_tc_has_BCODE(db, faresheet, search_dict={})

### Modelling with the new features:

In [None]:
docs = [doc for doc in fs.find({'cases.commission_classification': "training"}, 
                               {'tc_features':1, 'topword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
X_dat, Y_dat, files_index = data_construct(docs, ['tc_features', 'topword_features'], 
                                           mode = 'training', class_n='Commission', labels = ['yes', 'no'])
X_dat, Y_dat, feature_names, feature_index = train_data_transform(X_dat, Y_dat, Y_map={'yes':1, 'no':0}) 

In [None]:
print("Num of data entries: "+str(len(X_dat)))
print("Num of features: "+str(len(X_dat[0])))
len(X_dat[0])

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
# select with important features
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf, threshold=0.01)
model.fit(X_dat, Y_dat)
X_dat_new = model.transform(X_dat)
mask = model.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
X_dat_new.shape

In [None]:
feature_names_new

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
from sklearn.metrics import classification_report
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=DecisionTreeClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

### The new feature tc_firstdigit_isNum seems to be useful: it improves recall on commission predictions compared with the previous validation

### 1.3 Feature creation: corporate travel agreement
A corporate travel agreement has a high probability of being a commission sheet, except for faresheets from countries LK and IT.
Set keyword_features.cta as: 

"appear": the term appears and the country is not LK or IT, 

"appear_LK_IT": the term appears and the country is LK or IT, 

"not appear": the term does not appear

In [None]:
cta_docs = [doc for doc in fs.find({ "$text": { "$search": "\"CORPORATE TRAVEL AGREEMENT\""}, 
                                     },
                                   {"filename":1, "country":1}
                                    )] 

In [None]:
# update all as not appear first:
db[faresheet].update_many(
    {},
    {"$set": {"keyword_features.cta":"not appear"}}
)

In [None]:
# update the appear case
for doc in cta_docs:
    # LK / IT sheets get their own category; every other match is a plain "appear"
    cta_value = "appear_LK_IT" if doc['country'] in ['LK', 'IT'] else "appear"
    db[faresheet].update_one({"_id" : doc['_id']}, {"$set": {"keyword_features.cta": cta_value}})

In [None]:
def encode_keyword_features_cta(db, faresheet, search_dict=None):
    """
    Create the feature keyword_features.cta for the documents of a faresheet collection.

    The value is
      "appear":       the text search for "CORPORATE TRAVEL AGREEMENT" matches and
                      the country is not LK or IT,
      "appear_LK_IT": the text search matches and the country is LK or IT,
      "not appear":   the text search does not match.

    @param db: database handle; db[faresheet] must give the collection to update
    @param faresheet: name of the faresheet collection
    @param search_dict: optional MongoDB filter restricting which documents are
                        encoded (default: all). A "$text" key in it is replaced
                        by the phrase search used here.
    """
    # use the collection named by the arguments, not a notebook-level global
    collection = db[faresheet]
    base_query = dict(search_dict) if search_dict else {}

    # get all CORPORATE TRAVEL AGREEMENT documents (exact phrase search)
    text_query = dict(base_query)
    text_query["$text"] = {"$search": "\"CORPORATE TRAVEL AGREEMENT\""}
    cta_docs = list(collection.find(text_query, {"filename":1, "country":1}))

    # update all as not appear first:
    collection.update_many(
        base_query,
        {"$set": {"keyword_features.cta":"not appear"}}
    )
    # then overwrite the matching documents with the appear / appear_LK_IT case
    for doc in cta_docs:
        cta_value = "appear_LK_IT" if doc.get('country') in ['LK', 'IT'] else "appear"
        collection.update_one({"_id" : doc['_id']}, {"$set": {"keyword_features.cta": cta_value}})

In [None]:
# number of matching sheets that come from LK or IT
cta_LK_IT_docs = [doc for doc in cta_docs if doc['country'] in ['LK', 'IT']]
len(cta_LK_IT_docs) 

### 1.4 Feature creation: specific form 
If it's an ADHOC NOTIFICATION FORM or a ONE-OFF ADHOC FIT/GROUP REQUEST, then it must be neither commission nor filed



In [None]:
texts = [doc for doc in fs.find({'teststring': {'$exists': True}}, 
                               {'teststring':1, 'filename':1})] 

In [None]:
texts_df = pd.DataFrame(texts)

In [None]:
texts_df['filename_lower'] = texts_df['filename'].str.lower()

In [None]:
texts = [doc for doc in fs.find({'teststring': {'$exists': True}}, 
                               {'teststring':1, 'filename':1})] 
texts_df = pd.DataFrame(texts)
texts_df['filename_lower'] = texts_df['filename'].str.lower()

In [None]:
texts_df

In [None]:
# update all as false by default
db[faresheet].update_many(
        {},
        {"$set": {"keyword_features.one_off_adhoc_fg_request":False}}
    )
# create keyword feature and update classification result for one-off adhoc fit/group request
# literal (non-regex) match; rows with a missing teststring count as "no match"
is_one_off = texts_df['teststring'].str.contains('ONE-OFF ADHOC FIT/GROUP REQUEST', regex=False, na=False)
one_off_ids = texts_df.loc[is_one_off, '_id'].tolist()
# a single bulk write sets the feature and both classification results together
db[faresheet].update_many(
        {"_id": {"$in": one_off_ids}},
        {"$set": {"keyword_features.one_off_adhoc_fg_request": True,
                  "classifications.Commission": "no",
                  "classifications.Filed": "no"}}
    )

In [None]:
# update all as false by default
db[faresheet].update_many(
        {},
        {"$set": {"keyword_features.adhoc_noti_form":False}}
    )
# create keyword feature and update classification result for ADHOC NOTIFICATION FORM
# literal (non-regex) match; rows with a missing teststring count as "no match"
is_adhoc_form = texts_df['teststring'].str.contains('ADHOC NOTIFICATION FORM', regex=False, na=False)
adhoc_form_ids = texts_df.loc[is_adhoc_form, '_id'].tolist()
# a single bulk write sets the feature and both classification results together
db[faresheet].update_many(
        {"_id": {"$in": adhoc_form_ids}},
        {"$set": {"keyword_features.adhoc_noti_form": True,
                  "classifications.Commission": "no",
                  "classifications.Filed": "no"}}
    )

### Training with new features:

In [None]:
docs = [doc for doc in fs.find({'cases.commission_classification': "training"}, 
                               {'tc_features':1, 'topword_features':1, 'keyword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
X_dat, Y_dat, files_index = data_construct(docs, ['tc_features', 'keyword_features', 'topword_features'], 
                                           mode = 'training', class_n='Commission', labels = ['yes', 'no'])
X_dat, Y_dat, feature_names, feature_index = train_data_transform(X_dat, Y_dat, Y_map={'yes':1, 'no':0}) 

In [None]:
print("Num of data entries: "+str(len(X_dat)))
print("Num of features: "+str(len(X_dat[0])))
len(X_dat[0])

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
# select with important features
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf, threshold=0.001)
model.fit(X_dat, Y_dat)
X_dat_new = model.transform(X_dat)
mask = model.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
X_dat_new.shape

In [None]:
feature_names_new

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

### try including some of the specific forms sheets in training: 

In [None]:
doc_form1 = [doc for doc in fs.find({'keyword_features.one_off_adhoc_fg_request': True}, 
                               {'tc_features':1, 'topword_features':1, 'keyword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
doc_form2 = [doc for doc in fs.find({'keyword_features.adhoc_noti_form': True}, 
                               {'tc_features':1, 'topword_features':1, 'keyword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
train_docs = doc_form1 + doc_form2 + docs

In [None]:
X_dat, Y_dat, files_index = data_construct(train_docs, ['tc_features', 'keyword_features', 'topword_features'], 
                                           mode = 'training', class_n='Commission', labels = ['yes', 'no'])
X_dat, Y_dat, feature_names, feature_index = train_data_transform(X_dat, Y_dat, Y_map={'yes':1, 'no':0}) 
print("Num of data entries: "+str(len(X_dat)))
print("Num of features: "+str(len(X_dat[0])))
len(X_dat[0])

### without balancing the class

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
# get wrongly classified files
def get_wrong_files(golds, preds, files_index):
    """
    Report the samples whose prediction differs from the true label.

    @param golds: true labels
    @param preds: predicted labels, aligned with golds
    @param files_index: per-sample identifiers; items 1 and 2 of each entry are printed
    @return wrong_index: list of (position, true label, predicted label)
    @return wrong_files: the files_index entries of the misclassified samples
    """
    wrong_index = []
    wrong_files = []
    for pos in range(len(golds)):
        if golds[pos] != preds[pos]:
            wrong_index.append((pos, golds[pos], preds[pos]))
            wrong_files.append(files_index[pos])

    print("Number of wrongly classified files:", len(wrong_index))
    for (_, gold, pred), file_info in zip(wrong_index, wrong_files):
        print("Wrongly classified file: ", file_info[1], file_info[2])
        print("The true label is: ", gold, "Predicted as: ", pred)
    return wrong_index, wrong_files

In [None]:
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

### with balancing the class:

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier, balanced=True)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))

In [None]:
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

### Balancing the class weights doesn't help

In [None]:
# select with important features
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
# train on all data to get feature importance
clf = RandomForestClassifier(random_state=25)
clf.fit(X_dat, Y_dat)
model = SelectFromModel(clf, threshold=0.001)
model.fit(X_dat, Y_dat)
X_dat_new = model.transform(X_dat)
mask = model.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
X_dat_new.shape

In [None]:
feature_names_new

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))
# print("number of wrongly classified files: ": file)

In [None]:
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

### selecting features with chi2/mutual information:

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
sel = SelectKBest(mutual_info_classif, k=100)
X_dat_new = sel.fit_transform(X_dat, Y_dat)
mask = sel.get_support()
feature_names_new = [feature_names[i] for i in range(len(feature_names)) if mask[i]]

In [None]:
X_dat_new.shape

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_new, Y_dat, rs=25, Clf=RandomForestClassifier)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))
# print("number of wrongly classified files: ": file)

In [None]:
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

### Selecting features with chi2/mutual information is not as effective as using feature importances

### 1.5 Feature creation: filename_features

- create features to indicate whether these terms are in filename: BCODE, commission, agent, corporate

In [None]:
texts_df['filename_lower'] = texts_df['filename'].str.lower()

In [None]:
texts_df.loc[texts_df['filename_lower'].str.contains('b code')]

In [None]:
texts_df.loc[texts_df['filename_lower'].str.contains('bcode')]

In [None]:
texts_df.loc[texts_df['filename_lower'].str.contains('b_code')]

In [None]:
texts_df.loc[texts_df['teststring'].str.contains('commission')]

In [None]:
# Turn the separators _ - , ; . into spaces, then drop the 'html' extension text.
texts_df['filename_str'] = texts_df['filename'].apply(
    lambda name: name.translate(str.maketrans('_-,;.', '     ')).replace('html', ''))

In [None]:
texts_df['filename_str'] 

In [None]:
a = texts_df['teststring']+texts_df['filename_str']

In [None]:
a[0]

### 1.6 Feature creation: tf-idf values 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=300, lowercase=False,
                                   stop_words='english', analyzer='word')

In [None]:
tfidf = tfidf_vectorizer.fit_transform(texts_df['teststring'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it. A plain list is kept either way because
# later cells slice it and concatenate it with another list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_list = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_list = tfidf_vectorizer.get_feature_names()

In [None]:
tfidf_feature_list[:158]

In [None]:
# remove the number
add_on_stopwords = tfidf_feature_list[:158]
stopwords = list(tfidf_vectorizer.stop_words_) + add_on_stopwords

In [None]:
# rebuild the vectorizer with the numeric terms added to the stop words
tfidf_vectorizer = TfidfVectorizer(min_df=300, lowercase=False,
                                    analyzer='word', stop_words=stopwords)
tfidf = tfidf_vectorizer.fit_transform(texts_df['teststring'])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version provides it, and keep the result a plain list.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_feature_list = list(tfidf_vectorizer.get_feature_names_out())
else:
    tfidf_feature_list = tfidf_vectorizer.get_feature_names()

In [None]:
tfidf_feature_list


In [None]:
len(tfidf_feature_list )

In [None]:
tfidf = tfidf.toarray()

In [None]:
tfidf.shape

In [None]:
texts_df['_id'][0]

In [None]:
tfidf_feature_list[1]

In [None]:
# Preview the tf-idf features of the first three documents.
for i in range(min(3, len(tfidf))):
    row = tfidf[i]
    print(texts_df['_id'][i])
    # record tfidf: feature name -> tf-idf value for this document
    tfidf_features = {}
    for j in range(len(row)):
        # '=' (assignment) rather than ':' (a no-op annotation), so the dict is actually filled
        tfidf_features[tfidf_feature_list[j]] = row[j]
        print(tfidf_feature_list[j], row[j])

    
    

# 2. Produce Commission sheet prediction 

In [None]:
#docs = [doc for doc in fs.find({'cases.commission_classification': "training"}, 
docs = [doc for doc in fs.find({'$or': [ { 'classifications.Commission': 'yes' }, { 'classifications.Commission': 'no' }]}, 
                               {'tc_features':1, 'topword_features':1, 'keyword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
X_dat, Y_dat, files_index = data_construct(docs, ['tc_features', 'topword_features', 'keyword_features'], 
                                           mode = 'training', class_n='Commission', labels = ['yes', 'no'])

In [None]:
X_dat, Y_dat, feature_names, feature_index = train_data_transform(X_dat, Y_dat, Y_map={'yes':1, 'no':0}) 

In [None]:
len(X_dat)

In [None]:
# build classifier to get feature importances
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=25)
clf.fit(X_dat, Y_dat)
fim_map = get_feature_importances(clf, feature_names)
fim_map[:50]

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat, Y_dat, rs=25, Clf=RandomForestClassifier, balanced=True)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

In [None]:
# choose important features, threshold = 0
important_features = [ f for f in fim_map if f[1]>0]
# important_features
# find important features indexes
important_features_index = [feature_index[f[0]] for f in important_features]
len(important_features)

In [None]:
import numpy as np
X_dat_if = [np.take(row, important_features_index) for row in X_dat] 
X_dat_if = np.array(X_dat_if)
X_dat_if.shape

In [None]:
# cross validation training with importance features
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
golds, preds = train_with_loo(X_dat_if, Y_dat, rs=25, Clf=RandomForestClassifier, balanced=True)
# Labels are encoded 'no' -> 0 and 'yes' -> 1 and the report lists labels in
# ascending order, so the display names must be [name for 0, name for 1].
print(classification_report(golds, preds, labels=[0, 1], target_names=['not commission', 'commission']))
wrong_index, wrong_files = get_wrong_files(golds, preds, files_index)

In [None]:
# build classifier with all data, using the same settings that were
# cross-validated above (fixed seed, balanced class weights) so the final
# model is reproducible and matches the reported validation results
clf = RandomForestClassifier(random_state=25, class_weight="balanced")
clf.fit(X_dat_if, Y_dat)

In [None]:
# find document to predict
pred_docs = [doc for doc in fs.find({'lang': 'en'}, 
                               {'tc_features':1, 'topword_features':1, 'keyword_features':1, 'classifications':1, 'filename':1, 'country':1})]

In [None]:
# construction the data
pred_dat, pred_files_index = data_construct(pred_docs, ['tc_features', 'topword_features', 'keyword_features'], 
                                           mode = 'prediction')

In [None]:
important_features_list = [e[0] for e in important_features]

In [None]:
# transform the data to fit in the classifier
def predict_data_transform(pred_dat, feature_list):
    """
    Vectorize prediction feature dicts and keep only the columns in feature_list.

    @param pred_dat: a list of dictionaries (features) of the documents to predict
    @param feature_list: vectorized feature names to keep, in the column order
                         the classifier was trained on
    @return: a list with one numpy row per document, columns ordered as feature_list.
             A feature that never occurs in pred_dat becomes a column of zeros
             instead of raising KeyError. An empty pred_dat gives an empty list.
    """
    if not pred_dat:
        return []
    v = DictVectorizer(sparse=False)
    pred_matrix = v.fit_transform(pred_dat)
    # start from zeros so features missing from the prediction data stay 0
    selected = np.zeros((pred_matrix.shape[0], len(feature_list)), dtype=pred_matrix.dtype)
    for col, feature in enumerate(feature_list):
        src = v.vocabulary_.get(feature)
        if src is not None:
            selected[:, col] = pred_matrix[:, src]
    return list(selected)
X_pred = predict_data_transform(pred_dat, important_features_list)

In [None]:
np.asarray(X_pred).shape

In [None]:
predict_map = {'1':'yes', '0':'no'}

In [None]:
fs.update_many({}, {'$set': {'predictions.Commission': 'undefined'}})

In [None]:
pred_files_index[0]

In [None]:
# Predict every prepared document and store the decoded label on it.
pred_fieid = 'predictions.Commission'
preds = [predict_map[str(label)] for label in clf.predict(X_pred)]
cnt = 0
for i, pred_label in enumerate(preds):
    cnt = i + 1
    file_info = pred_files_index[i]
    fs.update_one({'_id': file_info[0]}, {'$set': {pred_fieid: pred_label}})
    print('Updating: ', cnt, file_info, pred_label)

### get predictions