In [None]:
# Libs imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.models import FastText
import spacy
import re
from catboost import CatBoostClassifier
from catboost import Pool
from datetime import datetime


pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Gather every merchant name (extra + train + test) into one text file, one
# name per line; the FastText cell below reads this file as its corpus.
# NOTE(review): this cell reads 'train.csv'/'test.csv' while the loading cell
# further down reads 'Train.csv'/'Test.csv' — confirm the on-disk spelling,
# the two differ on case-sensitive filesystems.
merchant_name_columns = [
    pd.read_csv(csv_name)['MERCHANT_NAME']
    for csv_name in ('extra_data.csv', 'train.csv', 'test.csv')
]
pd.concat(merchant_name_columns).to_csv('all_labels3.txt', index=False, header=False)

In [None]:
# Corpus file used to build the FastText vocabulary and to train the model.
text_file = 'all_labels3.txt'  # earlier alternative: "items_unique_text.txt"


class FileLinesIter:
    """Re-iterable corpus that streams whitespace-tokenised lines from a text file.

    Every call to ``__iter__`` reopens the file, so one instance can be
    consumed several times (e.g. once per pass over the corpus).
    """

    def __init__(self, filename):
        """Remember the path of the text file to stream from.

        Args:
            filename: Path of the corpus file; it is only opened on iteration.
        """
        self.filename = filename

    def __iter__(self):
        """Yield one list of tokens per line, split on runs of whitespace.

        A blank line yields an empty list.
        """
        with open(self.filename, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield line.split()

class ListLinesIter:
    """Re-iterable corpus over an in-memory sequence, reshuffled on every pass."""

    def __init__(self, goods_names):
        """Keep a reference to the items to iterate over.

        Args:
            goods_names: Mutable sequence of items; it is shuffled in place
                each time iteration starts.
        """
        self.goods_names = goods_names

    def __iter__(self):
        """Shuffle the stored sequence in place, then yield its items in order."""
        # Imported locally because the notebook's import cell does not import
        # ``random``; without it the first iteration raises NameError.
        import random

        random.shuffle(self.goods_names)
        yield from self.goods_names

#### Make and train the FastText model

model_fname = "fast_text_best.ft"

# Hyper-parameters collected in one place; the commented-out entries are
# settings that were left disabled.
fasttext_params = dict(
    vector_size=21,  # dimensionality of the word vectors (embeddings)
    window=3,        # max distance between the current and predicted word
    min_count=1,     # words with a lower total frequency would be ignored
    workers=1,       # 8 was used previously
    seed=1,
    # negative=5,
    # min_alpha=0.000001,
    # max_vocab_size=500_000,
    # bucket=1_000_000,
    sg=1,            # 1 -> skip-gram
)
model_good = FastText(**fasttext_params)

# FileLinesIter reopens the file on every pass, so one instance serves both
# the vocabulary scan and the training epochs.
merchant_corpus = FileLinesIter(text_file)
model_good.build_vocab(corpus_iterable=merchant_corpus)

total_examples = model_good.corpus_count
total_words = model_good.corpus_total_words

model_good.train(
    corpus_iterable=merchant_corpus,
    total_examples=total_examples,
    epochs=20,
)

model_good.save(model_fname)

In [None]:
# Directory prefix prepended to every data file name; empty string means the
# files are looked up relative to the current working directory.
# Colab alternative: path = '/content/drive/MyDrive/Colab Notebooks/alvinapp/'
path = ""

# Load the files into pandas DataFrames.
# NOTE(review): the corpus-building cell near the top reads 'train.csv' and
# 'test.csv' (lower-case) — confirm which spelling matches the files on disk.
train = pd.read_csv(path+'Train.csv')
test = pd.read_csv(path+'Test.csv')
extra = pd.read_csv(path+'extra_data.csv')
ss = pd.read_csv(path+'SampleSubmission.csv')

The biggest challenge in this competition is the lack of labeled data, so we take some unlabeled rows from the extra data and label them ourselves using simple keyword heuristics.

In [None]:
# Pseudo-label rows of the extra data with keyword heuristics.
# Each rule is (required substrings, category): a row matches when its merchant
# name contains ALL of the substrings. Rules are applied top to bottom, so when
# several rules match the same row the LAST matching rule wins.
KEYWORD_LABEL_RULES = [
    (('PHARMACY',), 'Health'),
    (('CREDIT',), 'Loan Repayment'),
    (('GRILL',), 'Going out'),
    (('INSURANCE',), 'Emergency fund'),
    (('PIZZA',), 'Going out'),
    (('LOAN',), 'Loan Repayment'),
    (('ARTCAFFE',), 'Going out'),
    (('SAVINGS',), 'Emergency fund'),
    (('CASHNOW',), 'Loan Repayment'),
    (('GOOGLE',), 'Bills & Fees'),
    (('DECATHLON',), 'Miscellaneous'),
    (('KANDAMOJA APP',), 'Loan Repayment'),
    (('CLINIC',), 'Health'),
    (('GAS', 'STATION'), 'Transport & Fuel'),
    (('FUEL',), 'Transport & Fuel'),
]

for required_substrings, category in KEYWORD_LABEL_RULES:
    matches = pd.Series(True, index=extra.index)
    for substring in required_substrings:
        # regex=False: the keywords are literal text.
        # na=False: a missing merchant name matches nothing, instead of
        # producing a NaN entry in the mask that .loc refuses to index with.
        matches &= extra['MERCHANT_NAME'].str.contains(substring, regex=False, na=False)
    extra.loc[matches, 'MERCHANT_CATEGORIZED_AS'] = category

## 1. Load the dataset

In [None]:
# Shapes of the train/test frames as loaded, before the pseudo-labelled extra
# rows are appended to train.
print('Train data shape :', train.shape)
print('Test data shape :', test.shape)

In [None]:
# Append the extra rows that received a heuristic label to the training set.
# NOTE: re-running this cell appends the same rows again.
labelled_extra = extra[extra['MERCHANT_CATEGORIZED_AS'].notna()]
train = pd.concat([train, labelled_extra]).reset_index(drop=True)

In [None]:
# Shapes again, now that train includes the pseudo-labelled extra rows
# (test is unchanged).
print('Train data shape :', train.shape)
print('Test data shape :', test.shape)

In [None]:
# Number of training rows per target category (pseudo-labelled rows included).
train['MERCHANT_CATEGORIZED_AS'].value_counts()

In [None]:
# Construct named-entity indicator features with spaCy.

nlp = spacy.load('en_core_web_lg')


def entity_label_dummies(names):
    """Return one indicator column per spaCy entity label found in each name.

    Args:
        names: Series of merchant-name strings.

    Returns:
        DataFrame with one row per index value of ``names`` and one column per
        entity label seen anywhere in ``names``; a name without entities gets
        an all-zero row.
    """
    labels_per_name = names.apply(
        lambda name: list({ent.label_ for ent in nlp(name.lower()).ents})
    )
    # explode() repeats the row index once per label, so the dummies are summed
    # back per original row. groupby(level=0).sum() replaces the
    # DataFrame.sum(level=0) form, which pandas 2.0 removed.
    return pd.get_dummies(labels_per_name.explode()).groupby(level=0).sum()


train_entities = entity_label_dummies(train['MERCHANT_NAME'])
# Give test exactly the entity columns train has (labels absent from test
# become 0, labels unseen in train are dropped) so both frames expose the same
# feature set to the model.
test_entities = entity_label_dummies(test['MERCHANT_NAME']).reindex(
    columns=train_entities.columns, fill_value=0
)

train = pd.concat([train, train_entities], axis=1)
test = pd.concat([test, test_entities], axis=1)

In [None]:
# Remove the entity indicators that are not kept as features.
unused_entity_cols = ['CARDINAL', 'GPE', 'ORG', 'PRODUCT']
for frame in (train, test):
    frame.drop(columns=unused_entity_cols, inplace=True)

In [None]:
# constructing Seasonal features

def season_funct(x):
    """Map a month number to a season label.

    Args:
        x: Month number (1-12).

    Returns:
        'DRY' for months 11, 12, 1, 2 and 3; 'WET' for any other value.
    """
    dry_months = (11, 12, 1, 2, 3)
    return 'DRY' if x in dry_months else 'WET'

# pd.to_datetime replaces .astype('datetime64'): pandas 2.x rejects that
# unit-less dtype string. utc=True puts tz-aware and naive timestamps on one
# clock, so the result is always a datetime column and .dt.month is available.
train['season'] = pd.to_datetime(train['PURCHASED_AT'], utc=True).dt.month.apply(season_funct)
test['season'] = pd.to_datetime(test['PURCHASED_AT'], utc=True).dt.month.apply(season_funct)

In [None]:
# Number the target categories 1..N, in the order pandas lists them after the
# column is cast to 'category', and wrap the mapping under the column name.
labels_train = train['MERCHANT_CATEGORIZED_AS'].astype('category').cat.categories.tolist()
label_codes = {label: code for code, label in enumerate(labels_train, start=1)}
replace_map_train = {'MERCHANT_CATEGORIZED_AS': label_codes}
print("Train data: ", replace_map_train)

In [None]:
# Missing genders default to "Male".
for frame in (train, test):
    frame["USER_GENDER"] = frame["USER_GENDER"].fillna("Male")

In [None]:
# USER_AGE is not used as a feature; remove it from both frames.
for frame in (train, test):
    frame.drop(columns=['USER_AGE'], inplace=True)

In [None]:
# Fixed bucket list: declaring it up front guarantees train and test both get
# the columns USER_HOUSEHOLD_1 .. USER_HOUSEHOLD_4, even when one of the
# frames happens not to contain a given bucket.
HOUSEHOLD_BUCKETS = [1, 2, 3, 4]


def household_dummies(household):
    """One-hot encode household size into the buckets 1, 2, 3 and 4.

    Values equal to 1, 2 or 3 keep their own bucket; every other value
    (missing values included) falls into bucket 4.

    Args:
        household: Series of household sizes.

    Returns:
        DataFrame indexed like ``household`` with one indicator column per
        bucket, named ``USER_HOUSEHOLD_<bucket>``.
    """
    buckets = household.apply(
        lambda size: 1 if size == 1 else 2 if size == 2 else 3 if size == 3 else 4
    )
    buckets = buckets.astype(pd.CategoricalDtype(categories=HOUSEHOLD_BUCKETS))
    return pd.get_dummies(buckets, prefix='USER_HOUSEHOLD')


train = pd.concat([train, household_dummies(train['USER_HOUSEHOLD'])], axis=1)

test = pd.concat([test, household_dummies(test['USER_HOUSEHOLD'])], axis=1)

In [None]:
# The raw household column is replaced by its one-hot columns; drop it.
for frame in (train, test):
    frame.drop(columns=['USER_HOUSEHOLD'], inplace=True)

In [None]:
# Load the FastText model from disk; the file name is the one the training
# cell saved to (model_fname).
model_wv = FastText.load("fast_text_best.ft")

In [None]:
# Construct embedding features: each merchant name becomes the mean of the
# FastText vectors of its whitespace-separated tokens. A name without any
# token has nothing to average, so it gets a zero vector and every row keeps
# the same number of embedding columns.
train_name_vectors = [
    np.array([model_wv.wv[token] for token in name.split()]).mean(axis=0)
    if name.split()
    else np.zeros(model_wv.wv.vector_size)
    for name in train['MERCHANT_NAME']
]
# index=train.index keeps the embeddings aligned row-for-row with train even
# when its index is not the default 0..n-1 range. The new columns are labelled
# with integers 0..vector_size-1.
train = pd.concat([train, pd.DataFrame(train_name_vectors, index=train.index)], axis=1)

In [None]:
# Embedding features for test: mean FastText vector of the name's
# whitespace-separated tokens, or a zero vector when the name has no token.
test_name_vectors = [
    np.array([model_wv.wv[token] for token in name.split()]).mean(axis=0)
    if name.split()
    else np.zeros(model_wv.wv.vector_size)
    for name in test['MERCHANT_NAME']
]
# index=test.index keeps the embeddings aligned row-for-row with test even
# when its index is not the default 0..n-1 range.
test = pd.concat([test, pd.DataFrame(test_name_vectors, index=test.index)], axis=1)

In [None]:
# Columns removed from the frames before modelling. The drop cell below removes
# all of them from train and all but 'Transaction_ID' from test.
to_drop_cols = ['MERCHANT_CATEGORIZED_AT', 'MERCHANT_NAME', 'PURCHASED_AT', 'USER_ID', 'Transaction_ID']#, 'USER_HOUSEHOLD_1']

In [None]:
# Construct the list of the most frequent words in the training merchant names.
# Runs of non-alphanumeric characters become a single space, names are
# upper-cased and split on whitespace. Unlike split(' '), .str.split() never
# yields empty tokens, so '' can no longer end up among the frequent "words"
# (where it would produce an indicator that is 1 for every row).
s = (
    train['MERCHANT_NAME']
    .str.replace(r'[^A-Za-z0-9]+', ' ', regex=True)
    .str.upper()
    .str.split()
)

# One row per token (despite the name, values are NOT de-duplicated).
# explode() replaces the much slower apply(pd.Series).stack(); dropna() removes
# the placeholder rows explode() emits for names without tokens.
unique_words = s.explode().dropna().reset_index(drop=True)

# The 25 most frequent tokens, most common first.
vals = unique_words.value_counts().head(25).index.tolist()

In [None]:
# One 0/1 indicator column per frequent word, named after the word itself.
# NOTE(review): the words were derived from upper-cased names, but the match
# below is a case-sensitive substring search on the raw name — confirm that is
# the intended behaviour.
for word in vals:
    for frame in (train, test):
        frame[word] = frame['MERCHANT_NAME'].str.contains(word).astype(int)

In [None]:
# test keeps 'Transaction_ID' (the submission cell reads it); train drops
# every column listed in to_drop_cols.
test_drop_cols = [col for col in to_drop_cols if col != 'Transaction_ID']
test.drop(columns=test_drop_cols, inplace=True)
train.drop(columns=to_drop_cols, inplace=True)

In [None]:
# Reduce skewness: replace each listed column by log(1 + value) in both frames.
for skewed_col in ('PURCHASE_VALUE', 'USER_INCOME'):
    for frame in (test, train):
        frame[skewed_col] = np.log1p(frame[skewed_col])

In [None]:
def create_binary_cols(content):
    """Convert a boolean-like entry of the data set to a numeric code.

    Mapping: False/'N'/'Male' -> 0, True/'Y'/'Female' -> 1, 'Unknown' -> -1.
    Because 0 == False and 1 == True, numeric zeros and ones map to the
    integers 0 and 1 as well. Any other value is returned unchanged.

    Args:
        content: A single cell value.

    Returns:
        The numeric code, or ``content`` itself when no mapping applies.
    """
    codes = {
        False: 0,
        True: 1,
        'N': 0,
        'Y': 1,
        'Male': 0,
        'Female': 1,
        'Unknown': -1,
    }
    return codes.get(content, content)

In [None]:
# Encode the gender column and the M-Pesa send-money flag with
# create_binary_cols, in both frames.
for binary_col in ('USER_GENDER', 'IS_PURCHASE_PAID_VIA_MPESA_SEND_MONEY'):
    for frame in (train, test):
        frame[binary_col] = frame[binary_col].apply(create_binary_cols)

In [None]:
# Split the training frame into the target column and the feature matrix.
y = train["MERCHANT_CATEGORIZED_AS"]
X = train.drop(columns=["MERCHANT_CATEGORIZED_AS"])

In [None]:
# First-pass classifier; its feature importances are used further down to
# prune features before the final fit. Commented-out entries are settings
# that were left disabled.
catboost_params = dict(
    n_estimators=575,
    depth=6,
    learning_rate=0.05,
    random_strength=0.5,
    # eval_metric='AUC',
    od_type="Iter",
    # l2_leaf_reg=100,
    od_wait=100,
    task_type="GPU",
)
model = CatBoostClassifier(**catboost_params)

In [None]:
# Fit on the whole training set (no validation split is passed here);
# 'season' is the only column declared as categorical.
model.fit(
    X, y,
    cat_features = ['season'],
    plot = True, 
    verbose = False
)

In [None]:
def plot_feature_importance(importance, names, model_type, top_n=300):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Importance score of each feature.
        names: Feature names, in the same order as ``importance``.
        model_type: Label used as the prefix of the chart title.
        top_n: Maximum number of features to plot; defaults to 300, the value
            that used to be hard-coded.
    """
    # Pair names with scores and order them by decreasing importance.
    fi_df = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)
    top_features = fi_df.head(top_n)

    # Tall figure so that many feature names stay readable on the y axis.
    plt.figure(figsize=(12, 25))
    sns.barplot(x=top_features['feature_importance'], y=top_features['feature_names'])
    # Leading space added: plain concatenation used to render titles such as
    # "catboostFEATURE IMPORTANCE".
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Feature importances of the first-pass model (all features).
plot_feature_importance(model.feature_importances_,model.feature_names_, 'catboost')

In [None]:
# Importance table of the fitted model, most important feature first.
importance = model.feature_importances_
names = model.feature_names_

feature_importance = np.array(importance)
feature_names = np.array(names)

data = {'feature_names': feature_names, 'feature_importance': feature_importance}
fi_df = pd.DataFrame(data).sort_values(by=['feature_importance'], ascending=False)

In [None]:
# Drop the features whose importance in the first model is exactly 0.
zero_importance_names = set(
    fi_df.loc[fi_df['feature_importance'] == 0, 'feature_names'].astype(str)
)
# Columns are matched on str(column): the embedding columns carry integer
# labels, and the names reported by the model may be their string form, so a
# direct drop-by-name could raise KeyError. Selecting from X.columns also
# makes the cell safe to re-run after the columns are already gone.
cols_to_drop = [col for col in X.columns if str(col) in zero_importance_names]

X.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Final classifier: same hyper-parameters as the first pass, trained on the
# pruned feature set.
final_params = dict(
    n_estimators=575,
    depth=6,
    learning_rate=0.05,
    random_strength=0.5,
    # eval_metric='AUC',
    od_type="Iter",
    # l2_leaf_reg=100,
    od_wait=100,
    task_type="GPU",
)
model = CatBoostClassifier(**final_params)

# NOTE(review): 'season' must still be a column of X here — confirm it is
# never among the zero-importance columns dropped above.
model.fit(X, y, cat_features=['season'], plot=True, verbose=False)

In [None]:
# Feature importances of the final model (pruned feature set).
plot_feature_importance(model.feature_importances_,model.feature_names_, 'catboost')

### Making submission

In [None]:
# Assemble the submission: Transaction_ID followed by one probability column
# per class, predicted from the same feature columns the model was fit on.
transaction_ids = test['Transaction_ID'].reset_index(drop=True)
class_probabilities = pd.DataFrame(
    model.predict_proba(test[X.columns]),
    columns=model.classes_,
)
sub = pd.concat([transaction_ids, class_probabilities], axis=1)  # reordered to ss.columns when saved

Save the results to a CSV file.

In [None]:
# Create the submission csv file: columns ordered like the sample submission,
# file name stamped with the current date and time.
timestamp = datetime.now().strftime("%d_%m_%Y__%H_%M_%S")
sub[ss.columns].to_csv(f'sub_{timestamp}.csv', index=False)