# Import modules to preprocess data and make models.


In [1]:
# install catboost to use CatboostClassifier and Tokenizer
!pip install catboost

# modules to transform the data
import pandas as pd
import numpy as np
import tensorflow as tf

# special parts to prepare text data
from sklearn.feature_extraction.text import CountVectorizer
from catboost.text_processing import Tokenizer

# to split data and score the model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# models
from sklearn.linear_model import LogisticRegression
from catboost import Pool, CatBoostClassifier
from tensorflow.keras import Sequential



# Read dataset and prepare it

In [2]:
# read from github
url = 'https://raw.githubusercontent.com/Losvuex/Predicting-successful-upsells/main/data.csv'
cdf = pd.read_csv(url)

# leave onlly lines with text
cdf = cdf[cdf['Диалог'] == cdf['Диалог']]
cdf=cdf.reset_index()

# rename to easy usage
cdf = cdf.rename(columns={'Диалог': 'dialogue', 'Допродажи': 't', 'Успех допродажи': 'st'})


# transform class's columns to int type
cdf.t = pd.Categorical(cdf.t)
cdf['t'] = cdf.t.cat.codes
cdf.st = pd.Categorical(cdf.st)
cdf['st'] = cdf.st.cat.codes

# make terms columns and code classes
cdf['terms']=[list(['t' if a == 0 else 'Nan','st' if b == 0 else 'Nan']) for a,b in zip(cdf.t,cdf.st)]
cdf = cdf.drop(columns=['t', 'st'])
cdf['terms'] = cdf['terms'].apply(
    lambda row: [len(val) for val in row if val != 'Nan']
)
cdf['terms'] = cdf['terms'].apply(
    lambda row: len(row)
)
# 0 = no upsells
# 1 = try to make upsell
# 2 = successful try

In [3]:
# make vector from dialogue, use ngrams
vectorizerCount = CountVectorizer(analyzer='word', ngram_range = (6, 6), token_pattern=None,
                                  tokenizer=Tokenizer(separator_type='BySense',token_types=['Word']).tokenize)
allDataVectorized = pd.DataFrame(vectorizerCount.fit_transform(cdf['dialogue']).todense())

# leave only ngrams with often appearence
indexMore = next(x for x, val in enumerate(allDataVectorized.sum(axis=0).sort_values()) if val > 3)
index_new = allDataVectorized.sum(axis=0).sort_values()[indexMore:].index
dfFrequent = allDataVectorized[index_new]

# add vector to data
cdf_vec = pd.concat([cdf,dfFrequent], axis=1)
cdf_vec.columns = cdf_vec.columns.astype(str)

In [4]:
#split to train and test data
test_split = 0.2

train_vec, test_vec = train_test_split(
    cdf_vec,
    test_size=test_split,
    stratify=cdf_vec["terms"].values,
)

X_train = train_vec.drop(['terms', 'dialogue'], axis=1)
X_test = test_vec.drop(['terms', 'dialogue'], axis=1)
y_train = train_vec['terms']
y_test = test_vec['terms']

# Fit and predict

In [5]:
clf = LogisticRegression(max_iter = 10000, random_state = 0).fit(X_train, y_train)
pred = clf.predict(X_test)
lr_score = accuracy_score(y_test, pred)

In [6]:
train_pool = Pool(data = X_train, label = y_train)
test_pool = Pool(data = X_test, label = y_test)

model = CatBoostClassifier(iterations = 100, eval_metric = 'Accuracy', depth = 5, random_seed = 42)
res = model.fit(train_pool, eval_set = test_pool, verbose = 0)
cc_score = res.best_score_['validation']['Accuracy']

In [7]:
# divide class column to 0-1 columns
v = tf.keras.utils.to_categorical(cdf_vec['terms'], num_classes=3)
cdf_vec['n'] = v[:,0]
cdf_vec['t'] = v[:,1]
cdf_vec['st'] = v[:,2]

train_vec, test_vec = train_test_split(
    cdf_vec,
    test_size=test_split,
    stratify=cdf_vec["terms"].values,
)

X_train = train_vec.drop(['terms', 'dialogue', 'n', 't', 'st'], axis=1)
X_test = test_vec.drop(['terms', 'dialogue', 'n', 't', 'st'], axis=1)
y_train = train_vec[['n', 't', 'st']]
y_test = test_vec[['n', 't', 'st']]

model = Sequential()
model.add(tf.keras.layers.Dense(10, input_dim=X_train.shape[1], activation='relu'))
model.add(tf.keras.layers.Dense(10, activation="relu"))
model.add(tf.keras.layers.Dense(3, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer= "adam", metrics=['accuracy'])

model.fit(X_train, y_train, epochs = 10, verbose = 0)

_, ks_score = model.evaluate(X_test, y_test, verbose = 0)

# Score of models

In [8]:
print(f"sklearn.linear_model.LogisticRegression: {round(lr_score * 100, 2)}%.")
print(f"Catboost.CatBoostClassifier: {round(cc_score * 100, 2)}%.")
print(f"Keras.Sequential: {round(ks_score * 100, 2)}%.")

#  we use accuracy, it is not the best metric to this case due to different factors

sklearn.linear_model.LogisticRegression: 65.88%.
Catboost.CatBoostClassifier: 67.06%.
Keras.Sequential: 62.35%.
