In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import f1_score

# Load Data

In [3]:
# header=0 consumes the file's own header row; names= then supplies the
# column labels ("term", "tag") used throughout the notebook.
df = pd.read_csv(
    "../data/kaggle/search_terms/trainSet.csv",
    names=["term", "tag"],
    header=0,
)

df.head()

Unnamed: 0,term,tag
0,fire detection shop,19
1,cheap couch roll,398
2,extra watermelon gum,1108
3,used generators for sale uk,213
4,rose gold pearl necklace,821


In [4]:
# Distinct tag labels in ascending order.
tags = np.sort(df["tag"].unique())
print(tags)

[   0    1    2 ... 1416 1417 1418]


In [5]:
# All rows labelled with tag 0.
df0 = df.loc[df["tag"] == 0]

df0

Unnamed: 0,term,tag
777,buy expired domains,0
1562,godaddy promo code,0
2245,art domain,0
4535,value my domain,0
5184,how to buy domain name,0
...,...,...
594991,register uk domain,0
596408,domain suggestion,0
599311,purchase domain,0
601756,1st domains,0


In [6]:
# Draw as many rows from the other tags as there are tag-0 rows (fixed seed),
# and relabel them -1. assign() returns a new frame, so re-running the cell
# always starts from df and never writes into an intermediate selection.
df_0 = (
    df[df["tag"] != 0]
    .sample(n=len(df0), random_state=42)
    .assign(tag=-1)
)

df_0

Unnamed: 0,term,tag
426700,personalised car mats,-1
193749,dragon sailing boat for sale,-1
31895,nutrition coach online,-1
605330,graphic novel publishers uk,-1
269658,free file recovery,-1
...,...,...
402702,montgenevre ski,-1
320364,jojoba oil uk,-1
470500,single duvet cover size,-1
418948,survival food list,-1


In [7]:
# Stack the tag-0 rows on top of the relabelled (-1) sample. The original row
# indices are kept and the rows are not shuffled at this point.
data = pd.concat([df0, df_0])

data

Unnamed: 0,term,tag
777,buy expired domains,0
1562,godaddy promo code,0
2245,art domain,0
4535,value my domain,0
5184,how to buy domain name,0
...,...,...
402702,montgenevre ski,-1
320364,jojoba oil uk,-1
470500,single duvet cover size,-1
418948,survival food list,-1


# Split dataset

In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data["term"],
    data["tag"],
    test_size=0.2,
    random_state=42,
)

# Model Build and Validation

In [9]:
# TF-IDF features feed a linear SVM. The SVM is wrapped in
# CalibratedClassifierCV so the fitted pipeline can return class
# probabilities through predict_proba, which later cells rely on.
# The step name "tdidf" is kept as spelled: it is part of the pipeline's
# parameter names and of any model already saved with it.
model = Pipeline(
    steps=[
        ("tdidf", TfidfVectorizer()),
        ("clf", CalibratedClassifierCV(LinearSVC())),
    ],
    verbose=1,
)

In [53]:
model.fit(X_train, y_train)

[Pipeline] ............. (step 1 of 2) Processing tdidf, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.0s


Pipeline(steps=[('tdidf', TfidfVectorizer()),
                ('clf', CalibratedClassifierCV(base_estimator=LinearSVC()))],
         verbose=1)

In [54]:
y_pred = model.predict(X_test)

In [55]:
print(f1_score(y_test, y_pred, average='weighted'))

0.9506227106227108


In [56]:
y_pred

array([-1,  0,  0, -1, -1, -1, -1,  0, -1, -1, -1, -1,  0,  0,  0, -1,  0,
        0, -1, -1,  0,  0,  0, -1, -1, -1,  0, -1, -1,  0,  0, -1,  0, -1,
        0, -1, -1,  0,  0, -1,  0,  0,  0, -1, -1,  0,  0,  0,  0,  0, -1,
       -1, -1, -1,  0,  0,  0,  0, -1, -1, -1,  0, -1, -1, -1,  0, -1, -1,
       -1,  0,  0, -1, -1, -1,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0, -1,  0, -1, -1,  0,  0, -1, -1, -1,  0, -1,  0, -1,
        0, -1,  0, -1, -1, -1, -1,  0,  0,  0, -1, -1,  0, -1, -1, -1, -1,
        0, -1,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,  0,
        0, -1, -1,  0, -1,  0,  0,  0, -1, -1, -1,  0, -1,  0, -1,  0,  0,
       -1,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,  0,  0,
       -1, -1,  0, -1, -1, -1, -1, -1, -1,  0, -1,  0])

In [57]:
y_test.to_numpy()

array([-1,  0,  0, -1, -1, -1, -1,  0, -1, -1, -1, -1,  0,  0,  0, -1,  0,
        0, -1, -1,  0,  0,  0, -1, -1, -1,  0, -1, -1,  0,  0, -1,  0, -1,
        0, -1, -1,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0, -1,
       -1, -1, -1,  0,  0,  0,  0, -1, -1, -1,  0, -1, -1,  0,  0, -1, -1,
       -1,  0,  0, -1, -1, -1,  0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0, -1,  0, -1,  0, -1,  0,  0,  0, -1, -1,  0,  0,  0,  0,  0,
        0, -1,  0, -1, -1, -1, -1, -1,  0,  0,  0, -1,  0, -1, -1, -1, -1,
        0, -1,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,  0,
        0, -1, -1,  0, -1,  0,  0,  0, -1, -1, -1,  0, -1,  0, -1,  0,  0,
       -1,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1,  0,  0,
       -1, -1,  0, -1,  0, -1, -1, -1, -1,  0, -1,  0])

In [61]:
model.classes_.tolist()

[-1, 0]

In [85]:
# model.classes_ is [-1, 0] (see the cell above), so column 1 of predict_proba
# is the probability that the term belongs to tag 0.
model.predict_proba(["personalised car mats"])[0][1]

0.01286976165931952

# Save model

In [48]:
from joblib import dump

In [49]:
# NOTE(review): the model is written under ../models/, while the cells further
# down load models from ../src/models/ — confirm which directory is intended.
dump(model, '../models/model_0.joblib') 

['../models/model_0.joblib']

In [62]:
temp = df[df["tag"] == 587]

temp

Unnamed: 0,term,tag
124006,client onboarding investment banking,587
143200,fintech big data,587
333392,innovation in financial services,587
373541,m commerce uk,587


In [63]:
# Repeat the tag-587 rows five times and renumber the index from 0.
new_df = pd.concat([temp for _ in range(5)], ignore_index=True)

new_df

Unnamed: 0,term,tag
0,client onboarding investment banking,587
1,fintech big data,587
2,innovation in financial services,587
3,m commerce uk,587
4,client onboarding investment banking,587
5,fintech big data,587
6,innovation in financial services,587
7,m commerce uk,587
8,client onboarding investment banking,587
9,fintech big data,587


In [64]:
from joblib import load

In [72]:
# NOTE(review): joblib.load is pickle-based and can execute code embedded in the
# file — only load model files that come from a trusted source.
clf = load("../src/models/model_1.joblib")

# The two labels this model was fitted on; index 1 is the tag it detects.
clf.classes_

array([-1,  1])

array([ -1, 100])

In [73]:
from glob import glob

In [77]:
len(glob("../src/models/*.joblib"))

1419

In [108]:
def predict_tag(text: str, top_n: int):
    """Rank tags for ``text`` using the per-tag models stored on disk.

    Every ``*.joblib`` file under ``../src/models/`` is loaded and asked for
    the probability of its second class (``classes_[1]``, the tag that model
    detects). Tags are then ordered by that probability, highest first; ties
    are broken by ascending tag label.

    Scores are keyed by tag label rather than stored in a list indexed by the
    label, so a label that is negative or not smaller than the number of model
    files can neither wrap around nor raise ``IndexError``, and tags without a
    model file are not ranked with a placeholder score of 0.

    Args:
        text: The search term to classify.
        top_n: Maximum number of tags to return.

    Returns:
        A list of at most ``top_n`` integer tag labels, most probable first.

    Note:
        All model files are re-loaded on every call, which is slow when many
        models exist. Model files are unpickled by ``load``, so they must come
        from a trusted source.
    """
    scores = {}

    # Sorted paths make the iteration order independent of the filesystem.
    for path in sorted(glob("../src/models/*.joblib")):
        clf = load(path)
        tag = int(clf.classes_[1])
        scores[tag] = clf.predict_proba([text])[0][1]

    # Order labels ascending first so that the stable sort below keeps
    # equal-probability tags in ascending label order.
    ranked = sorted(sorted(scores), key=scores.__getitem__, reverse=True)
    return ranked[:top_n]

In [90]:
# NOTE(review): called without top_n, which the predict_tag definition in this
# notebook requires. The outputs below appear to come from an earlier version
# of the function that returned the full list of probabilities.
probs = predict_tag("client onboarding investment banking")

In [91]:
probs[587]

0.6968584688016274

In [92]:
probs[1257]

0.9999995729566228

In [94]:
d = df[df["tag"] == 1257]

d

Unnamed: 0,term,tag
794,aberdeen investments,1257
2779,uk property funds,1257
3333,jpm emerging markets fund,1257
4394,property unit trusts,1257
4577,artemis investments,1257
...,...,...
601784,compare funds,1257
604498,baillie gifford corporate bond,1257
604899,investment trust discounts,1257
606081,money market funds uk,1257


In [95]:
d1 = df[df["tag"] == 587]

d1

Unnamed: 0,term,tag
124006,client onboarding investment banking,587
143200,fintech big data,587
333392,innovation in financial services,587
373541,m commerce uk,587


In [97]:
# NOTE(review): no top_n is passed although the predict_tag definition in this
# notebook requires it; the single-tag output below presumably predates it.
predict_tag("investment trust discounts")

108

In [98]:
df[df["tag"] == 108]

Unnamed: 0,term,tag
655,montage wealth management,108
661,st james place wealth management charges,108
3683,ubs investments,108
3785,fintech report,108
3826,succession wealth management,108
...,...,...
602484,investec asset finance,108
602652,gresham investment management,108
603849,barclays trader,108
604317,fund administration jobs,108


In [99]:
# NOTE(review): top_n is missing here; this call fails against the
# predict_tag definition in this notebook.
predict_tag("portman asset finance ltd")

108

In [100]:
# NOTE(review): top_n is missing here; this call fails against the
# predict_tag definition in this notebook.
predict_tag("gresham investment management")

1257

In [102]:
# NOTE(review): top_n is missing here; this call fails against the
# predict_tag definition in this notebook.
probs = predict_tag("portman asset finance ltd")

In [106]:
# TODO: unfinished scratch cell. The original content was a bare `res =` with
# no right-hand side, which is a SyntaxError and stops "Run All"; it is
# commented out until a value is supplied.
# res =

In [114]:
predict_tag("fintech big data", 20)

[1202,
 1197,
 64,
 798,
 420,
 33,
 421,
 158,
 1352,
 691,
 1165,
 319,
 855,
 874,
 1093,
 715,
 776,
 236,
 1173,
 840]