In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from bot_config import BOT_CONFIG

### Подготовка датасета

In [2]:
dataset = []

for intent, intent_data in BOT_CONFIG["intents"].items():
    for example in intent_data["examples"]:
        dataset.append((example, intent))

In [3]:
len(dataset)

828

In [4]:
dataset[:10]

[('здарова', 'hello'),
 ('алоха', 'hello'),
 ('Hello', 'hello'),
 ('Whats up', 'hello'),
 ('приветствую', 'hello'),
 ('Здравствуйсте', 'hello'),
 ('ку', 'hello'),
 ('Приффет! Как делиффки?', 'hello'),
 ('Вечер в хату', 'hello'),
 ('hello', 'hello')]

In [5]:
corpus = [text for text, intent in dataset]
y = [intent for text, intent in dataset]

In [6]:
corpus[:5]

['здарова', 'алоха', 'Hello', 'Whats up', 'приветствую']

In [7]:
y[:5]

['hello', 'hello', 'hello', 'hello', 'hello']

### Векторизация

In [58]:
vectorizer = CountVectorizer(analyzer="char", ngram_range=(3, 3))
X = vectorizer.fit_transform(corpus)

In [9]:
# print(vectorizer.get_feature_names())
print(X.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Классификация

In [10]:
clf = LogisticRegression()
clf.fit(X, y)

LogisticRegression()

In [12]:
clf.predict(vectorizer.transform(["привет", "эй робот привет"]))

array(['hello', 'hello'], dtype='<U22')

In [13]:
clf.predict(vectorizer.transform(["что любишь слушать"]))

array(['music'], dtype='<U22')

In [15]:
clf.predict_proba(vectorizer.transform(["эй робот привет"]))

array([[0.00374701, 0.00295317, 0.00364921, 0.00194462, 0.00317425,
        0.00233095, 0.00206893, 0.00214798, 0.00256883, 0.00555842,
        0.00346639, 0.00339746, 0.0029189 , 0.00441856, 0.00432891,
        0.00301087, 0.00289834, 0.00306224, 0.00280115, 0.00843672,
        0.00362272, 0.0087851 , 0.00220135, 0.03186449, 0.00351273,
        0.00229284, 0.00193523, 0.00231038, 0.00301457, 0.00504284,
        0.00247331, 0.00356893, 0.01006714, 0.0037534 , 0.00308512,
        0.00215778, 0.00354099, 0.00510278, 0.00644229, 0.00320919,
        0.00688516, 0.00258782, 0.00340923, 0.00377194, 0.00376216,
        0.00288194, 0.0046402 , 0.00268251, 0.00341314, 0.00340703,
        0.00765594, 0.00218341, 0.20301062, 0.00314634, 0.00488725,
        0.0059772 , 0.00284491, 0.00284717, 0.00331043, 0.00282219,
        0.00328953, 0.0033707 , 0.00310014, 0.00206878, 0.00246348,
        0.00243831, 0.00684916, 0.00203481, 0.00650694, 0.00358713,
        0.00343923, 0.00892618, 0.00247284, 0.00

### Валидация 

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15
)

In [49]:
clf = LogisticRegression()

In [50]:
clf.fit(X_train, y_train)

LogisticRegression()

In [51]:
clf.score(X_test, y_test)

0.16

In [59]:
scores = []

for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#     clf = LogisticRegression()
    clf = LinearSVC()
    clf.fit(X_train, y_train)
    
    scores.append(clf.score(X_test, y_test))
    
print(scores)
print(sum(scores) / len(scores))

[0.24096385542168675, 0.27710843373493976, 0.25903614457831325, 0.23493975903614459, 0.23493975903614459, 0.25301204819277107, 0.29518072289156627, 0.28313253012048195, 0.2710843373493976, 0.26506024096385544]
0.26144578313253014


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [54]:
vectorizer = TfidfVectorizer()

In [55]:
X = vectorizer.fit_transform(corpus)

In [62]:
clf = LinearSVC(
)
clf.fit(X, y)
clf.score(X, y)

0.8864734299516909