In [12]:
import sqlite3
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Load the non-programming examples: the first tab-separated field of every
# row in dialogues.tsv. These become the negative class (label 0) further down.
#
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data source.
with open('../data/source/dialogues.tsv', encoding='utf-8') as f:
    # The first line is discarded (presumably a header row -- verify).
    next(f, None)
    # Strip the line terminator before splitting so that a row without any
    # tab does not keep its trailing newline as part of the text.
    non_programming = [line.rstrip('\n').split('\t', 1)[0] for line in f]

In [3]:
# Sanity check: number of non-programming examples read from the TSV file.
len(non_programming)

219574

In [5]:
# Open the SQLite database that holds the posts and create a cursor for queries.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
# NOTE(review): the connection is never closed in the visible cells.
con = sqlite3.connect('D:/SNLP data/posts.db')
cur = con.cursor()

In [6]:
# Programming examples: titles of posts whose score is above 5. These become
# the positive class (label 1) further down.
#
# Rows with a NULL title are excluded in SQL: sqlite3 returns NULL as None,
# and a None entry would break the text vectoriser that consumes this list.
programming = [
    title
    for (title,) in cur.execute(
        'select title from Posts where score > 5 and title is not null;'
    )
]

In [7]:
# Sanity check: number of programming titles returned by the query.
len(programming)

1005508

In [8]:
# Target vector in the same order the texts are concatenated below:
# one 0 per non-programming example, followed by one 1 per programming title.
negative_labels = [0 for _ in non_programming]
positive_labels = [1 for _ in programming]
labels = np.array(negative_labels + positive_labels)

In [9]:
# All texts in one list: non-programming examples first, then programming
# titles, matching the order of `labels`.
dataset = [*non_programming, *programming]

In [10]:
# Drop the two source lists now that `dataset` holds every text; this only
# removes the names, the strings themselves stay referenced by `dataset`.
del non_programming
del programming

In [13]:
# Hold out part of the data for evaluation (default test size).
# - random_state pins the shuffle so the split, and therefore the reported
#   score, is reproducible across kernel restarts.
# - stratify keeps the class ratio identical in both splits; the two classes
#   differ considerably in size.
train_x, test_x, train_y, test_y = train_test_split(
    dataset,
    labels,
    stratify=labels,
    random_state=42,
)

In [14]:
# TF-IDF vectoriser used for the held-out evaluation run.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # single words and two-word sequences
    min_df=5,             # lower document-frequency cut-off
    max_df=0.95,          # upper document-frequency cut-off
    max_features=5000,    # cap on vocabulary size
)

In [15]:
# Fit the vectoriser on the training texts only and vectorise them in the same call.
train_tfidf = tfidf.fit_transform(train_x)

In [16]:
# Vectorise the held-out texts with the already-fitted vectoriser (no refit).
test_tfidf = tfidf.transform(test_x)

In [17]:
# Logistic-regression classifier created with no arguments, i.e. library defaults.
lr = LogisticRegression()

In [18]:
# Train on the TF-IDF features of the training split; the value of this
# expression is what the cell displays.
lr.fit(train_tfidf, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# ROC AUC on the held-out split, scored with the second probability column
# (the probability assigned to label 1, the programming class).
positive_scores = lr.predict_proba(test_tfidf)[:, 1]
roc_auc_score(test_y, positive_scores)

0.9993398358388313

In [20]:
from sklearn.pipeline import make_pipeline

In [21]:
# Final model: a fresh, unfitted vectoriser and classifier with the same
# settings as the evaluated `tfidf` / `lr` pair, bundled so raw text goes in
# and a class label comes out.
final_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.95,
    max_features=5000,
)
final_classifier = LogisticRegression()
pipeline = make_pipeline(final_vectorizer, final_classifier)

In [22]:
# Refit on every available example (training split followed by test split),
# keeping texts and labels in the same order.
all_texts = [*train_x, *test_x]
all_labels = np.concatenate([train_y, test_y])
pipeline.fit(all_texts, all_labels)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [23]:
# Smoke test: a few conversational phrases followed by a few programming-style
# queries, each classified on its own by the fitted pipeline.
phrases = [
    'hello',
    'what is your name?',
    'who are you?',
    'Are you a bot?',
    'How to delete a table?',
    'python list comprehention',
    'c# windows forms',
    'linux download file',
]
for phrase in phrases:
    predicted = pipeline.predict([phrase])
    # Prints "<phrase> :  <prediction>" (one space, colon, two spaces).
    print(phrase, predicted, sep=' :  ')



hello :  [0]
what is your name? :  [0]
who are you? :  [0]
Are you a bot? :  [0]
How to delete a table? :  [1]
python list comprehention :  [1]
c# windows forms :  [1]
linux download file :  [1]


In [24]:
# Persist the fitted pipeline (vectoriser + classifier) as a single pickle file.
model_path = '../data/train/intent_classifier.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(pipeline, model_file)