### comparison

In [1]:
import numpy as np
import pandas as pd

In [2]:
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")

### countvectorizer vs. tfidfvectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = list(train_df.question_text)
cvectorizer = CountVectorizer()
XLc = cvectorizer.fit_transform(corpus)

In [5]:
from sklearn.linear_model import LogisticRegression
lgc = LogisticRegression(C=1.0)
lgc.fit(XLc, list(train_df.target))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
cvectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvectorizer = TfidfVectorizer()
XLt = tvectorizer.fit_transform(corpus)

In [8]:
lgt = LogisticRegression(C=1.0)
lgt.fit(XLt, list(train_df.target))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
tvectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

### tfidf logistic coefficient

In [10]:
tfeatures = tvectorizer.get_feature_names()

In [11]:
lgt_scores = list(zip(tfeatures, lgt.coef_[0]))

In [12]:
tscore_dict = dict(lgt_scores)

### countvectorizer logistic coefficient

In [13]:
cfeatures = cvectorizer.get_feature_names()

In [14]:
lgc_scores = list(zip(cfeatures, lgc.coef_[0]))

In [15]:
cscore_dict = dict(lgc_scores)

### scoring difference

In [16]:
import nltk
from nltk.corpus import stopwords

In [17]:
stopwords = set(stopwords.words('english'))

In [19]:
def scoring_words(vect_method='tfidf', query='Has the United States become the largest dictatorship in the world?'):
    # preprocessing
    from nltk import word_tokenize
    tokens = word_tokenize(query.lower())
    print(tokens)
    
    if vect_method == 'tfidf':
        b = tscore_dict
    else:
        b = cscore_dict
        
    scores = [b[t] if (t in b) and (t not in stopwords) else 0 for t in tokens]
    
    print(scores)

    import numpy as np
    arr = np.array(scores)
    indices = [s for s in arr.argsort()[-5:][::-1] if s > 1.0]
    # uncomment the following to change 
#     print(scores)
#     indices = [scores.index(ii) for ii in scores if (ii >= 1.0) and (tokens[scores.index(ii)] not in stopwords)]
    
    words = list(set([tokens[i] for i in indices]))
    print(words)

    ans = []
    for ind, ii in enumerate(query.lower().split()):
        for jj in words:
            if jj in ii and len(ii) - len(jj) <= 1:
                ans.append(ind)
    return ans

In [20]:
scoring_words(vect_method='tfidf', query="Why don't poor countries print more money to use for paying for education, etc.?")

['why', 'do', "n't", 'poor', 'countries', 'print', 'more', 'money', 'to', 'use', 'for', 'paying', 'for', 'education', ',', 'etc', '.', '?']
[0, 0, 0, 1.9050414062898482, 0.08918326712227044, -0.90463146657921123, 0, 0.75202372081650515, 0, -0.40337277627113183, 0, 0.66426794313579152, 0, -0.072099506105499991, 0, 1.2447212208436367, 0, 0]
['money', 'etc', 'paying', 'countries', 'poor']


[2, 3, 6, 10]

In [21]:
scoring_words(vect_method='count', query="Why don't poor countries print more money to use for paying for education, etc.?")

['why', 'do', "n't", 'poor', 'countries', 'print', 'more', 'money', 'to', 'use', 'for', 'paying', 'for', 'education', ',', 'etc', '.', '?']
[0, 0, 0, 0.64791479878555613, -0.044814572523616994, -0.44811434626675867, 0, 0.24047668307504766, 0, -0.076402267102584034, 0, 0.21851726703762406, 0, -0.023666621953956558, 0, -0.39314092549589297, 0, 0]
['money', 'paying', '.', 'poor', '?']


[2, 6, 10]

In [22]:
scoring_words(vect_method='tfidf', query="Why don't USA citizens realize that Trump is rapidly doing what terrorists could not, i.e., push the country towards irrevocable catastrophe?")

['why', 'do', "n't", 'usa', 'citizens', 'realize', 'that', 'trump', 'is', 'rapidly', 'doing', 'what', 'terrorists', 'could', 'not', ',', 'i.e.', ',', 'push', 'the', 'country', 'towards', 'irrevocable', 'catastrophe', '?']
[0, 0, 0, 1.9629288282951698, 2.2265968115514649, 3.2391450545783962, 0, 5.694714780064527, 0, 0.4230896111986327, 0, 0, 4.0264540375332336, -0.67789223151347455, 0, 0, 0, 0, 0.88831637999819757, 0, 1.2667648593983192, 0.85720849626362028, 0.03976211550709665, -0.15962684893709167, 0]
['citizens', 'usa', 'trump', 'realize', 'terrorists']


[2, 3, 4, 6, 11]

In [23]:
scoring_words(vect_method='count', query="Why don't USA citizens realize that Trump is rapidly doing what terrorists could not, i.e., push the country towards irrevocable catastrophe?")

['why', 'do', "n't", 'usa', 'citizens', 'realize', 'that', 'trump', 'is', 'rapidly', 'doing', 'what', 'terrorists', 'could', 'not', ',', 'i.e.', ',', 'push', 'the', 'country', 'towards', 'irrevocable', 'catastrophe', '?']
[0, 0, 0, 0.61731510387052568, 0.64162569671886782, 0.97711021199434023, 0, 1.4349582961003773, 0, 0.24509268053202549, 0, 0, 1.5607103828349602, -0.11215263541423476, 0, 0, 0, 0, 0.2323969184874197, 0, 0.3755065649627764, -0.036098490919582615, -0.01900937913138941, -0.29710712730296779, 0]
['citizens', 'usa', 'trump', 'realize', 'terrorists']


[2, 3, 4, 6, 11]

In [24]:
scoring_words(vect_method='tfidf', query="How can you tell the difference between a Russian Internet troll and an American one?")

['how', 'can', 'you', 'tell', 'the', 'difference', 'between', 'a', 'russian', 'internet', 'troll', 'and', 'an', 'american', 'one', '?']
[0, 0, 0, -0.091157046460830088, 0, -1.0831477290627616, 0, 0, 2.8571626483781083, -0.21569446989575794, 3.6278528915119521, 0, 0, 2.7549586725459432, -0.3507462133621001, 0]
['an', 'troll', 'russian', 'american', '?']


[1, 8, 10, 11, 12, 13]

In [25]:
scoring_words(vect_method='count', query="How can you tell the difference between a Russian Internet troll and an American one?")

['how', 'can', 'you', 'tell', 'the', 'difference', 'between', 'a', 'russian', 'internet', 'troll', 'and', 'an', 'american', 'one', '?']
[0, 0, 0, -0.061350640038029175, 0, -0.29270115514699058, 0, 0, 0.88018751839839993, -0.051015757369231488, 1.2272879540297057, 0, 0, 0.72268028338740065, -0.051562626526923325, 0]
['an', 'troll', 'russian', 'american', '?']


[1, 8, 10, 11, 12, 13]