In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
import re, string
from nltk.stem import PorterStemmer

In [2]:
train = pd.read_csv('train.csv')

In [3]:
# Choose 10% data as validation data
validation = train.sample(frac=0.1, replace=False, weights=None, random_state=None, axis=None)
validation.head()

Unnamed: 0,qid,question_text,target
844150,a56de6e0ae64511bdbf3,"I told my daughter not to dye her hair, but sh...",0
276996,36341b11c3b1a57f56fc,How can you count the values of an array in PHP?,0
119963,177d148141bc3f28c390,How would you describe the personality of the ...,0
464687,5afe3a144a2612fe2cf0,Does Dendula surname belongs to Brahmins?,0
226279,2c3ebc52dc1fccd29ec5,What should a bill of sale look like?,0


In [4]:
train = train[~train.index.isin(validation.index)]
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [5]:
# cleaning insignificant symbols
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’""])')

In [6]:
# Tokenize function without stemming
def tokenize_nonstem(s): 
    s= re.sub('[^a-zA-Z0-9]'," ",s)
    tokens=re_tok.sub(r' \1',s).split()
    return tokens

In [7]:
# Tokenize function with stemming
def tokenize_stem(s): 
    s= re.sub('[^a-zA-Z0-9]'," ",s)
    tokens=re_tok.sub(r' \1',s).split()
    for i in range(len(tokens)):
        tokens[i]=PorterStemmer().stem(tokens[i])
    return tokens

In [8]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize_nonstem,
                     min_df=3, max_df=0.9, strip_accents='unicode',
                     use_idf=1, smooth_idf=1, sublinear_tf=1)

In [9]:
# Calculate TF-IDF weight
trn_term_doc = vec.fit_transform(train['question_text'])
val_term_doc = vec.transform(validation['question_text'])

In [10]:
x = trn_term_doc
x_val = val_term_doc

In [11]:
# Generate a Logistic Regression model
def get_model(y):
    y = y.values
    m = LogisticRegression(C=4, dual=False,solver='liblinear')    
    return m.fit(x, y)

In [12]:
preds = np.zeros((len(validation), 1))
labels_cols=['target']

In [13]:
# for i, j in enumerate(labels_cols):
#     print('fit', j)
m= get_model(train['target'])
preds = m.predict_proba(x_val)[:,1]

In [14]:
for i in range(len(preds)):
    if preds[i] > 0.5:
        preds[i]=1
    else:
        preds[i]=0

In [15]:
# Calculate precision
val=validation['target'].values
val.reshape(preds.shape)
temp=0
for i in range(len(val)):
    if preds[i]==val[i]:
        temp+=1
precision = temp/len(val)
precision

0.955825974174598

In [16]:
# Calculate F1 score
b=np.sum(preds)
a = np.sum(val)
c=0
for i in range(len(val)):
    if preds[i]==1 and val[i]==1:
        c+=1
precision = c / b
recall = c / a
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.577063550036523

In [17]:
n = train.shape[0]
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize_stem,
                     min_df=3, max_df=0.9, strip_accents='unicode',
                     use_idf=1, smooth_idf=1, sublinear_tf=1)

In [18]:
trn_term_doc = vec.fit_transform(train['question_text'])
val_term_doc = vec.transform(validation['question_text'])

In [19]:
x = trn_term_doc
x_val = val_term_doc

In [20]:
preds = np.zeros((len(validation), 1))
labels_cols=['target']
m= get_model(train['target'])
preds = m.predict_proba(x_val)[:,1]

In [21]:
for i in range(len(preds)):
    if preds[i] > 0.5:
        preds[i]=1
    else:
        preds[i]=0

In [22]:
# Calculate precision
val=validation['target'].values
val.reshape(preds.shape)
temp=0
for i in range(len(val)):
#     print(preds[i],val[i])
    if preds[i]==val[i]:
        temp+=1
precision = temp/len(val)
precision

0.955988098189933

In [23]:
# Calculate F1 score
b=np.sum(preds)
a = np.sum(val)
c=0
for i in range(len(val)):
    if preds[i]==1 and val[i]==1:
        c+=1
precision = c / b
recall = c / a
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.5785772988768149