In [18]:
!mv kaggle.json /root/.kaggle/kaggle.json

mv: cannot stat 'kaggle.json': No such file or directory


In [19]:
!kaggle competitions download -c quora-insincere-questions-classification

Downloading train.csv.zip to /content
 76% 42.0M/54.9M [00:00<00:00, 49.3MB/s]
100% 54.9M/54.9M [00:00<00:00, 86.5MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/4.09M [00:00<?, ?B/s]
100% 4.09M/4.09M [00:00<00:00, 67.0MB/s]
Downloading embeddings.zip to /content
100% 5.95G/5.96G [01:26<00:00, 54.2MB/s]
100% 5.96G/5.96G [01:26<00:00, 74.1MB/s]
Downloading test.csv.zip to /content
 44% 7.00M/15.8M [00:00<00:00, 40.1MB/s]
100% 15.8M/15.8M [00:00<00:00, 62.5MB/s]


In [13]:
!unzip embeddings.zip -d embeddings

Archive:  embeddings.zip
   creating: embeddings/GoogleNews-vectors-negative300/
   creating: embeddings/glove.840B.300d/
   creating: embeddings/paragram_300_sl999/
   creating: embeddings/wiki-news-300d-1M/
  inflating: embeddings/glove.840B.300d/glove.840B.300d.txt  
  inflating: embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin  
  inflating: embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec  
  inflating: embeddings/paragram_300_sl999/README.txt  
  inflating: embeddings/paragram_300_sl999/paragram_300_sl999.txt  


In [4]:
import pandas as pd
import string
import json
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt

#nltk.download('stopwords')
pd.set_option('display.float_format', lambda x: '%.6f' % x)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Utils


In [2]:
def under_sampling(df, percent=1, random_seed=11):
    majority = df[df['target'] == 0]
    minority = df[df['target'] == 1]
    lower_data = majority.sample(n=int(percent * len(minority)), replace=False, random_state=random_seed, axis=0)
    return (pd.concat([lower_data, minority]))

def statistic_features(df_X, embedding):
  df_X["text_len"] = df_X["question_text"].apply(lambda x: len(x.split()))
  df_X["clean_text"] = df_X["question_text"].apply(lambda x: text_cleaning(x))
  df_X['clean_text_len'] = df_X["clean_text"].apply(lambda x: len(x.split()))
  df_X["oov_rate"] = df_X["clean_text"].apply(lambda x: compute_oov_rate(x, embedding))
  return df_X

def concate_features(df):
    feature_matrix = []
    for row in df.iterrows():
        x = row[1]
        new_vectors = x['word_vector']
        new_vectors = np.append(new_vectors, x["oov_rate"])
        new_vectors = np.append(new_vectors, x["text_len"])
        new_vectors = np.append(new_vectors, x["clean_text_len"])
        feature_matrix.append(new_vectors)
    return pd.DataFrame(feature_matrix)


def vectorize(text, embeddings_index, max_len=20):
    text_list = text.split()
    init_vector = [0]*300
    if len(text_list) >= max_len:
        text_list = text_list[:max_len]
    else:
        pad_len = max_len - len(text_list)
        for _ in range(pad_len):
            text_list.append("<PAD>")
    vectors = []
    for word in text_list:
        if word == "<PAD>":
            vectors.append(init_vector)
        elif word in embeddings_index:
            vectors.append(embeddings_index[word])
        else:
            vectors.append(init_vector)
    vectors = np.mean(vectors, axis=0)
    return vectors

def compute_oov_rate(text, embeddings_index):
    text_list = text.split()
    num_of_words = len(text_list)
    num_of_known_words = 0
    for word in text_list:
        if word in embeddings_index:
            num_of_known_words += 1
    oov_rate = 1 - num_of_known_words / num_of_words if num_of_words else None
    return oov_rate

### Text cleaning

In [5]:
%%time
# replace unicode space character with space ' '
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def replace_space(text):
    for s in spaces:
        text = text.replace(s, ' ')
    return text

# clean rare words
# with open('rare_words.json') as f:
#     rare_words_mapping = json.load(f)
def clean_rare_words(text):
    for w in rare_words_mapping:
        if text.count(w) > 0:
            text = text.replace(w, rare_words_mapping[w])
    return text

def clean_decontracted(text):
    # specific
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)

    # general
    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)in(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)s ", " is ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)
    return text


# with open('misspell_words.json') as f:
#     misspell_words_mapping = json.load(f)
def clean_misspell(text):
    for w in misspell_words_mapping:
        if text.count(w) > 0:
            text = text.replace(w, misspell_words_mapping[w])
    return text


# replace punctuation with space
def replace_punctuation(text):
    punct = str.maketrans('', '', string.punctuation)
    return text.translate(punct)


# clean repeated letters
def clean_repeat_words(text):
    text = text.replace("img", "ing")

    text = re.sub(r"(I|i)(I|i)+ng", "ing", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+y", "lly", text)
    text = re.sub(r"(A|a)(A|a)(A|a)+", "a", text)
    text = re.sub(r"(C|c)(C|c)(C|c)+", "cc", text)
    text = re.sub(r"(D|d)(D|d)(D|d)+", "dd", text)
    text = re.sub(r"(E|e)(E|e)(E|e)+", "ee", text)
    text = re.sub(r"(F|f)(F|f)(F|f)+", "ff", text)
    text = re.sub(r"(G|g)(G|g)(G|g)+", "gg", text)
    text = re.sub(r"(I|i)(I|i)(I|i)+", "i", text)
    text = re.sub(r"(K|k)(K|k)(K|k)+", "k", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+", "ll", text)
    text = re.sub(r"(M|m)(M|m)(M|m)+", "mm", text)
    text = re.sub(r"(N|n)(N|n)(N|n)+", "nn", text)
    text = re.sub(r"(O|o)(O|o)(O|o)+", "oo", text)
    text = re.sub(r"(P|p)(P|p)(P|p)+", "pp", text)
    text = re.sub(r"(Q|q)(Q|q)+", "q", text)
    text = re.sub(r"(R|r)(R|r)(R|r)+", "rr", text)
    text = re.sub(r"(S|s)(S|s)(S|s)+", "ss", text)
    text = re.sub(r"(T|t)(T|t)(T|t)+", "tt", text)
    text = re.sub(r"(V|v)(V|v)+", "v", text)
    text = re.sub(r"(Y|y)(Y|y)(Y|y)+", "y", text)
    text = re.sub(r"plzz+", "please", text)
    text = re.sub(r"(Z|z)(Z|z)(Z|z)+", "zz", text)
    return text


def lower_words(text):
    return text.lower()

stop_words = stopwords.words('english')
def remove_stopwords(text):
    """
    remove stop words and extra space
    params: string
    return: list
    """
    words = text.split()
    new_words = []
    for w in words:
        if w not in stop_words and w != ' ':
            new_words.append(w)
    return ' '.join(new_words)


def stemming(text):
    pass

# apply all the clean methods
def text_cleaning(text):
    text = replace_space(text)
    #text = clean_rare_words(text)
    text = clean_decontracted(text)
    #text = clean_misspell(text)
    text = replace_punctuation(text)
    text = clean_repeat_words(text)
    text = lower_words(text)
    text = remove_stopwords(text)
    return text


CPU times: user 1.46 ms, sys: 0 ns, total: 1.46 ms
Wall time: 1.47 ms


### Word embedding

In [6]:
def load_embed(typeToLoad):
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float16')

    if typeToLoad == "glove":
        file = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o) > 100)
    elif typeToLoad == "word2vec":
        # file = 'embeddings⁩/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin⁩'
        file = 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
        embeddings_index = KeyedVectors.load_word2vec_format(file, binary=True)  # query word vector from the file
    elif typeToLoad == "fasttext":
        # file = "⁨embeddings⁩/wiki-news-300d-1M⁩/wiki-news-300d-1M.vec"
        file = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))

    return embeddings_index

### Model

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV 

In [8]:
def evaluate_models(y_true, y_pred):
    f1 = f1_score(y_true, y_pred)
    cks = cohen_kappa_score(y_true, y_pred)
    print("f1_score", f1)
    print('cohen_kappa_score:', cks)
    return f1, cks

def find_best_model(model, param_grid, X_train, Y_train, kfold):
    grid_search = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1, cv=kfold)# scoring指定损失函数类型，n_jobs指定全部cpu跑，cv指定交叉验证
    grid_search.fit(X_train, Y_train)  
    print("best_params_", grid_search.best_params_)
    print("best_score_", grid_search.best_score_)
    print("cv_results_", grid_search.cv_results_)
    return grid_search

def build_cv_model(model, train_x, y, kfold):
    k = 0
    results = []
    for train_index, test_index in kfold.split(train_x, y):
        k += 1
        print("--- cv", k)
        X_train, y_train = train_x.iloc[train_index], y[train_index]
        X_test, y_test = train_x.iloc[test_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1, cks = evaluate_models(y_test, y_pred)
        results.append([f1,cks])
    avg_f1, avg_cks = np.array(results).mean(axis=0) 
    print("Average cv score, f1 {}, cohen_kappa_score {}".format(avg_f1, avg_cks))
    return results

# PCA
def reduce_demension(X, n):
    pca = PCA(n_components=n)
    newX = pca.fit_transform(X)
    print("pca.explained_variance_", pca.explained_variance_)
    print("pca.explained_variance_ratio_", pd.DataFrame(pca.explained_variance_ratio_))
    print("total variance ratio", sum(pca.explained_variance_ratio_))
    return newX

### Training

In [9]:
print("Load embedding")
embedding = load_embed("fasttext")

Load embedding


In [10]:
data_path = "./"
print("Load training data")
train = pd.read_csv(data_path +"train.csv.zip")

Load training data


In [11]:
X, y = train["question_text"], train["target"]
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.1, random_state=42)

sample_train = under_sampling(pd.concat([train_x, train_y], axis=1), percent=2, random_seed=11)
#sample_train = train.sample(frac=0.1, random_state=100)  # 261224 rows
print("sample size", sample_train.shape)

sample_train.reset_index(drop=True, inplace=True)
train_x, y = sample_train["question_text"], sample_train["target"]

print("Text cleaning")
train_x = train_x.apply(text_cleaning)

print("Text vectorizing")
train_x = train_x.apply(lambda text: vectorize(text, embedding, max_len=20))
train_x = pd.DataFrame(train_x.array)

sample size (218835, 2)
Text cleaning
Text vectorizing


In [12]:
#preprocess testing data
test_x = test_x.apply(text_cleaning)
test_x = test_x.apply(lambda text: vectorize(text, embedding, max_len=20))
test_x = pd.DataFrame(test_x.array)

In [13]:
kfold = StratifiedKFold(n_splits=5) 

#logistic regression
lr = LogisticRegression(solver='saga')
res = build_cv_model(lr, train_x, y, kfold)

print("Predicting on testing data")
lr.fit(train_x, y)
pred_y = lr.predict(test_x)
evaluate_models(test_y, pred_y)

--- cv 1
f1_score 0.7832132011269255
cohen_kappa_score: 0.6854483356633456
--- cv 2
f1_score 0.7793799302880206
cohen_kappa_score: 0.6803466057094254
--- cv 3
f1_score 0.7852823315118398
cohen_kappa_score: 0.6877516422970968
--- cv 4
f1_score 0.78580546336482
cohen_kappa_score: 0.6886937470183931
--- cv 5
f1_score 0.7789589847342285
cohen_kappa_score: 0.6801625179639126
Average cv score, f1 0.782527982205167, cohen_kappa_score 0.6844805697304347
Predicting on testing data
f1_score 0.5194793536804309
cohen_kappa_score: 0.4788738760899417


(0.5194793536804309, 0.4788738760899417)

In [15]:
#naive bayes
nb = GaussianNB()
res = build_cv_model(nb, train_x, y, kfold)

print("\n=====================")
print("Predicting on testing data")
nb.fit(train_x, y)
pred_y = nb.predict(test_x)
evaluate_models(test_y, pred_y)

--- cv 1
f1_score 0.5802205749412404
cohen_kappa_score: 0.38720461703587705
--- cv 2
f1_score 0.5688233370872234
cohen_kappa_score: 0.37206595538312326
--- cv 3
f1_score 0.5814449383786909
cohen_kappa_score: 0.388833183809172
--- cv 4
f1_score 0.5802877697841727
cohen_kappa_score: 0.38565762224016287
--- cv 5
f1_score 0.5762369086312749
cohen_kappa_score: 0.3809875501160582
Average cv score, f1 0.5774027057645205, cohen_kappa_score 0.3829497857168787

Predicting on testing data
f1_score 0.2563950558733734
cohen_kappa_score: 0.18073046944050508


(0.2563950558733734, 0.18073046944050508)