In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import string
import json
from gensim.models import KeyedVectors
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, f1_score   # evaluation
from sklearn.metrics import classification_report, cohen_kappa_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import en_core_web_sm
import matplotlib.pyplot as plt
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold #交叉验证
from sklearn.model_selection import GridSearchCV #网格搜索


In [4]:
import matplotlib
from sklearn.decomposition import PCA
import spacy # tokenize text
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

sp = spacy.load("en_core_web_sm")
pd.set_option('display.max_colwidth', -1)

In [5]:
dataset = pd.read_csv("train.csv")

In [6]:
# test using small sample to do 
sample = dataset.sample(frac=0.2,random_state = 100) #261224 rows

In [7]:
# split the data set into train and test
train_set, test_set = train_test_split(dataset, test_size=0.2, random_state=42)

In [8]:
train_set.target.value_counts()
test_set.target.value_counts()

0    245369
1    15856 
Name: target, dtype: int64

In [9]:
 #under_sampling to balance data
def under_sampling(df,percent=1):
    majority = df[df['target'] == 0]  
    minority = df[df['target'] == 1]     
    lower_data = majority.sample(n=int(percent*len(minority)),replace=False,random_state=890,axis=0)   
    return (pd.concat([lower_data,minority]))


 #over sampling to balance data
def over_sampling(df,percent=1):
#通过numpy随机选取多数样本的采样下标
    '''
    percent:多数类别下采样的数量相对于少数类别样本数量的比例
    '''
    most_data = df[df['label'] == 1]  # 多数类别的样本
    minority_data = df[df['label'] == 0]  # 少数类别的样本
    index = np.random.randint(len(most_data), size=int(percent *len(minority_data)) )
    #下采样后数据样本
    lower_data = most_data.iloc[list(index)]  # 下采样
    return(pd.concat([lower_data, minority_data]))


# text cleaning

In [12]:
# replace unicode space character with space ' '
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0']
def replace_space(text):
    for s in spaces:
        text = text.replace(s, ' ')
    return text

# clean rare words
with open('rare_words.json') as f:
    rare_words_mapping = json.load(f)
    #print(rare_words_mapping)
    
def clean_rare_words(text):
    for w in rare_words_mapping:
        if text.count(w) > 0:
            text = text.replace(w, rare_words_mapping[w])
    return text
    
# decontracted
def clean_decontracted(text):
    # specific
    text = re.sub(r"(W|w)on(\'|\’)t ", "will not ", text)
    text = re.sub(r"(C|c)an(\'|\’)t ", "can not ", text)
    text = re.sub(r"(Y|y)(\'|\’)all ", "you all ", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll ", "you all ", text)

    # general
    text = re.sub(r"(I|i)(\'|\’)m ", "i am ", text)
    text = re.sub(r"(A|a)in(\'|\’)t ", "is not ", text)
    text = re.sub(r"n(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)re ", " are ", text)
    text = re.sub(r"(\'|\’)s ", " is ", text)
    text = re.sub(r"(\'|\’)d ", " would ", text)
    text = re.sub(r"(\'|\’)ll ", " will ", text)
    text = re.sub(r"(\'|\’)t ", " not ", text)
    text = re.sub(r"(\'|\’)ve ", " have ", text)
    return text

# misspelling
with open('misspell_words.json') as f:
    misspell_words_mapping = json.load(f)
def clean_misspell(text):
    for w in misspell_words_mapping:
        if text.count(w) > 0:
            text = text.replace(w, misspell_words_mapping[w])
    return text

#replace punctuation with space
def replace_punctuation(text):
    punct = str.maketrans('','',string.punctuation)
    return text.translate(punct)

# clean repeated letters
def clean_repeat_words(text):
    text = text.replace("img", "ing")

    text = re.sub(r"(I|i)(I|i)+ng", "ing", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+y", "lly", text)
    text = re.sub(r"(A|a)(A|a)(A|a)+", "a", text)
    text = re.sub(r"(C|c)(C|c)(C|c)+", "cc", text)
    text = re.sub(r"(D|d)(D|d)(D|d)+", "dd", text)
    text = re.sub(r"(E|e)(E|e)(E|e)+", "ee", text)
    text = re.sub(r"(F|f)(F|f)(F|f)+", "ff", text)
    text = re.sub(r"(G|g)(G|g)(G|g)+", "gg", text)
    text = re.sub(r"(I|i)(I|i)(I|i)+", "i", text)
    text = re.sub(r"(K|k)(K|k)(K|k)+", "k", text)
    text = re.sub(r"(L|l)(L|l)(L|l)+", "ll", text)
    text = re.sub(r"(M|m)(M|m)(M|m)+", "mm", text)
    text = re.sub(r"(N|n)(N|n)(N|n)+", "nn", text)
    text = re.sub(r"(O|o)(O|o)(O|o)+", "oo", text)
    text = re.sub(r"(P|p)(P|p)(P|p)+", "pp", text)
    text = re.sub(r"(Q|q)(Q|q)+", "q", text)
    text = re.sub(r"(R|r)(R|r)(R|r)+", "rr", text)
    text = re.sub(r"(S|s)(S|s)(S|s)+", "ss", text)
    text = re.sub(r"(T|t)(T|t)(T|t)+", "tt", text)
    text = re.sub(r"(V|v)(V|v)+", "v", text)
    text = re.sub(r"(Y|y)(Y|y)(Y|y)+", "y", text)
    text = re.sub(r"plzz+", "please", text)
    text = re.sub(r"(Z|z)(Z|z)(Z|z)+", "zz", text)
    return text
# make text lower case
def lower_words(text):
    return text.lower()

stop_words = stopwords.words('english')
def remove_stopwords(text):
    """
    remove stop words and extra space
    params: string
    return: list
    """
    words = text.split()
    new_words = []
    for w in words:
        if w not in stop_words and w != ' ':
            new_words.append(w)
    return ' '.join(new_words)
            
def stemming(text):
    pass

#apply all the clean methods
def text_cleaning(text):
    text = replace_space(text)
    text = clean_rare_words(text)
    text = clean_decontracted(text)
    text = clean_misspell(text)
    text = replace_punctuation(text)
    text = clean_repeat_words(text)
    text = lower_words(text)
    text = remove_stopwords(text)
    return text
    
    

In [13]:
#data after cleaning
# clean_text = sample.question_text.apply(lambda x: text_cleaning(x))

In [196]:
text_cleaning('i like apples, just like hahahhhhh?')

'like apples like hahahhhhh'

In [195]:
itext = 'i like apples, just like hahahhhhh?'
for i in itext.split():
    if i in stop_words:
        print(i)

i
just


In [94]:
# keras is on top of TensorFlow
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# tokenize text 
# embed_size = 300 #词向量维度
# max_features = 95000 #设置词典大小
# max_len = 70 #设置输入的长度

# tokenizer = Tokenizer(num_words = max_features)
# tokenizer.fit_on_texts(list(train_X))
# train_X = tokenizer.texts_to_sequences(train_X)
# test_X = tokenizer.texts_to_sequences(test_X)
 
# # pad the sentences
# train_X = pad_sequences(train_X, maxlen = max_len)
# test_X = pad_sequences(test_X, maxlen = max_len)


# word embedding

In [14]:
def load_embed(typeToLoad):
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float16')
    
    if typeToLoad=="glove":
        file = 'embeddings/glove.840B.300d/glove.840B.300d.txt'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file) if len(o)>100)
    elif typeToLoad=="word2vec":
        #file = 'embeddings⁩/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin⁩'
        file = 'embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
        embeddings_index = KeyedVectors.load_word2vec_format(file, binary=True)  # query word vector from the file
    elif typeToLoad=="fasttext":
        #file = "⁨embeddings⁩/wiki-news-300d-1M⁩/wiki-news-300d-1M.vec"
        file = 'embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(file, encoding='latin'))
        
    return embeddings_index



In [15]:
def build_embedding_maxtrix2(embeddings_index, word_index, max_features=100000, emb_size=300):
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    emb_size = all_embs.shape[1]

    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, emb_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


In [16]:
def build_embedding_matrix(token_list, embeddings_index):
    matrix = []
    for tl in token_list:
        vectors = []
        for word in tl:
            if word in embeddings_index:
                vector = embeddings_index[word]
                vectors.append(vector)
        tl_vectors = np.mean(np.array(vectors), axis=0) if vectors else [0]*300
        matrix.append(tl_vectors)
    
    return np.array(matrix)


In [23]:
def vectorize(text,embeddings_index):
    text_list = text.split()
    vectors = []
    for word in text_list:
        if word in embeddings_index:
            vector = embeddings_index[word]
            vectors.append(vector)
    avg_vectors = np.mean(np.array(vectors), axis=0) if vectors else [0]*300
    return avg_vectors

In [24]:
def compute_oov_rate(text, embeddings_index):
    text_list = text.split()
    num_of_words = len(text_list)
    num_of_known_words = 0
    for word in text_list:
        if word in embeddings_index:
            num_of_known_words += 1
    oov_rate = 1 - num_of_known_words / num_of_words if num_of_words else None
    return oov_rate  

In [36]:
%%time
# embed_glove = load_embed('glove')

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [37]:
embed_word2vec = load_embed('word2vec')
embed_fasttext = load_embed('fasttext')

In [26]:
itext = 'Whyyyyyyyyy the h*ck do Brits end every sentence with "x"?'

In [38]:
vector = vectorize(itext,embed_word2vec)

# word coverage

In [53]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    num_known_words = 0
    num_unknown_words = 0
    for word in vocab.keys():
        if word in embeddings_index:
            known_words[word] = embeddings_index[word]
            num_known_words += vocab[word]

        else:
            unknown_words[word] = vocab[word]
            num_unknown_words += vocab[word]

    print('Found embeddings for {:.2%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.2%} of all text'.format(num_known_words / (num_known_words + num_unknown_words)))
#     unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

def vocab_check_coverage(texts, embed_method):

    vocab = build_vocab(texts)
    oov_words = check_coverage(vocab, embed_method)
    oov = {"oov_rate": len(oov_words) / len(vocab), 'oov_words': oov_words}
    print("oov_rate", oov["oov_rate"])
    
    return oov

# feature engineering

In [72]:
# concate_features
def concate_features(df):
    feature_matrix = []
    cnt = 0
    for row in df.iterrows():
        x = row[1]
        new_vectors = x['word_vector']
        new_vectors = np.append(new_vectors, x["oov_rate"])
        new_vectors = np.append(new_vectors, x["text_len"])
        new_vectors = np.append(new_vectors, x["clean_text_len"])
        feature_matrix.append(new_vectors)
    return feature_matrix


In [73]:
def concate_features2(vectors, *args):
    new_vectors = vectors + [*args]
    return new_vectors
           

# PCA 

In [31]:
def reduce_demension(X, n):
    """
    X: features matrix
    n: number of compoments or total explained ratio we want
    return:
    ev: explained variance of each component
    evr: explained variance ratio of each component
    """
    pca = PCA(n_components=n)
    pca.fit(X)
    ev = pca.explained_variance_
    evr = pca.explained_variance_ratio_
    return ev, evr

In [32]:
# test_set = test_set.sample(frac=0.1,random_state=100)

In [33]:
sample_train_set = under_sampling(train_set, percent=5)

In [34]:
sample_train_set['text_len'] = sample_train_set.question_text.apply(lambda x: len(x.split()))

In [55]:
sample_train_set["clean_text"] = sample_train_set.question_text.apply(lambda x: text_cleaning(x))

In [56]:
sample_train_set['clean_text_len'] = sample_train_set.clean_text.apply(lambda x: len(x.split()))

In [63]:
sample_train_set["word_vector"] = sample_train_set.clean_text.apply(lambda x:vectorize(x,embed_glove ))

In [64]:
sample_train_set["oov_rate"]  = sample_train_set.clean_text.apply(lambda x:compute_oov_rate(x, embed_glove))

In [65]:
# standardlize
# text_len_mean = np.mean(sample_train_set["text_len"])
# text_len_std = np.std(sample_train_set["text_len"])
# sample_train_set["text_len_standard"] = sample_train_set['text_len'].apply(lambda x: x - text_len_mean/ text_len_std)
# sample_train_set["clean_text_len_standard"] = sample_train_set['clean_text_len'].apply(lambda x: x - np.mean(x)/ np.std(x))


In [74]:
# train feature matrix
train_matrix = concate_features(sample_train_set)

In [None]:
######################Word Embedding############

In [60]:
%%time
#word coverage
oov_words_glove = vocab_check_coverage(sample_train_set.clean_text, embed_glove)

Found embeddings for 64.96% of vocab
Found embeddings for  97.83% of all text
oov_rate 0.35042372535745453
CPU times: user 1.71 s, sys: 1.15 s, total: 2.86 s
Wall time: 4.61 s


In [61]:
%%time
oov_words_word2vec = vocab_check_coverage(sample_train_set.clean_text, embed_word2vec)

Found embeddings for 45.24% of vocab
Found embeddings for  93.52% of all text
oov_rate 0.5476130274136848
CPU times: user 1.77 s, sys: 3.78 s, total: 5.55 s
Wall time: 14.3 s


In [62]:
%%time
oov_words_fasttext = vocab_check_coverage(sample_train_set.clean_text, embed_fasttext)

Found embeddings for 53.48% of vocab
Found embeddings for  96.51% of all text
oov_rate 0.46517621917897606
CPU times: user 1.73 s, sys: 150 ms, total: 1.88 s
Wall time: 1.99 s


In [None]:
###########################

In [79]:
# test data preprocess
test_set['text_len'] = test_set.question_text.apply(lambda x: len(x.split()))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [82]:
test_set["clean_text"] = test_set.question_text.apply(lambda x: text_cleaning(x))
test_set['clean_text_len'] = test_set.clean_text.apply(lambda x: len(x.split()))
test_set["oov_rate"]  = test_set.clean_text.apply(lambda x:compute_oov_rate(x, embed_glove))
test_set["word_vector"] = test_set.clean_text.apply(lambda x:vectorize(x, embed_glove))
test_matrix = concate_features(test_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [83]:
np.shape(test_matrix)

(261225, 303)

In [276]:
np.shape(y_test)

(5224,)

In [86]:
test_matrix[10][301]

5.0

(261225,)

# Evaluation

In [94]:
def evaluate_models(y_true, y_pred):
    print(classification_report(y_test,y_pred))
    print('confusion_matrix(0,1):')
    print(confusion_matrix(y_test,y_pred))
    print('cohen_kappa_score:', cohen_kappa_score(y_test,y_pred))   


# modeling

In [87]:
kflod = StratifiedKFold(n_splits=10, shuffle = True,random_state=7)#将训练/测试数据集划分10个互斥子集，

def find_best_model(model, param_grid, X_train, Y_train, X_test, Y_test):
    grid_search = GridSearchCV(model,param_grid,scoring = 'f1',n_jobs = -1,cv = kflod)
    #scoring指定损失函数类型，n_jobs指定全部cpu跑，cv指定交叉验证
    grid_search.fit(X_train, Y_train) #运行网格搜索
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    print(grid_search.cv_results_)
    y_pred = grid_search.predict(X_test)
    evaluate_models(Y_test, y_pred)
    return y_pred


In [88]:
def define_model(model, X_train, Y_train, X_test, Y_test):
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_test)
    evaluate_models(Y_test, Y_predict)
    return Y_predict

In [91]:
X_train = np.nan_to_num(train_matrix)
y_train = sample_train_set.target
X_test = np.nan_to_num(test_matrix)
y_test = test_set.target
# model = LogisticRegression(solver = 'saga')

### Logistic Regression

In [90]:
%%time
model = LogisticRegression(solver = 'saga')
define_model(model,X_train, y_train, X_test, y_test)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [92]:

Y_predict = model.predict(X_test)

In [96]:
evaluate_models(y_test, Y_predict)

              precision    recall  f1-score   support

           0       0.97      0.96      0.97    245369
           1       0.49      0.60      0.54     15856

    accuracy                           0.94    261225
   macro avg       0.73      0.78      0.75    261225
weighted avg       0.94      0.94      0.94    261225

confusion_matrix(0,1):
[[235581   9788]
 [  6312   9544]]
cohen_kappa_score: 0.5097613535232608


### knn

In [None]:
# %%time
# model = KNeighborsClassifier(n_neighbors=2)
# define_model(model,X_train, y_train, X_test, y_test)

### SVM

In [None]:
%%time
model = SVC(kernel='linear', C=1)
define_model(model,X_train, y_train, X_test, y_test)

In [None]:
#grid search 
# model = SVC()
# param_grid = [
# #     {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]}, 
#     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}
#              ]
# # find_best_model(model, param_grid, X_train, y_train, X_test, y_test)
# grid_search = GridSearchCV(model,param_grid,scoring = 'f1',n_jobs = -1,cv = kflod)
# #scoring指定损失函数类型，n_jobs指定全部cpu跑，cv指定交叉验证
# grid_search.fit(X_train, y_train) #运行网格搜索
# print(grid_search.best_params_)
# print(grid_search.best_score_)
# print(grid_search.cv_results_)
# y_pred = grid_search.predict(X_test)
# evaluate_models(y_test, y_pred)

### NB

In [296]:
%%time
model = GaussianNB()
define_model(model,X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.98      0.77      0.86      4900
           1       0.19      0.81      0.31       324

    accuracy                           0.77      5224
   macro avg       0.59      0.79      0.59      5224
weighted avg       0.93      0.77      0.83      5224

confusion_matrix(0,1):
[[3777 1123]
 [  62  262]]
cohen_kappa_score: 0.22911818384685267
CPU times: user 348 ms, sys: 37 ms, total: 385 ms
Wall time: 116 ms


array([1, 0, 0, ..., 0, 0, 0])

### Decision Tree

In [297]:
%%time
model = DecisionTreeClassifier() #default=”gini”
define_model(model,X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.96      0.89      0.92      4900
           1       0.21      0.44      0.28       324

    accuracy                           0.86      5224
   macro avg       0.58      0.66      0.60      5224
weighted avg       0.91      0.86      0.88      5224

confusion_matrix(0,1):
[[4351  549]
 [ 181  143]]
cohen_kappa_score: 0.215190748704305
CPU times: user 2.5 s, sys: 14.9 ms, total: 2.51 s
Wall time: 2.53 s


array([0, 0, 0, ..., 0, 0, 0])

### Random Forest

In [298]:
%%time
model = RandomForestClassifier(n_estimators=100, random_state=0)
define_model(model,X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.96      0.99      0.97      4900
           1       0.66      0.37      0.47       324

    accuracy                           0.95      5224
   macro avg       0.81      0.68      0.72      5224
weighted avg       0.94      0.95      0.94      5224

confusion_matrix(0,1):
[[4840   60]
 [ 205  119]]
cohen_kappa_score: 0.4488310612816919
CPU times: user 7.88 s, sys: 27.3 ms, total: 7.91 s
Wall time: 7.92 s


array([0, 0, 0, ..., 0, 0, 0])

In [None]:
x = sample_train_set[sample_train_set['oov_rate']<0.2]['oov_rate']


In [None]:
# plot histogram
#sample_train_set['text_len']
# x = sample_train_set[sample_train_set['oov_rate']<0.2]['oov_rate']
mu = np.mean(x)
sigma = np.std(x)

num_bins = 50

fig, ax = plt.subplots()

# the histogram of the data
n, bins, patches = ax.hist(x, num_bins, density=1, color='#fcb43e')

# add a 'best fit' line
# y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *
#      np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
# ax.plot(bins)
ax.set_xlabel('oov_rate')
ax.set_ylabel('Probability density')
# ax.set_title(r'')

# Tweak spacing to prevent clipping of ylabel
fig.tight_layout()
plt.savefig('oov_rate.png')
plt.show()



In [2]:
x

NameError: name 'x' is not defined

In [None]:
a = 1