In [1]:
import pandas as pd
import os
import distance  
import Levenshtein
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from numba import jit
from sklearn import metrics
from sklearn.model_selection import KFold

In [2]:
# !pip install distance
!pip install python-Levenshtein



You are using pip version 9.0.1, however version 22.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## 加载数据

In [3]:
# Both question-pair files are tab-separated and have no header row,
# so the column names are attached right after reading.
train = (
    pd.read_csv('data/train.csv', sep='\t', header=None)
    .set_axis(['q1', 'q2', 'label'], axis=1)
)
# The test file has no label column; a constant placeholder is added so that
# train and test share the same columns.
test = (
    pd.read_csv('data/test.csv', sep='\t', header=None)
    .set_axis(['q1', 'q2'], axis=1)
    .assign(label=1)
)
sample_submit = pd.read_csv('data/sample_submit.csv')

In [4]:
train.head()

Unnamed: 0,q1,q2,label
0,有哪些女明星被潜规则啦,哪些女明星被潜规则了,1
1,怎么支付宝绑定银行卡？,银行卡怎么绑定支付宝,1
2,请问这部电视剧叫什么名字,请问谁知道这部电视剧叫什么名字,1
3,泰囧完整版下载,エウテルペ完整版下载,0
4,在沧州市区哪家卖的盐焗鸡好吃？,沧州饭店哪家便宜又好吃又实惠,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q1      5000 non-null   object
 1   q2      5000 non-null   object
 2   label   5000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q1      5000 non-null   object
 1   q2      5000 non-null   object
 2   label   5000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [7]:
train['label'].value_counts(normalize=True)

1    0.5784
0    0.4216
Name: label, dtype: float64

In [8]:
# Stack train on top of test with a fresh 0..n-1 index; the first
# `train_size` rows are the training rows.
train_size = len(train)
data = pd.concat([train, test], axis=0, ignore_index=True)

## 特征工程

### 1 基础特征

In [9]:
# Text-length features: number of characters in each question.
data['q1_len'] = data['q1'].astype(str).str.len()
data['q2_len'] = data['q2'].astype(str).str.len()

In [10]:
data['q1_len'].describe()

count    10000.000000
mean        10.658400
std          4.019095
min          3.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         49.000000
Name: q1_len, dtype: float64

In [11]:
# Length-difference features: signed difference, absolute difference and both ratios.
len_gap = data['q1_len'] - data['q2_len']
data['q1q2_len_diff'] = len_gap
data['q1q2_len_diff_abs'] = len_gap.abs()
data['q1q2_rate'] = data['q1_len'].div(data['q2_len'])
data['q2q1_rate'] = data['q2_len'].div(data['q1_len'])


In [12]:
# Special-symbol features: 1 when the question ends with a full-width question mark, else 0.
for side in ('q1', 'q2'):
    data[side + '_end_special'] = data[side].str.endswith('？').astype(int)


## 2 共现字特征


In [13]:
# Number of distinct characters that occur in both questions of a pair.
data['comm_q1q2char_nums'] = [
    len(set(first) & set(second))
    for first, second in zip(data['q1'], data['q2'])
]

In [14]:
# Position of a shared character
def char_match_pos(q1, q2, pos_i):
    """Locate the character ``q1[pos_i]`` inside the first 25 characters of ``q2``.

    Args:
        q1: First question (a string or any iterable of characters).
        q2: Second question (a string or any iterable of characters).
        pos_i: Zero-based index of the character of ``q1`` to look up.

    Returns:
        int: the 1-based position of the first occurrence within the first
        25 characters of ``q2``; 0 when the character does not occur there
        (this includes an empty ``q2``, which previously raised
        ``UnboundLocalError``); -1 when ``q1`` has no character at ``pos_i``.
    """
    q1_chars = list(q1)
    if pos_i >= len(q1_chars):
        return -1  # q1 is too short to have a character at this position

    target = q1_chars[pos_i]
    # Only the first 25 characters of q2 are searched.
    for pos_j, ch in enumerate(list(q2)[:25]):
        if ch == target:
            return pos_j + 1  # report the match position, 1-based
    return 0  # no match (or nothing to search)


# One feature per leading character of q1 (positions 1 to 8): where that
# character is found in q2, as reported by char_match_pos.
for pos_i in range(8):
    positions = [
        char_match_pos(first, second, pos_i)
        for first, second in zip(data['q1'], data['q2'])
    ]
    data['q1_pos_' + str(pos_i + 1)] = pd.Series(positions, index=data.index).astype(np.int8)

In [15]:
# todo 这里也可以用结巴分词，改成“词”粒度的
data["q1_pos_1"]

0        0
1        4
2        1
3        0
4        0
        ..
9995     4
9996     1
9997     0
9998    11
9999     0
Name: q1_pos_1, Length: 10000, dtype: int8

## 3 距离特征

In [16]:
print("===========距离特征 =============")

# String-distance callables, keyed by the column-name prefix they produce.
sim_func_dict = {
    "jaccard": distance.jaccard,
    "sorensen": distance.sorensen,
    "levenshtein": distance.levenshtein,
    "ratio": Levenshtein.ratio,
}

# (q1 prefix length, q2 prefix length) pairs used for the truncated comparisons.
qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]

for sim_func in tqdm(sim_func_dict, desc="距离特征"):
    measure = sim_func_dict[sim_func]

    # Distance between the two complete questions.
    data[sim_func] = data.apply(lambda row: measure(row["q1"], row["q2"]), axis=1)

    # Distance between a prefix of q1 and a prefix of q2.
    for q_len, t_len in qt:
        # The 3-character-prefix variants are not generated for levenshtein.
        if q_len == 3 and sim_func == "levenshtein":
            continue
        column = sim_func + '_q' + str(q_len) + '_t' + str(t_len)
        data[column] = data.apply(
            lambda row: measure(row["q1"][:q_len], row["q2"][:t_len]),
            axis=1)




距离特征: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.78s/it]


## 4 文本向量匹配特征

In [17]:
import os
import gensim
import jieba
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import word2vec

## 分词

In [18]:
def _segment(text):
    """Cut ``text`` into words with jieba, dropping empty tokens."""
    return [token for token in jieba.cut(text) if token]


# Word-level token lists for both questions.
data['q1_words_list'] = data['q1'].apply(_segment)
data['q2_words_list'] = data['q2'].apply(_segment)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 0.515 seconds.
Prefix dict has been built successfully.


In [19]:
data["q1_words_list"]

0                             [有, 哪些, 女明星, 被, 潜规则, 啦]
1                               [怎么, 支付宝, 绑定, 银行卡, ？]
2                            [请问, 这部, 电视剧, 叫, 什么, 名字]
3                                     [泰, 囧, 完整版, 下载]
4                   [在, 沧州, 市区, 哪家, 卖, 的, 盐焗鸡, 好吃, ？]
                            ...                      
9995                                    [小额贷款, 怎么, 贷]
9996                               [这是, 什么, 乌龟, 阿, ？]
9997                                 [如何, 申请, 福利, 企业]
9998    [安徽, 三联, 学院, 2015, 新生, 学费, 可以, 开学, 自己, 带去, 吗]
9999                 [这, 只能, 说明, 你, 不, 矜持, !, 什么, 意思]
Name: q1_words_list, Length: 10000, dtype: object

In [20]:
# Word2vec training corpus: every tokenised q1 followed by every tokenised q2.
sentences=data['q1_words_list'].values.tolist()+data['q2_words_list'].values.tolist()
len(sentences)  # not the cell's last expression, so the notebook does not display this value
sentences[:3]

[['有', '哪些', '女明星', '被', '潜规则', '啦'],
 ['怎么', '支付宝', '绑定', '银行卡', '？'],
 ['请问', '这部', '电视剧', '叫', '什么', '名字']]

## 训练词向量

In [21]:
# Make sure the output folder exists before anything is written to it.
models_dir_missing = not os.path.exists('models')
if models_dir_missing:
    os.mkdir('models')

# Skip-gram (sg=1) word2vec over the tokenised questions; min_count=1 keeps every token.
w2v_settings = dict(vector_size=100, window=10, min_count=1, workers=4, sg=1)
w2v_model = word2vec.Word2Vec(sentences, **w2v_settings)

# Persist the full model and a plain-text copy of the vectors.
w2v_model.save('models/' + 'word2vec.model')
w2v_model.wv.save_word2vec_format('models/' + 'word2vec.txt', binary=False)

In [22]:
len(w2v_model.wv.index_to_key)

11027

In [23]:
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
    minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
    seuclidean, sqeuclidean

from tqdm import tqdm

tqdm.pandas()

# Word-vector distance between two tokenised sentences
def _mean_w2v(words):
    """Average the word2vec vectors of the in-vocabulary ``words``.

    Returns the mean as a plain list. When no word is in the vocabulary the
    result is a zero vector (the previous code divided by a zero count and
    produced NaNs instead).
    """
    total = np.zeros(w2v_model.wv.vector_size)
    count = 0
    for w in words:
        if w in w2v_model.wv:
            total += w2v_model.wv[w]
            count += 1
    if count:
        total /= count
    return total.tolist()


# Distance selected by ``num``. Codes 2 and 3 are divided by the vector length.
_W2V_DISTANCES = {
    1: cosine,
    2: lambda u, v: canberra(u, v) / len(u),
    3: lambda u, v: cityblock(u, v) / len(u),
    4: euclidean,
    5: braycurtis,
    6: minkowski,
    7: correlation,
    8: chebyshev,
    # NOTE(review): jensenshannon is defined for probability vectors; on raw
    # embedding means it may yield nan - confirm this feature is wanted.
    9: jensenshannon,
    12: sqeuclidean,
}

# Codes 10 (mahalanobis) and 11 (seuclidean): the original called these scipy
# functions with two arguments only, the call failed, and the blanket
# ``except`` turned that into 0 for every row. The constant 0 is kept so the
# downstream feature columns are unchanged, but it is now explicit.
# NOTE(review): these two features carry no information; supply an inverse
# covariance / variance vector or drop the columns.
_W2V_ALWAYS_ZERO = (10, 11)


def get_w2v(query, title, num):
    """Distance between the mean word2vec vectors of two token lists.

    Args:
        query: Tokens of the first sentence.
        title: Tokens of the second sentence.
        num: Distance code: 1 cosine, 2 canberra / dim, 3 cityblock / dim,
            4 euclidean, 5 braycurtis, 6 minkowski, 7 correlation,
            8 chebyshev, 9 jensenshannon, 10 mahalanobis (always 0),
            11 seuclidean (always 0), 12 sqeuclidean.

    Returns:
        The distance value; 0 for codes 10 and 11 or when the distance call
        raises TypeError/ValueError; None for an unknown code (as before).
        Degenerate inputs such as a zero vector may still give nan for some
        distances.
    """
    if num in _W2V_ALWAYS_ZERO:
        return 0

    dist_func = _W2V_DISTANCES.get(num)
    if dist_func is None:
        return None

    query_vec = _mean_w2v(query)
    title_vec = _mean_w2v(title)
    try:
        return dist_func(query_vec, title_vec)
    except (TypeError, ValueError):
        # Best-effort fallback kept from the original implementation.
        return 0
# Word-vector distance features: one column per distance code accepted by get_w2v.
w2v_feature_codes = [
    ('vec_cosine', 1),
    ('vec_canberra', 2),
    ('vec_cityblock', 3),
    ('vec_euclidean', 4),
    ('vec_braycurtis', 5),
    ('vec_minkowski', 6),
    ('vec_correlation', 7),
    ('vec_chebyshev', 8),
    ('vec_jensenshannon', 9),
    ('vec_mahalanobis', 10),
    ('vec_seuclidean', 11),
    ('vec_sqeuclidean', 12),
]
for feature_name, code in w2v_feature_codes:
    data[feature_name] = data.progress_apply(
        lambda row, code=code: get_w2v(row['q1_words_list'], row['q2_words_list'], code),
        axis=1)

# Only these six columns are downcast to float32; the others keep the dtype
# produced by progress_apply.
for feature_name in ('vec_cosine', 'vec_canberra', 'vec_cityblock',
                     'vec_euclidean', 'vec_braycurtis', 'vec_correlation'):
    data[feature_name] = data[feature_name].astype('float32')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 8538.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 8650.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 11096.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 10180.99it/s]
100%|███████████████████████████████████████████████████████

In [24]:
data['vec_cosine']

0       0.041676
1       0.009618
2       0.007044
3       0.015732
4       0.108460
          ...   
9995    0.069496
9996    0.057783
9997    0.039493
9998    0.048787
9999    0.029870
Name: vec_cosine, Length: 10000, dtype: float32

## 5 向量特征

In [25]:
def w2v_sent2vec(words):
    """Return the L2-normalised sum of the word2vec vectors of ``words``.

    Args:
        words: Tokens of one sentence; tokens missing from the vocabulary
            are skipped.

    Returns:
        list[float]: float32 values, one per embedding dimension. A zero
        vector is returned when no token is in the vocabulary or when the
        summed vector has zero norm (the previous code returned a scalar
        nan in the first case, which cannot be expanded into columns).
    """
    dim = w2v_model.wv.vector_size
    vectors = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if not vectors:
        return [0.0] * dim

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return [0.0] * dim  # avoid 0/0 when the vectors cancel out
    return (v / norm).astype(np.float32).tolist()


# Spread each sentence vector over 100 numbered columns, first for q1 and then for q2.
for side in ('q1', 'q2'):
    fea_names = [side + '_vec_{}'.format(i) for i in range(100)]
    data[fea_names] = data.progress_apply(
        lambda row: w2v_sent2vec(row[side + '_words_list']),
        result_type='expand', axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 10799.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 9465.82it/s]


In [26]:
data.columns

Index(['q1', 'q2', 'label', 'q1_len', 'q2_len', 'q1q2_len_diff',
       'q1q2_len_diff_abs', 'q1q2_rate', 'q2q1_rate', 'q1_end_special',
       ...
       'q2_vec_90', 'q2_vec_91', 'q2_vec_92', 'q2_vec_93', 'q2_vec_94',
       'q2_vec_95', 'q2_vec_96', 'q2_vec_97', 'q2_vec_98', 'q2_vec_99'],
      dtype='object', length=268)

## 6 模型训练

In [27]:
# Raw text, token lists and the target are not model inputs; every other column is a feature.
no_feas = ['q1', 'q2', 'label', 'q1_words_list', 'q2_words_list']
features = data.columns[~data.columns.isin(no_feas)].tolist()

# Undo the earlier concatenation: the first `train_size` rows are the training rows.
train = data.iloc[:train_size]
test = data.iloc[train_size:]
print(len(features))
print(features)

263
['q1_len', 'q2_len', 'q1q2_len_diff', 'q1q2_len_diff_abs', 'q1q2_rate', 'q2q1_rate', 'q1_end_special', 'q2_end_special', 'comm_q1q2char_nums', 'q1_pos_1', 'q1_pos_2', 'q1_pos_3', 'q1_pos_4', 'q1_pos_5', 'q1_pos_6', 'q1_pos_7', 'q1_pos_8', 'jaccard', 'jaccard_q3_t3', 'jaccard_q3_t5', 'jaccard_q5_t5', 'jaccard_q5_t10', 'jaccard_q10_t10', 'jaccard_q10_t15', 'jaccard_q15_t15', 'jaccard_q15_t25', 'sorensen', 'sorensen_q3_t3', 'sorensen_q3_t5', 'sorensen_q5_t5', 'sorensen_q5_t10', 'sorensen_q10_t10', 'sorensen_q10_t15', 'sorensen_q15_t15', 'sorensen_q15_t25', 'levenshtein', 'levenshtein_q5_t5', 'levenshtein_q5_t10', 'levenshtein_q10_t10', 'levenshtein_q10_t15', 'levenshtein_q15_t15', 'levenshtein_q15_t25', 'ratio', 'ratio_q3_t3', 'ratio_q3_t5', 'ratio_q5_t5', 'ratio_q5_t10', 'ratio_q10_t10', 'ratio_q10_t15', 'ratio_q15_t15', 'ratio_q15_t25', 'vec_cosine', 'vec_canberra', 'vec_cityblock', 'vec_euclidean', 'vec_braycurtis', 'vec_minkowski', 'vec_correlation', 'vec_chebyshev', 'vec_jensensh

In [28]:
# Model inputs for the training and test rows, plus the training target.
X, X_test = train[features], test[features]
y = train['label']

In [29]:
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [30]:
# 5-fold cross-validation with shuffling; the fixed random_state keeps the
# fold assignment the same from run to run.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True,random_state=1314)

In [31]:
# LightGBM hyper-parameters for the binary (duplicate / not duplicate) task.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # L2 penalty weight: larger values regularise more strongly
    'min_gain_to_split': 0.2,
}

# Out-of-fold probabilities for the training rows and the fold-averaged
# probabilities for the test rows.
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    # folds.split yields positional indices, so both X and y are sliced with
    # .iloc (y[train_index] only worked because the index was 0..n-1).
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # A classifier matches the binary objective; predict_proba[:, 1] is the
    # positive-class probability that the regressor's predict() used to return.
    model = lgb.LGBMClassifier(**params, n_estimators=50000, n_jobs=-1)
    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keywords
    # were removed in LightGBM 4.0; on newer versions use
    # callbacks=[lgb.early_stopping(200), lgb.log_evaluation(50)] - confirm
    # against the installed version.
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='binary_logloss',
              verbose=50, early_stopping_rounds=200)

    # Use the early-stopped iteration for both validation and test predictions.
    y_pred_valid = model.predict_proba(X_valid, num_iteration=model.best_iteration_)[:, 1]
    y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1]
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred
prediction /= n_fold

Training until validation scores don't improve for 200 rounds
[50]	training's binary_logloss: 0.350516	valid_1's binary_logloss: 0.405855
[100]	training's binary_logloss: 0.309711	valid_1's binary_logloss: 0.389669
[150]	training's binary_logloss: 0.282233	valid_1's binary_logloss: 0.382597
[200]	training's binary_logloss: 0.26038	valid_1's binary_logloss: 0.38043
[250]	training's binary_logloss: 0.240912	valid_1's binary_logloss: 0.37968
[300]	training's binary_logloss: 0.223483	valid_1's binary_logloss: 0.378818
[350]	training's binary_logloss: 0.208562	valid_1's binary_logloss: 0.381546
[400]	training's binary_logloss: 0.194883	valid_1's binary_logloss: 0.382515
Early stopping, best iteration is:
[235]	training's binary_logloss: 0.246771	valid_1's binary_logloss: 0.378248
Training until validation scores don't improve for 200 rounds
[50]	training's binary_logloss: 0.352481	valid_1's binary_logloss: 0.395883
[100]	training's binary_logloss: 0.309016	valid_1's binary_logloss: 0.38409


[200]	training's binary_logloss: 0.255646	valid_1's binary_logloss: 0.388844
[250]	training's binary_logloss: 0.237118	valid_1's binary_logloss: 0.388846
[300]	training's binary_logloss: 0.220535	valid_1's binary_logloss: 0.390432
[350]	training's binary_logloss: 0.205238	valid_1's binary_logloss: 0.391897
[400]	training's binary_logloss: 0.19206	valid_1's binary_logloss: 0.394902
Early stopping, best iteration is:
[245]	training's binary_logloss: 0.238918	valid_1's binary_logloss: 0.387534


In [32]:
from sklearn.metrics import accuracy_score

# Turn the out-of-fold probabilities into class decisions at a 0.5 cut-off
# and measure how often they agree with the training labels.
y_pred = oof > 0.5
train_labels = train['label'].values
score = accuracy_score(y_pred, train_labels)

score

0.8388

In [33]:
# Binarise the averaged test probabilities at 0.5 and store them in the submission frame.
above_cutoff = prediction > 0.5
sub_pred = above_cutoff.astype(int)
sample_submit['label'] = sub_pred

In [34]:
# Write only the predicted label column, without the row index.
sample_submit[['label']].to_csv('lgb.csv', index=False)

In [35]:
sample_submit['label'].value_counts()

1    2906
0    2094
Name: label, dtype: int64