In [3]:
import gc
import os
import sys
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectPercentile, f_classif, chi2
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import Binarizer, LabelEncoder, scale
from sklearn.neighbors import NearestNeighbors

warnings.filterwarnings('ignore')

In [4]:
# 数据目录
data_path = '/home/kesci/input/smart_edu7557'
# 工作目录
current_path = '/home/kesci/work'

In [5]:
# Simple group-by statistics
def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per ``group_by`` and left-join the result onto ``df_merge``.

    Args:
        df: frame the statistics are computed from.
        df_merge: frame that receives the new columns.
        group_by: list of key column names.
        agg: dict mapping a column name to a list of aggregation method names.

    Returns:
        ``df_merge`` left-merged with one new column per (column, method)
        pair, named ``<keys joined by '_'>_<column>_<method>``.
    """
    prefix = '_'.join(group_by)
    flat_names = [
        '{}_{}_{}'.format(prefix, on, method)
        for on, methods in agg.items()
        for method in methods
    ]

    grouped = df.groupby(group_by).agg(agg)
    # Replace the (column, method) MultiIndex with the flat names built above.
    grouped.columns = flat_names
    grouped = grouped.reset_index()
    merged = df_merge.merge(grouped, on=group_by, how='left')

    # Release the intermediate table immediately.
    del grouped
    gc.collect()
    return merged

In [6]:
# Coefficient-of-variation (std / mean) statistics
def stat_cv(df, df_merge, group_by, on):
    """Add the per-group coefficient of variation of ``on`` to ``df_merge``.

    Args:
        df: frame the statistic is computed from.
        df_merge: frame that receives the new column.
        group_by: list of key column names.
        on: name of the column whose std / mean is computed per group.

    Returns:
        ``df_merge`` left-merged with a column named
        ``<keys joined by '_'>_<on>_cv``.
    """
    cv_name = '{}_{}_cv'.format('_'.join(group_by), on)

    moments = df.groupby(group_by).agg({on: ['std', 'mean']})
    moments.columns = ['std', 'mean']
    moments = moments.reset_index()

    moments['cv'] = moments['std'] / moments['mean']
    cv_table = moments.drop(['std', 'mean'], axis=1)
    cv_table.columns = group_by + [cv_name]

    merged = df_merge.merge(cv_table, on=group_by, how='left')

    # Release the intermediate tables immediately.
    del moments, cv_table
    gc.collect()
    return merged

In [7]:
# Rank statistics
def stat_rank(df, df_merge, group_by):
    """Add statistics of each group's within-exam rank to ``df_merge``.

    Every row of ``df`` is ranked inside its ``exam_id`` by ``score``
    (highest score gets rank 1, ties share the smallest rank). The ranks are
    then aggregated per ``group_by`` with mean / std / max / min.

    Args:
        df: frame holding ``exam_id``, ``score`` and the ``group_by`` columns.
        df_merge: frame that receives the new columns.
        group_by: list of key column names.

    Returns:
        ``df_merge`` left-merged with ``<keys>_rank_<method>`` columns.
    """
    methods = ['mean', 'std', 'max', 'min']
    prefix = '_'.join(group_by)

    # Only the keys and the rank are needed, so build a narrow frame instead
    # of deep-copying every column of ``df``.
    ranked = df[group_by].copy()
    ranked['rank'] = df.groupby(['exam_id'])['score'].rank(method='min',
                                                           ascending=False)

    rank_stats = ranked.groupby(group_by).agg({'rank': methods})
    rank_stats.columns = ['{}_rank_{}'.format(prefix, m) for m in methods]
    rank_stats = rank_stats.reset_index()

    # Join on the grouping keys explicitly (as stat / stat_cv do) rather than
    # on whatever columns the two frames happen to share.
    return df_merge.merge(rank_stats, on=group_by, how='left')

In [8]:
# Mean of (student score / exam mean score)
def stat_ratio_mean(df, df_merge, group_by):
    """Add each group's mean ratio of score to exam-average score.

    For every row of ``df`` the score is divided by the mean score of its
    ``exam_id``; the ratios are then averaged per ``group_by``.

    Args:
        df: frame holding ``exam_id``, ``score`` and the ``group_by`` columns.
        df_merge: frame that receives the new column.
        group_by: list of key column names.

    Returns:
        ``df_merge`` left-merged with a column named
        ``student_score_exam_mean_score_ratio_<keys>_mean``.
    """
    name = 'student_score_exam_mean_score_ratio_{}_mean'.format(
        '_'.join(group_by))

    exam_mean_score = df.groupby(['exam_id'])['score'].transform('mean')

    # Narrow frame (keys + ratio) instead of a deep copy of all of ``df``.
    ratios = df[group_by].copy()
    ratios[name] = df['score'] / exam_mean_score

    ratio_mean = ratios.groupby(group_by)[name].mean().reset_index()

    # Join on the grouping keys explicitly (as stat / stat_cv do) rather than
    # on whatever columns the two frames happen to share.
    return df_merge.merge(ratio_mean, on=group_by, how='left')

In [9]:
# Time-window features
def stat_time(df, df_merge, window, group_by, on, method):
    """Add a rolling statistic over each group's previous rows.

    Rows are ordered by ``group_by + ['order']``. For every row the statistic
    ``method`` is computed over up to ``window`` immediately preceding rows of
    the same group; the current row itself is excluded by a one-step shift.

    The shift and the rolling window are both evaluated per group. Rolling
    over the whole shifted frame would let the first rows of one group
    average in the trailing values of the previous group.

    Args:
        df: frame holding the ``group_by`` columns, ``order`` and ``on``.
        df_merge: frame that receives the new column.
        window: maximum number of previous rows in the window.
        group_by: list of key column names.
        on: name of the column the statistic is computed on.
        method: rolling aggregation name, e.g. ``'mean'`` or ``'std'``.

    Returns:
        ``df_merge`` left-merged on ``group_by + ['order']`` with a column
        named ``pre_<window>_<on>_<method>``.
    """
    name = 'pre_{}_{}_{}'.format(window, on, method)
    keys = group_by + ['order']

    ordered = df[keys + [on]].sort_values(keys)
    ordered[name] = ordered.groupby(group_by)[on].transform(
        lambda s: s.shift(1).rolling(window=window, min_periods=1).agg(method))

    return df_merge.merge(ordered[keys + [name]], on=keys, how='left')

In [10]:
def gen_feature(df, index, columns, values):
    """Pivot ``df`` into one row per ``index`` value with suffixed columns.

    Args:
        df: long-format frame.
        index: name of the column that becomes the row key.
        columns: name of the column whose values become new columns.
        values: name of the column that fills the pivoted cells.

    Returns:
        A frame whose first column is ``index`` and whose other columns are
        named ``<column value>_<values>``; missing cells are filled with 0.
    """
    wide = df.pivot_table(index=index, columns=columns, values=values)
    wide = wide.reset_index().fillna(0)

    renamed = []
    for col in wide.columns.values.tolist():
        # Keep the key column's name; tag every pivoted column with ``values``.
        if col == index:
            renamed.append(col)
        else:
            renamed.append('{}_{}'.format(col, values))
    wide.columns = renamed

    return wide

In [11]:
def select_feature(df):
    """Keep the features retained by both a chi2 and an ANOVA-F selector.

    NaNs in ``df`` are filled with 0 in place. Both selectors keep the top
    ``p`` percent of features and are fitted only on rows whose ``score`` was
    present before the fill. Unlabelled rows would otherwise enter the fit
    with an artificial target of 0. If no row is labelled, every row is used.

    Args:
        df: feature frame containing ``score``, ``student_id``, ``course``
            and ``exam_id`` plus numeric feature columns. Mutated in place by
            the NaN fill.

    Returns:
        A frame with the selected feature columns followed by ``score``,
        ``student_id``, ``course`` and ``exam_id``, for every row of ``df``.
    """
    p = 86  # percentile of features each selector keeps

    ycol = 'score'
    basic_cols = [ycol, 'student_id', 'course', 'exam_id']

    # Record which rows carry a real target before NaNs are overwritten.
    # A positional boolean array is used so duplicate index labels are harmless.
    labelled = df[ycol].notnull().values
    if not labelled.any():
        labelled = np.ones(len(df), dtype=bool)

    df.fillna(0, inplace=True)
    feature_names = [f for f in df.columns if f not in basic_cols]

    X = df.loc[labelled, feature_names]
    Y = df.loc[labelled, ycol]
    # chi2 needs non-negative inputs: standardise, then threshold to 0/1.
    X_bin = Binarizer().fit_transform(scale(X))

    selectChi2 = SelectPercentile(chi2, percentile=p).fit(X_bin, Y)
    selectF_classif = SelectPercentile(f_classif, percentile=p).fit(X, Y)

    chi2_selected = selectChi2.get_support()
    print('Chi2 selected {} features.'.format(chi2_selected.sum()))
    f_classif_selected = selectF_classif.get_support()
    print('F_classif selected {} features.'.format(f_classif_selected.sum()))
    selected = chi2_selected & f_classif_selected
    print('Chi2 & F_classif selected {} features'.format(selected.sum()))
    features = [f for f, s in zip(feature_names, selected) if s]

    # Report the features that were dropped.
    x = sorted(set(feature_names) - set(features))
    print(x)

    df_sel = df[features]
    df_basic = df[basic_cols]
    df_sel = pd.concat([df_sel, df_basic], axis=1)
    return df_sel

In [12]:
# Load the datasets
# all_knowledge.csv
df_knowledge = pd.read_csv(os.path.join(data_path, 'all_knowledge.csv'))

# Prefix section / category / knowledge_point with the course value so the
# labels are unique across courses
df_knowledge['section'] = df_knowledge['course'] + df_knowledge['section']
df_knowledge['category'] = df_knowledge['course'] + df_knowledge['category']
df_knowledge['knowledge_point'] = df_knowledge['course'] + df_knowledge['knowledge_point']

# course.csv
df_course = pd.read_csv(os.path.join(data_path, 'course.csv'))

# student.csv
df_student = pd.read_csv(os.path.join(data_path, 'student.csv'))

# submission_s2.csv: rows to predict; 'pred' is renamed to 'score' so the
# frame can be stacked with the training scores
df_test_score = pd.read_csv(os.path.join(data_path, 'submission_s2.csv'))
df_test_score.rename(columns={'pred': 'score'}, inplace=True)

# exam_score.csv
df_train_score_all = pd.read_csv(os.path.join(data_path, 'exam_score.csv'))

# Drop rows whose score is 0
df_train_score = df_train_score_all[df_train_score_all.score != 0]

# course1_exams.csv ~ course8_exams.csv
df_list = []
df_exams_order_list = []
for i in range(1, 9):
    df_exams = pd.read_csv(os.path.join(data_path, 'course{}_exams.csv'.format(i)))
    # The row position inside the course file is stored as the exam's 'order'
    df_exams_order = df_exams[['exam_id']]
    df_exams_order['order'] = df_exams.index
    df_exams_order_list.append(df_exams_order)

    del (df_exams_order)
    gc.collect()

    # Wide -> long: one row per (exam_id, knowledge_point) holding its 'ratio'
    df_exams.set_index('exam_id', inplace=True)
    df_exams = df_exams.stack().to_frame()
    df_exams.reset_index(inplace=True)
    df_exams.rename(columns={
            'level_1': 'knowledge_point',
            0: 'ratio'
    },inplace=True)
    # Course prefix; presumably matches the prefix added to df_knowledge above
    # (depends on the values of df_knowledge['course'] -- TODO confirm)
    df_exams['knowledge_point'] = 'course{}'.format(i) + df_exams['knowledge_point']

    df_list.append(df_exams)

df_exams_ratio = pd.concat(df_list, axis=0)
df_exams_orders = pd.concat(df_exams_order_list, axis=0)

In [13]:
# Feature table: training scores stacked on top of the rows to predict
df_feature = pd.concat([df_train_score, df_test_score])

In [14]:
# Features that do not depend on score
# Student attributes from student.csv (gender, per the original note)
df_feature = df_feature.merge(df_student, on='student_id', how='left')

# Exam order; no `on=` is given, so pandas joins on the columns both frames share
df_feature = df_feature.merge(df_exams_orders, how='left')

# Share of knowledge points with a non-zero ratio in each exam
# (the column is called exam_kp_num but holds non-zero count / total count)
df_temp = df_exams_ratio.groupby(['exam_id'])['ratio'].apply(lambda x: (x != 0).sum() / x.shape[0]).\
        reset_index().rename(columns={'ratio': 'exam_kp_num'})
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Knowledge-point span per exam: max - min of the integer found after ':' in
# the knowledge-point label, taken over points with a non-zero ratio
df_temp = df_exams_ratio[df_exams_ratio.ratio != 0]
df_temp['kp'] = df_temp['knowledge_point'].str.split(':', expand=True)[1]
df_temp['kp'] = df_temp['kp'].astype('int')
df_temp['kp_max'] = df_temp.groupby(['exam_id'])['kp'].transform('max')
df_temp['kp_min'] = df_temp.groupby(['exam_id'])['kp'].transform('min')
df_temp['kp_range'] = df_temp['kp_max'] - df_temp['kp_min']
df_temp = df_temp[['exam_id', 'kp_range']].drop_duplicates()
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Share of sections covered by each exam (sections with non-zero ratio sum / all sections)
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = stat(df_temp, df_temp, ['exam_id', 'section'], {'ratio': ['sum']})
df_temp = df_temp[['exam_id', 'section', 'exam_id_section_ratio_sum']]
# stat() merges the group sum back onto every row, so collapse to one row per (exam, section)
df_temp.drop_duplicates(inplace=True)
df_temp = df_temp.groupby(['exam_id'])['exam_id_section_ratio_sum'].\
            apply(lambda x: (x != 0).sum() / x.shape[0]).reset_index().rename(columns={'exam_id_section_ratio_sum': 'exam_section_num'})
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Section span per exam: max - min of the integer after ':' in the section
# label, over sections with a non-zero ratio sum
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = stat(df_temp, df_temp, ['exam_id', 'section'], {'ratio': ['sum']})
df_temp = df_temp[['exam_id', 'section', 'exam_id_section_ratio_sum']]
df_temp.drop_duplicates(inplace=True)
df_temp = df_temp[df_temp.exam_id_section_ratio_sum != 0]
df_temp['sec'] = df_temp['section'].str.split(':', expand=True)[1]
df_temp['sec'] = df_temp['sec'].astype('int')
df_temp['sec_max'] = df_temp.groupby(['exam_id'])['sec'].transform('max')
df_temp['sec_min'] = df_temp.groupby(['exam_id'])['sec'].transform('min')
df_temp['sec_range'] = df_temp['sec_max'] - df_temp['sec_min']
df_temp = df_temp[['exam_id', 'sec_range']].drop_duplicates()
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Share of categories covered by each exam (categories with non-zero ratio sum / all categories)
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = stat(df_temp, df_temp, ['exam_id', 'category'], {'ratio': ['sum']})
df_temp = df_temp[['exam_id', 'category', 'exam_id_category_ratio_sum']]
df_temp.drop_duplicates(inplace=True)
df_temp = df_temp.groupby(['exam_id'])['exam_id_category_ratio_sum'].\
            apply(lambda x: (x != 0).sum() / x.shape[0]).reset_index().rename(columns={'exam_id_category_ratio_sum': 'exam_category_num'})
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Category span per exam: max - min of the integer after ':' in the category
# label, over categories with a non-zero ratio sum
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = stat(df_temp, df_temp, ['exam_id', 'category'], {'ratio': ['sum']})
df_temp = df_temp[['exam_id', 'category', 'exam_id_category_ratio_sum']]
df_temp.drop_duplicates(inplace=True)
df_temp = df_temp[df_temp.exam_id_category_ratio_sum != 0]
df_temp['cat'] = df_temp['category'].str.split(':', expand=True)[1]
df_temp['cat'] = df_temp['cat'].astype('int')
df_temp['cat_max'] = df_temp.groupby(['exam_id'])['cat'].transform('max')
df_temp['cat_min'] = df_temp.groupby(['exam_id'])['cat'].transform('min')
df_temp['cat_range'] = df_temp['cat_max'] - df_temp['cat_min']
df_temp = df_temp[['exam_id', 'cat_range']].drop_duplicates()
df_feature = df_feature.merge(df_temp, on='exam_id', how='left')
del (df_temp)
gc.collect()

# Ratio share of each complexity level within an exam
df_course_exam_feature = df_exams_ratio.merge(df_knowledge,
                                                  on=['knowledge_point'],
                                                  how='left')
df_temp = df_course_exam_feature[['exam_id', 'complexity', 'ratio']]
df_temp = stat(df_temp, df_temp, ['exam_id', 'complexity'], {'ratio': ['sum']})
# One column per complexity level, one row per exam
p = gen_feature(df_temp, 'exam_id', 'complexity', 'exam_id_complexity_ratio_sum')
df_feature = df_feature.merge(p, on='exam_id', how='left')

del (df_course_exam_feature)
del (df_temp)
del (p)
gc.collect()

# Overall exam difficulty: sum over knowledge points of ratio * complexity * 0.01
df_course_exam_feature = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_course_exam_feature['complexity_ratio'] = df_course_exam_feature['ratio'] * df_course_exam_feature['complexity'] * 0.01
df_exam_complexity_sum = df_course_exam_feature.groupby(['exam_id'])['complexity_ratio'].sum().reset_index()
df_exam_complexity_sum.rename(columns={'complexity_ratio': 'exam_complexity'}, inplace=True)
df_feature = df_feature.merge(df_exam_complexity_sum, on='exam_id',  how='left')

del (df_course_exam_feature)
del (df_exam_complexity_sum)
gc.collect()

# Course attributes from course.csv (course_class, per the original note)
df_feature = df_feature.merge(df_course, on='course', how='left')

# Rolling statistics over each student's previous exams within a course
df_score = pd.concat([df_train_score, df_test_score])
df_temp = df_score.merge(df_exams_orders, how='left')
# Rank inside each exam: highest score gets rank 1, ties share the smallest rank
df_temp['rank'] = df_temp.groupby(['exam_id'])['score'].rank(method='min',
                                                            ascending=False)
df_temp.sort_values(['student_id', 'course', 'order'], inplace=True)

# Mean / std of score and of rank over the previous 3 exams
df_feature = stat_time(df_temp, df_feature, 3, ['student_id', 'course'], 'score', 'mean')
df_feature = stat_time(df_temp, df_feature, 3, ['student_id', 'course'], 'score', 'std')
df_feature = stat_time(df_temp, df_feature, 3, ['student_id', 'course'], 'rank', 'mean')
df_feature = stat_time(df_temp, df_feature, 3, ['student_id', 'course'], 'rank', 'std')

# Mean of the 3-exam mean over the previous 8 exams
df_temp = df_temp.merge(df_feature[['student_id', 'course', 'order', 'pre_3_score_mean']],
                        on=['student_id', 'course', 'order'], how='left')
df_feature = stat_time(df_temp, df_feature, 8, ['student_id', 'course'], 'pre_3_score_mean', 'mean')

# Mean change between consecutive previous scores (window of 3).
# shift / diff / rolling are all evaluated inside each (student, course)
# group; chaining diff() and rolling() after a grouped shift would run them
# over the whole column and mix neighbouring groups.
df_temp['pre_3_score_diff_mean'] = df_temp.groupby(['student_id', 'course'])['score'].transform(
    lambda s: s.shift(1).diff().rolling(window=3, min_periods=1).mean())
df_temp = df_temp[['student_id', 'course', 'order', 'pre_3_score_diff_mean']]
df_feature = df_feature.merge(df_temp, on=['student_id', 'course', 'order'], how='left')

# Same change statistic, computed on the 3-exam mean instead of the raw score
df_temp = df_temp.merge(df_feature[['student_id', 'course', 'order', 'pre_3_score_mean']],
                        on=['student_id', 'course', 'order'], how='left')
df_temp['pre_3_pre_3_score_mean_diff_mean'] = df_temp.groupby(
    ['student_id', 'course'])['pre_3_score_mean'].transform(
        lambda s: s.shift(1).diff().rolling(window=3, min_periods=1).mean())
df_temp = df_temp[['student_id', 'course', 'order', 'pre_3_pre_3_score_mean_diff_mean']]
df_feature = df_feature.merge(df_temp, on=['student_id', 'course', 'order'], how='left')

del (df_score)
del (df_temp)
gc.collect()

# Scores of the three previous exams of the same student and course
df_temp = df_feature[['student_id', 'course', 'order', 'score']]
df_temp.sort_values(['student_id', 'course', 'order'], inplace=True)
gg = df_temp.groupby(['student_id', 'course'])
for i in range(3):
    # pre_1_score is the previous exam, pre_2_score the one before, ...
    df_temp['pre_{}_score'.format(i+1)] = gg['score'].shift(i+1)

df_temp.drop(['score'], axis=1, inplace=True)
df_feature = df_feature.merge(df_temp, on=['student_id', 'course', 'order'], how='left')
del (df_temp)
gc.collect()

# "Grafted" features: jiajie.csv is read from the working directory and joined
# per (student_id, exam_id); its contents are produced outside this notebook
df_jiajie = pd.read_csv(os.path.join(current_path, 'jiajie.csv'))
df_feature = df_feature.merge(df_jiajie, on=['student_id', 'exam_id'], how='left')
del(df_jiajie)
gc.collect()

In [17]:
# Combine the knowledge-point count and the exam difficulty into one feature
# Number of knowledge points with a non-zero ratio per exam
df_temp = df_exams_ratio.groupby(['exam_id'])['ratio'].apply(lambda x: (x != 0).sum()).\
        reset_index().rename(columns={'ratio': 'exam_kp_num'})
df_temp = df_temp.merge(df_feature[['exam_id', 'exam_complexity']].drop_duplicates(), how='left')
# The two values are concatenated as strings and parsed back as one float,
# e.g. '12' + '3.5' -> 123.5
# NOTE(review): an exam missing from df_feature would yield '<n>nan', which
# cannot be cast to float -- confirm every exam appears in df_feature
df_temp['exam_kp_num_exam_complexity'] = df_temp['exam_kp_num'].astype('str') + df_temp['exam_complexity'].astype('str')
df_temp['exam_kp_num_exam_complexity'] = df_temp['exam_kp_num_exam_complexity'].astype('float')
df_feature = df_feature.merge(df_temp[['exam_id', 'exam_kp_num_exam_complexity']], how='left')

del(df_temp)
gc.collect()

In [18]:
# Preview the last rows of the feature table
df_feature.tail()

In [19]:
# Random seed; used as random_state by the PCA steps in the following cells
seed = 2008

In [20]:
# Knowledge-point ratio profile of each exam, compressed with PCA
dimension1 = 60

df_course_exam_feature = df_exams_ratio.merge(df_knowledge,
                                              on=['knowledge_point'],
                                              how='left')
df_temp = df_course_exam_feature[['exam_id', 'knowledge_point', 'ratio']]
# One row per exam, one column per knowledge point (missing -> 0)
p = gen_feature(df_temp, 'exam_id', 'knowledge_point', 'ratio')
df_sparse_f = p.drop(['exam_id'], axis=1)
df_exam_f = p[['exam_id']]
# Project the wide matrix onto dimension1 principal components
pca = PCA(n_components=dimension1, random_state=seed)
df_no_sparse_f = pd.DataFrame(pca.fit_transform(df_sparse_f))
df_no_sparse_f.columns = [ 'exam_point_ratio' + str(c) for c in df_no_sparse_f.columns]
# Both frames carry the same default row positions, so a column-wise concat
# re-attaches exam_id to its components
df_exam_f = pd.concat([df_exam_f, df_no_sparse_f], axis=1)
df_feature = df_feature.merge(df_exam_f, on='exam_id', how='left')

del (df_course_exam_feature)
del (df_temp)
del (df_sparse_f)
del (df_exam_f)
del (df_no_sparse_f)
del (p)
gc.collect()

df_feature.shape

In [21]:
# Section and category ratio profile of each exam, compressed with PCA
dimension2 = 60

df_course_exam_feature = df_exams_ratio.merge(df_knowledge,
                                              on=['knowledge_point'],
                                              how='left')
df_temp = df_course_exam_feature[['exam_id', 'section', 'category', 'ratio']]

# Attach the per-(exam, section) and per-(exam, category) ratio sums to every row
df_temp = stat(df_temp, df_temp, ['exam_id', 'section'], {'ratio': ['sum']})
df_temp = stat(df_temp, df_temp, ['exam_id', 'category'], {'ratio': ['sum']})

df_exam_f = df_temp[['exam_id']]
df_exam_f.drop_duplicates(inplace=True)
# Pivot each sum into one column per section / category; the repeated rows of
# a group all hold the same sum, so the pivot's aggregation leaves it unchanged
p = gen_feature(df_temp, 'exam_id', 'section', 'exam_id_section_ratio_sum')
df_exam_f = df_exam_f.merge(p, on='exam_id', how='left')
p = gen_feature(df_temp, 'exam_id', 'category', 'exam_id_category_ratio_sum')
df_exam_f = df_exam_f.merge(p, on='exam_id', how='left')

df_sparse_f = df_exam_f.drop(['exam_id'], axis=1)
df_exam_f = df_exam_f[['exam_id']]
# Project the wide matrix onto dimension2 principal components
pca = PCA(n_components=dimension2, random_state=seed)
df_no_sparse_f = pd.DataFrame(pca.fit_transform(df_sparse_f))
df_no_sparse_f.columns = [
    'exam_section_cat_ratio' + str(c) for c in df_no_sparse_f.columns
]
df_exam_f = pd.concat([df_exam_f, df_no_sparse_f], axis=1)
df_feature = df_feature.merge(df_exam_f, on='exam_id', how='left')

del (df_course_exam_feature)
del (df_temp)
del (df_sparse_f)
del (df_exam_f)
del (df_no_sparse_f)
del (p)
gc.collect()

df_feature.shape

In [22]:
# Find similar exams from the knowledge-point ratio distribution
K = 3
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = df_temp[['exam_id', 'knowledge_point', 'ratio']]
p = gen_feature(df_temp, 'exam_id', 'knowledge_point', 'ratio')

# Row position -> exam_id, used to translate neighbour indices back to exams
index = list(p.index.values)
exam_ids = list(p['exam_id'].values)
index_exam_map = dict(zip(index,exam_ids))

p.drop(['exam_id'], axis=1, inplace=True)
# K+1 neighbours are requested and column 0 is dropped below, presumably
# because each exam's nearest neighbour is the exam itself
nbrs = NearestNeighbors(n_neighbors=K+1, algorithm="auto").fit(p)
distances, indices = nbrs.kneighbors(p)
df_distance = pd.DataFrame(distances)
df_distance.drop([0], axis=1, inplace=True)
df_distance.columns = ['dis_'+str(c) for c in df_distance.columns]
# Turn distances into inverse-distance weights; 1e-6 avoids division by zero
df_distance = df_distance.apply(lambda x: 1/(x+10**-6), axis=1)

df_indice = pd.DataFrame(indices)
df_indice.drop([0], axis=1, inplace=True)
df_indice = df_indice.replace(index_exam_map)
df_indice.columns = ['sim_exam_'+str(c) for c in df_indice.columns]

df_sim_exam = pd.concat([df_distance, df_indice], axis=1)
df_sim_exam['exam_id'] = exam_ids

# One row per (student, exam) of the feature table, carrying that exam's neighbours
df_sim_exam = df_sim_exam.merge(df_feature[['student_id' ,'exam_id']], on='exam_id', how='right')
for i in range(K):
    # Look up the same student's training score on the i-th similar exam
    df_sim_exam = df_sim_exam.merge(df_train_score[['student_id', 'exam_id', 'score']].rename(columns={'exam_id': 'eid'}),
                                    left_on=['student_id', 'sim_exam_'+str(i+1)],
                                    right_on=['student_id', 'eid'], how='left')
    df_sim_exam['sim_score_'+str(i+1)] = df_sim_exam['score']
    df_sim_exam.drop(['score', 'eid'], axis=1, inplace=True)

def cal_score(x, k=None):
    """Weighted average of a student's scores on the similar exams.

    Each similar exam ``i`` contributes ``x['sim_score_i']`` with weight
    ``x['dis_i']``. A score of 0 marks a neighbour without a usable score and
    gets weight 0. The weights are normalised to sum to 1.

    Args:
        x: row (Series or mapping) with ``dis_1..dis_k`` and
            ``sim_score_1..sim_score_k`` entries. It is not modified.
        k: number of similar exams; defaults to the notebook-level ``K``.

    Returns:
        The weighted score, or ``np.nan`` when no similar exam has a usable
        score. Without that guard the normalisation would divide 0 by 0.
    """
    if k is None:
        k = K

    weights = []
    scores = []
    for i in range(1, k + 1):
        score = x['sim_score_' + str(i)]
        # A missing score (stored as 0) must not take part in the average.
        weights.append(0 if score == 0 else x['dis_' + str(i)])
        scores.append(score)

    sum_dis = sum(weights)
    if sum_dis == 0:
        return np.nan

    sim_score = 0
    for weight, score in zip(weights, scores):
        sim_score += weight / sum_dis * score

    return sim_score

# Missing neighbour scores become 0, which cal_score treats as "no score"
df_sim_exam.fillna(0, inplace=True)
df_sim_exam['sim_exam_score'] = df_sim_exam.apply(cal_score, axis=1)

df_feature = df_feature.merge(df_sim_exam[['student_id' ,'exam_id', 'sim_exam_score']], on=['student_id', 'exam_id'], how='left')
del(df_temp)
del(p)
del(df_distance)
del(df_indice)
del(df_sim_exam)

gc.collect()

In [23]:
# Find similar exams from the section ratio distribution
K = 3
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = df_temp[['exam_id', 'section', 'ratio']]
df_temp = stat(df_temp, df_temp, ['exam_id', 'section'], {'ratio': ['sum']})
p = gen_feature(df_temp, 'exam_id', 'section', 'exam_id_section_ratio_sum')

# Row position -> exam_id, used to translate neighbour indices back to exams
index = list(p.index.values)
exam_ids = list(p['exam_id'].values)
index_exam_map = dict(zip(index,exam_ids))

p.drop(['exam_id'], axis=1, inplace=True)
# K+1 neighbours are requested and column 0 is dropped below, presumably
# because each exam's nearest neighbour is the exam itself
nbrs = NearestNeighbors(n_neighbors=K+1, algorithm="auto").fit(p)
distances, indices = nbrs.kneighbors(p)
df_distance = pd.DataFrame(distances)
df_distance.drop([0], axis=1, inplace=True)
df_distance.columns = ['dis_'+str(c) for c in df_distance.columns]
# Turn distances into inverse-distance weights; 1e-6 avoids division by zero
df_distance = df_distance.apply(lambda x: 1/(x+10**-6), axis=1)

df_indice = pd.DataFrame(indices)
df_indice.drop([0], axis=1, inplace=True)
df_indice = df_indice.replace(index_exam_map)
df_indice.columns = ['sim_exam_'+str(c) for c in df_indice.columns]

df_sim_exam = pd.concat([df_distance, df_indice], axis=1)
df_sim_exam['exam_id'] = exam_ids

# One row per (student, exam) of the feature table, carrying that exam's neighbours
df_sim_exam = df_sim_exam.merge(df_feature[['student_id' ,'exam_id']], on='exam_id', how='right')
for i in range(K):
    # Look up the same student's training score on the i-th similar exam
    df_sim_exam = df_sim_exam.merge(df_train_score[['student_id', 'exam_id', 'score']].rename(columns={'exam_id': 'eid'}),
                                    left_on=['student_id', 'sim_exam_'+str(i+1)],
                                    right_on=['student_id', 'eid'], how='left')
    df_sim_exam['sim_score_'+str(i+1)] = df_sim_exam['score']
    df_sim_exam.drop(['score', 'eid'], axis=1, inplace=True)

def cal_score(x):
    """Weighted average of a student's scores on the K similar exams.

    ``x`` is a row with ``dis_1..dis_K`` weights and ``sim_score_1..sim_score_K``
    scores. A score of 0 marks a neighbour without a usable score, so its
    weight is zeroed on the row. Returns ``np.nan`` when all weights are 0.
    """
    dis_keys = ['dis_' + str(j) for j in range(1, K + 1)]
    score_keys = ['sim_score_' + str(j) for j in range(1, K + 1)]

    # First pass: silence neighbours without a score and total the weights.
    sum_dis = 0
    for dis_key, score_key in zip(dis_keys, score_keys):
        if x[score_key] == 0:
            x[dis_key] = 0
            x[score_key] = 0
        sum_dis += x[dis_key]
    if sum_dis == 0:
        return np.nan

    # Second pass: accumulate the normalised weighted scores.
    sim_score = 0
    for dis_key, score_key in zip(dis_keys, score_keys):
        sim_score += x[dis_key] / sum_dis * x[score_key]
    return sim_score

# Missing neighbour scores become 0, which cal_score treats as "no score"
df_sim_exam.fillna(0, inplace=True)
df_sim_exam['sim_exam_form_section_score'] = df_sim_exam.apply(cal_score, axis=1)

df_feature = df_feature.merge(df_sim_exam[['student_id' ,'exam_id', 'sim_exam_form_section_score']], 
                              on=['student_id', 'exam_id'], how='left')
del(df_temp)
del(p)
del(df_distance)
del(df_indice)
del(df_sim_exam)

gc.collect()

In [24]:
# Find similar exams from the category ratio distribution
K = 3
df_temp = df_exams_ratio.merge(df_knowledge, on=['knowledge_point'], how='left')
df_temp = df_temp[['exam_id', 'category', 'ratio']]
df_temp = stat(df_temp, df_temp, ['exam_id', 'category'], {'ratio': ['sum']})
p = gen_feature(df_temp, 'exam_id', 'category', 'exam_id_category_ratio_sum')
# Row position -> exam_id, used to translate neighbour indices back to exams
index = list(p.index.values)
exam_ids = list(p['exam_id'].values)
index_exam_map = dict(zip(index,exam_ids))

# NOTE(review): NearestNeighbors is already imported in the first cell, so
# this repeated import is redundant
from sklearn.neighbors import NearestNeighbors
p.drop(['exam_id'], axis=1, inplace=True)
# K+1 neighbours are requested and column 0 is dropped below, presumably
# because each exam's nearest neighbour is the exam itself
nbrs = NearestNeighbors(n_neighbors=K+1, algorithm="auto").fit(p)
distances, indices = nbrs.kneighbors(p)
df_distance = pd.DataFrame(distances)
df_distance.drop([0], axis=1, inplace=True)
df_distance.columns = ['dis_'+str(c) for c in df_distance.columns]
# Turn distances into inverse-distance weights; 1e-6 avoids division by zero
df_distance = df_distance.apply(lambda x: 1/(x+10**-6), axis=1)

df_indice = pd.DataFrame(indices)
df_indice.drop([0], axis=1, inplace=True)
df_indice = df_indice.replace(index_exam_map)
df_indice.columns = ['sim_exam_'+str(c) for c in df_indice.columns]

df_sim_exam = pd.concat([df_distance, df_indice], axis=1)
df_sim_exam['exam_id'] = exam_ids

# One row per (student, exam) of the feature table, carrying that exam's neighbours
df_sim_exam = df_sim_exam.merge(df_feature[['student_id' ,'exam_id']], on='exam_id', how='right')
for i in range(K):
    # Look up the same student's training score on the i-th similar exam
    df_sim_exam = df_sim_exam.merge(df_train_score[['student_id', 'exam_id', 'score']].rename(columns={'exam_id': 'eid'}),
                                    left_on=['student_id', 'sim_exam_'+str(i+1)],
                                    right_on=['student_id', 'eid'], how='left')
    df_sim_exam['sim_score_'+str(i+1)] = df_sim_exam['score']
    df_sim_exam.drop(['score', 'eid'], axis=1, inplace=True)

def cal_score(x):
    """Weighted average of a student's scores on the K similar exams.

    ``x`` is a row with ``dis_1..dis_K`` weights and ``sim_score_1..sim_score_K``
    scores. A score of 0 marks a neighbour without a usable score, so its
    weight is zeroed on the row. Returns ``np.nan`` when all weights are 0.
    """
    dis_keys = ['dis_' + str(j) for j in range(1, K + 1)]
    score_keys = ['sim_score_' + str(j) for j in range(1, K + 1)]

    # First pass: silence neighbours without a score and total the weights.
    sum_dis = 0
    for dis_key, score_key in zip(dis_keys, score_keys):
        if x[score_key] == 0:
            x[dis_key] = 0
            x[score_key] = 0
        sum_dis += x[dis_key]
    if sum_dis == 0:
        return np.nan

    # Second pass: accumulate the normalised weighted scores.
    sim_score = 0
    for dis_key, score_key in zip(dis_keys, score_keys):
        sim_score += x[dis_key] / sum_dis * x[score_key]
    return sim_score

# Missing neighbour scores become 0, which cal_score treats as "no score"
df_sim_exam.fillna(0, inplace=True)
df_sim_exam['sim_exam_form_category_score'] = df_sim_exam.apply(cal_score, axis=1)

df_feature = df_feature.merge(df_sim_exam[['student_id' ,'exam_id', 'sim_exam_form_category_score']], 
                              on=['student_id', 'exam_id'], how='left')
del(df_temp)
del(p)
del(df_distance)
del(df_indice)
del(df_sim_exam)

gc.collect()

In [25]:
# Preview the first rows of the feature table
df_feature.head()

In [26]:
# Statistical features derived from score
def statis_feat(df_know, df_unknow):
    """Compute score statistics on ``df_know`` and attach them to ``df_unknow``.

    Args:
        df_know: rows whose ``score`` may be used to build statistics. Needs
            ``student_id``, ``course``, ``course_class``, ``gender``,
            ``exam_id``, ``score`` and ``exam_complexity``. It is not modified:
            the temporary ``s/c`` column lives on a derived frame.
        df_unknow: rows that receive the statistics via left merges.

    Returns:
        ``df_unknow`` with the student / course / gender level statistic
        columns appended.
    """
    # Student score: max, mean, std, median and cv
    df_unknow = stat(df_know, df_unknow, ['student_id'], {'score': ['max', 'mean', 'std', 'median']})
    df_unknow = stat_cv(df_know, df_unknow, ['student_id'], 'score')

    # mean(student's exam score / mean score of that exam)
    df_unknow = stat_ratio_mean(df_know, df_unknow, ['student_id'])

    # Rank statistics per student
    df_unknow = stat_rank(df_know, df_unknow, ['student_id'])

    # score / difficulty: how well the student holds up on harder exams.
    # Built on a derived frame so the caller's df_know is left untouched.
    df_sc = df_know.assign(**{'s/c': df_know['score'] / df_know['exam_complexity']})

    df_unknow = stat(df_sc, df_unknow, ['student_id'], {'s/c': ['mean', 'std', 'max', 'median']})
    df_unknow = stat(df_sc, df_unknow, ['student_id', 'course'], {'s/c': ['mean', 'std', 'max', 'median']})
    df_unknow = stat_cv(df_sc, df_unknow, ['student_id', 'course'], 's/c')
    df_unknow = stat_cv(df_sc, df_unknow, ['student_id'], 's/c')

    del df_sc

    # ******* Course features ******** #
    # mean, std and cv
    df_unknow = stat(df_know, df_unknow, ['course'], {'score': ['mean', 'std']})
    df_unknow = stat_cv(df_know, df_unknow, ['course'], 'score')

    # ******* Student x course features ******** #
    # max, mean, std, median and cv
    df_unknow = stat(df_know, df_unknow, ['student_id', 'course'], {'score': ['max', 'mean', 'std', 'median']})
    df_unknow = stat_cv(df_know, df_unknow, ['student_id', 'course'], 'score')
    df_unknow = stat(df_know, df_unknow, ['student_id', 'course_class'], {'score': ['max', 'mean', 'std', 'median']})
    df_unknow = stat_cv(df_know, df_unknow, ['student_id', 'course_class'], 'score')

    # Rank statistics per student and course
    df_unknow = stat_rank(df_know, df_unknow, ['student_id', 'course'])

    # Score-to-exam-mean ratio per student and course
    df_unknow = stat_ratio_mean(df_know, df_unknow, ['student_id', 'course'])

    # Score per gender and course: mean, std, median and cv
    df_unknow = stat(df_know, df_unknow, ['gender', 'course'], {'score': ['mean', 'std', 'median']})
    df_unknow = stat_cv(df_know, df_unknow, ['gender', 'course'], 'score')

    return df_unknow

In [27]:
# Score-based features built with 5-fold cross-validation
# Rows with a score are training rows; rows whose score is NaN are the test rows
df_train = df_feature[~df_feature['score'].isnull()]
df_train = df_train.reset_index(drop=True)
df_test = df_feature[df_feature['score'].isnull()]

# NOTE(review): seed is assigned here but the KFold below uses random_state=2018
seed = 2008
df_stas_feat = None
kf = KFold(n_splits=5, random_state=2018, shuffle=True)
for train_index, val_index in kf.split(df_train):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    # Statistics for the held-out fold come only from the other four folds
    df_fold_val = statis_feat(df_fold_train, df_fold_val)
    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

    del(df_fold_train)
    del(df_fold_val)
    gc.collect()

# Test rows get statistics computed from all training rows
df_test = statis_feat(df_train, df_test)
# Training rows (in fold order) first, test rows last
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del(df_stas_feat)
del(df_train)
del(df_test)
gc.collect()

In [28]:
# Label-encode every object-typed column (values are cast to str first)
for f in df_feature.select_dtypes('object'):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

In [29]:
# Keep only the features retained by both selectors in select_feature
df_feature = select_feature(df_feature)

In [30]:
# Split by position: the first len(df_train_score) rows are taken as training rows
# NOTE(review): assumes the labelled rows come first and that no earlier merge
# changed their count -- confirm
df_train = df_feature.iloc[:df_train_score.shape[0]]
df_test = df_feature.iloc[df_train_score.shape[0]:]

In [31]:
# Train on log1p(score); predictions are mapped back with expm1 in the model cells
df_train['score'] = np.log1p(df_train['score'])

In [1]:
def myscore(preds, xgbtrain):
    """Custom xgboost metric: 10 * log10(RMSE) on the original score scale.

    ``preds`` and the labels read from ``xgbtrain.get_label()`` are in log1p
    space; both are mapped back with ``expm1`` before the error is computed.
    Returns the ``(name, value)`` pair.
    """
    actual = np.expm1(xgbtrain.get_label())
    predicted = np.expm1(preds)

    rmse = np.sqrt(mean_squared_error(actual, predicted))
    return 'myrmse', 10 * np.log10(rmse)

# Training configuration
seed = 2008
nfold = 5

ycol = 'score'

# Every column except the target and student_id is a model input
# ('kp_distance' is excluded as well, although no visible cell creates it)
feature_names = list(
    filter(lambda x: x not in [ycol, 'student_id', 'kp_distance'], df_train.columns))

# Submission template; NaN predictions become 0 so the fold outputs can be
# accumulated with += below
test_pred = pd.read_csv(os.path.join(data_path, 'submission_s2.csv'))
test_pred.fillna(0, inplace=True)

kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
score_train_total = 0
score_val_total = 0
df_importance_list = []
for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(df_train)):
    print(
        '\nxgboost Fold_{} Training ================================\n'.format(
            fold_id))
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    model_xgb = xgb.XGBRegressor(random_state=seed,
                                 max_depth=8,
                                 n_estimators=20000,
                                 min_child_weight=300,
                                 colsample_bytree=0.8,
                                 subsample=0.8,
                                 learning_rate=0.01,
                                 reg_alpha=1,
                                 reg_lambda=0.8,
                                 objective="reg:linear")
    # The validation fold is the last eval_set entry; myscore is the metric
    # passed for evaluation and early stopping
    model_xgb.fit(X_train,
                  Y_train,
                  eval_set=[(X_train, Y_train), (X_val, Y_val)],
                  verbose=1000,
                  early_stopping_rounds=500, 
                  eval_metric=myscore)

    pred_val = model_xgb.predict(X_val)
    pred_train = model_xgb.predict(X_train)
    pred_test = model_xgb.predict(df_test[feature_names])

    # Inverse transform: back from log1p space to the original score scale
    pred_val = np.expm1(pred_val)
    pred_train = np.expm1(pred_train)
    pred_test = np.expm1(pred_test)
    Y_val = np.expm1(Y_val)
    Y_train = np.expm1(Y_train)

    score_val = 10 * np.log10(np.sqrt(mean_squared_error(Y_val, pred_val)))
    score_train = 10 * np.log10(np.sqrt(mean_squared_error(Y_train, pred_train)))

    score_train_total += score_train
    score_val_total += score_val
    print('xgb', score_train, score_val)

    # Average the test predictions over the folds; pred_test is a plain array,
    # so this relies on df_test rows being in the submission file's order
    test_pred['pred'] += pred_test / nfold

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': model_xgb.feature_importances_,
    })
    df_importance_list.append(df_importance)

test_pred['pred'] = test_pred['pred'].map(lambda x: round(x, 1))

# Mean feature importance across folds, most important first
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False)

# NOTE(review): the LightGBM cell writes to the same importance.csv path, so
# this file is overwritten when that cell runs
df_importance.to_csv(os.path.join(current_path, 'importance.csv'))

print(df_importance.head(10))
print(df_importance.tail())
print(score_train_total / nfold, score_val_total / nfold)

test_pred.to_csv(os.path.join(current_path, 'xgb.csv'), index=False)

In [33]:
def myscore(preds, xgbtrain):
    """Custom LightGBM metric: 10 * log10(RMSE) on the original score scale.

    Both arguments are arrays in log1p space and are mapped back with
    ``expm1``. The squared error is symmetric, so which of the two holds the
    labels does not change the value. Returns ``(name, value, False)``; the
    final flag is presumably LightGBM's ``is_higher_better`` -- TODO confirm.
    """
    first = np.expm1(preds)
    second = np.expm1(xgbtrain)

    rmse = np.sqrt(mean_squared_error(second, first))
    return 'myrmse', 10 * np.log10(rmse), False

# Training configuration
seed = 2008
nfold = 5

ycol = 'score'

# Every column except the target and student_id is a model input
feature_names = list(
    filter(lambda x: x not in [ycol, 'student_id'], df_train.columns))

# Submission template; NaN predictions become 0 so the fold outputs can be
# accumulated with += below
test_pred = pd.read_csv(os.path.join(data_path, 'submission_s2.csv'))
test_pred.fillna(0, inplace=True)

kfolder = KFold(n_splits=nfold, shuffle=True, random_state=seed)
score_train_total = 0
score_val_total = 0
df_importance_list = []
for fold_id, (trn_idx, val_idx) in enumerate(kfolder.split(df_train)):
    print(
        '\nlightgbm Fold_{} Training ================================\n'.format(
            fold_id))
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    model_lgb = lgb.LGBMRegressor(random_state=seed,
                                  max_depth=8,
                                  n_estimators=10000,
                                  min_child_weight=300,
                                  colsample_bytree=0.8,
                                  subsample=0.8,
                                  learning_rate=0.1,
                                  reg_alpha=1,
                                  reg_lambda=0.8)

    # The validation fold is the last eval_set entry; myscore is the metric
    # passed for evaluation and early stopping
    model_lgb.fit(X_train,
                  Y_train,
                  eval_set=[(X_train, Y_train), (X_val, Y_val)],
                  verbose=1000,
                  early_stopping_rounds=500, 
                  eval_metric=myscore)

    pred_val = model_lgb.predict(X_val)
    pred_train = model_lgb.predict(X_train)
    pred_test = model_lgb.predict(df_test[feature_names])

    # Inverse transform: back from log1p space to the original score scale
    pred_val = np.expm1(pred_val)
    pred_train = np.expm1(pred_train)
    pred_test = np.expm1(pred_test)
    Y_val = np.expm1(Y_val)
    Y_train = np.expm1(Y_train)

    score_val = 10 * np.log10(np.sqrt(mean_squared_error(Y_val, pred_val)))
    score_train = 10 * np.log10(np.sqrt(mean_squared_error(Y_train, pred_train)))

    score_train_total += score_train
    score_val_total += score_val
    print('lgb', score_train, score_val)

    # Average the test predictions over the folds; pred_test is a plain array,
    # so this relies on df_test rows being in the submission file's order
    test_pred['pred'] += pred_test / nfold

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': model_lgb.feature_importances_,
    })
    df_importance_list.append(df_importance)

test_pred['pred'] = test_pred['pred'].map(lambda x: round(x, 1))

# Mean feature importance across folds, most important first
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg('mean').sort_values(ascending=False)

# NOTE(review): the xgboost cell writes its importances to this same
# importance.csv path, so that file is replaced here
df_importance.to_csv(os.path.join(current_path, 'importance.csv'))

print(df_importance.head(10))
print(df_importance.tail())
print(score_train_total / nfold, score_val_total / nfold)

test_pred.to_csv(os.path.join(current_path, 'lgb.csv'), index=False)

In [31]:
# Blend: average the xgboost and LightGBM predictions
df_xgb = pd.read_csv(os.path.join(current_path, 'xgb.csv'))
df_xgb.rename(columns={'pred': 'xgb'}, inplace=True)
df_lgb = pd.read_csv(os.path.join(current_path, 'lgb.csv'))
df_lgb.rename(columns={'pred': 'lgb'}, inplace=True)

# Join on the submission keys explicitly instead of on whatever columns the
# two files happen to share
df_all = df_xgb.merge(df_lgb, on=['student_id', 'course', 'exam_id'], how='left')
df_all['pred'] = (df_all['xgb'] + df_all['lgb']) / 2
df_all = df_all[['student_id', 'course', 'exam_id', 'pred']]
df_all.to_csv(os.path.join(current_path, 'submit.csv'), index=False)