In [1]:
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook, tnrange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import datetime
import gc
import lightgbm as lgb
import os
DATA_PATH = './datasets/old/'
warnings.filterwarnings("ignore")
%matplotlib inline

In [3]:
# Load the feature table from DATA_PATH and preview its first rows.
_df_data_path = DATA_PATH + 'df_data.csv'
df_data = pd.read_csv(_df_data_path)
df_data.head()

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,is_in_historical,is_in_new_monthlag1,is_in_new_monthlag2,is_test,target,...,new_purchaseAmountMean_2,new_purchaseAmountMax_2,new_purchaseAmountSum2,new_purcahseAmountStd2,new_purcahseAmountCount2,new_installmentsMean2,new_installmentsMax2,new_installmentsSum2,new_installmentsMin2,new_installmentsStd2
0,C_ID_92a2005557,5,2,1,2017-06,1,1,1,0,-0.820283,...,-0.595834,-0.307758,-6.554174,0.12591,11.0,0.0,0.0,0.0,0.0,0.0
1,C_ID_3d0044924f,4,1,0,2017-01,1,1,1,0,0.392913,...,-0.71702,-0.701858,-2.15106,0.016052,3.0,1.0,1.0,3.0,1.0,0.0
2,C_ID_d639edf6cd,2,2,0,2016-08,1,0,1,0,0.688056,...,-0.700326,-0.700326,-0.700326,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,C_ID_186d6a6901,4,3,0,2017-09,1,1,1,0,0.142495,...,-0.67445,-0.56674,-3.37225,0.065935,5.0,0.8,1.0,4.0,0.0,0.447214
4,C_ID_cdbd2c0db2,1,3,0,2017-11,1,1,1,0,-0.159749,...,-0.516017,0.450885,-10.320344,0.269158,20.0,1.05,2.0,21.0,1.0,0.223607


#### FFM

In [14]:
class FFMFormatPandas:
    """Encode a pandas DataFrame as ``label field:feature:value ...`` text lines.

    Columns whose dtype kind is ``'O'`` are treated as categorical: each
    distinct non-null value gets its own feature index and is emitted with
    value ``1``. Every other column is treated as numeric: the column owns a
    single feature index and the cell value is emitted unchanged.

    Attributes:
        field_index_: dict mapping column name -> field index.
        feature_index_: dict mapping ``'<col>_<value>'`` (categorical) or
            ``'<col>'`` (numeric) -> feature index.
        y: name of the label column, or ``None`` when there is no label.
    """

    def __init__(self):
        self.field_index_ = None
        self.feature_index_ = None
        self.y = None

    def fit(self, df, y=None):
        """Build (or extend) the field and feature indices from ``df``.

        Calling ``fit`` again keeps every index already assigned and only
        appends indices for columns / categorical values not seen before.

        Args:
            df: DataFrame holding the feature columns (and the label column
                when ``y`` is given).
            y: name of the label column; it receives no field or feature index.

        Returns:
            self
        """
        self.y = y

        # Fields: one per non-label column, numbered in the order produced by
        # Index.difference; columns first seen on a later fit are appended.
        if self.field_index_ is None:
            self.field_index_ = {}
        for col in df.columns.difference([self.y]):
            if col not in self.field_index_:
                self.field_index_[col] = len(self.field_index_)

        if self.feature_index_ is None:
            self.feature_index_ = {}
        # Continue after the largest index in use so that a re-fit can never
        # hand out an index that is already taken.
        next_idx = max(self.feature_index_.values(), default=-1) + 1

        for col in df.columns:
            if col == self.y:
                # The label is not a feature; indexing its values would only
                # inflate the feature space.
                continue
            if df[col].dtype.kind == 'O':
                # Categorical column: one feature per distinct value.
                for val in df[col].unique():
                    if pd.isnull(val):
                        continue
                    name = '{}_{}'.format(col, val)
                    if name not in self.feature_index_:
                        self.feature_index_[name] = next_idx
                        next_idx += 1
            elif col not in self.feature_index_:
                # Numeric column: a single feature carries the raw value, so
                # individual values need no index of their own.
                self.feature_index_[col] = next_idx
                next_idx += 1
        return self

    def fit_transform(self, df, y=None):
        """Fit the indices on ``df`` and return its encoded lines."""
        self.fit(df, y)
        return self.transform(df)

    def transform_row_(self, row, t):
        """Encode a single row.

        Args:
            row: Series holding one DataFrame row, indexed by column name.
            t: dict mapping column name -> dtype of that column in the frame.

        Returns:
            str: the label (``'0'`` when no label column is set) followed by
            one ``field:feature:value`` token per non-null feature cell.

        Raises:
            KeyError: if a column or categorical value was not seen by ``fit``.
        """
        if self.y is not None:
            ffm = [str(row.loc[self.y])]
        else:
            ffm = [str(0)]

        for col, val in row.to_dict().items():
            if col == self.y:
                continue
            if pd.isnull(val):
                # Missing cells are omitted; fit() does not index nulls either.
                continue
            if t[col].kind == 'O':
                name = '{}_{}'.format(col, val)
                ffm.append('{}:{}:1'.format(self.field_index_[col], self.feature_index_[name]))
            else:
                ffm.append('{}:{}:{}'.format(self.field_index_[col], self.feature_index_[col], val))
        return ' '.join(ffm)

    def transform(self, df):
        """Encode every row of ``df``.

        Returns:
            pd.Series of strings, one per row, carrying ``df``'s index. Rows
            that share an index label are all kept.
        """
        t = df.dtypes.to_dict()
        lines = [self.transform_row_(row, t) for _, row in df.iterrows()]
        return pd.Series(lines, index=df.index, dtype=object)
    
def dtype_to_str(df,categoryCols):
    """Convert each column named in ``categoryCols`` to strings, in place.

    Every value of the listed columns is passed through ``str``; the other
    columns are left alone. The frame that was passed in is modified and
    returned.
    """
    for column_name in categoryCols:
        as_text = df[column_name].map(str)
        df[column_name] = as_text
    return df
print("........start FFM...")

# Number of labelled (is_test == 0) rows. The split further down is purely
# positional, so it assumes df_data lists every labelled row before any test
# row -- TODO confirm against how df_data.csv was built.
length = df_data[df_data.is_test==0].shape[0]

df_data_ffm = df_data.copy()

# 'Unnamed: 0' is the name pandas gives a saved index column when a CSV is read
# back; it is excluded in case it is present. Names missing from the frame are
# simply ignored by the filter below.
dropCols = ['Unnamed: 0','card_id','first_active_month','is_test','is_outlier']
tr_features = [_f for _f in df_data_ffm.columns if _f not in dropCols]
categoryCols = ['feature_1','feature_2','feature_3','active_year','active_month','new_merchant_id_frequenceMax',
                'histAuth_N_merchant_id_frequenceMax','histAuth_Y_merchant_id_frequenceMax']
numCols = [_f for _f in tr_features if _f not in categoryCols]
# 'target' is numeric but it is the label: it must reach the FFM file unscaled.
scaleCols = [_f for _f in numCols if _f != 'target']

df_data_ffm = dtype_to_str(df_data_ffm,categoryCols)
# .copy() so the column assignment below writes to an independent frame.
df_data_ffm = df_data_ffm[tr_features].copy()

# Min-max scale each numeric feature: (x - min) / (max - min).
col_min = df_data_ffm[scaleCols].min()
col_range = df_data_ffm[scaleCols].max() - col_min
# A constant column has range 0; divide by 1 instead so it becomes all zeros
# rather than NaN.
col_range = col_range.replace(0, 1)
df_data_ffm[scaleCols] = (df_data_ffm[scaleCols] - col_min) / col_range

ffm_utils = FFMFormatPandas()
ffm_data = ffm_utils.fit_transform(df_data_ffm,y='target')

# header=False: otherwise the file starts with a header line, which shifts the
# positional split below by one and puts a non-FFM line into the train file.
ffm_data.to_csv(DATA_PATH+'ffm_data.csv',index=False,header=False)
print('......done！....')

# Save the splits: the first 70% of the labelled rows go to train, the remaining
# labelled rows to validation, and every row after them to test.
index_train = int(0.7*length)
# One `with` owns all four handles so each output file is flushed and closed,
# even if writing fails part-way.
with open(DATA_PATH+'ffm_data.csv') as fin, \
        open(DATA_PATH+'df_ffm_train.csv','w') as df_ffm_train, \
        open(DATA_PATH+'df_ffm_val.csv','w') as df_ffm_val, \
        open(DATA_PATH+'df_ffm_test.csv','w') as df_ffm_test:
    for (i,line) in enumerate(fin):
        if i<index_train:
            df_ffm_train.write(line)
        elif i<length:
            df_ffm_val.write(line)
        else:
            df_ffm_test.write(line)

........start FFM...
......done！....


#### FFM Training

In [None]:
from sklearn import preprocessing
import xlearn as xl

# Train a field-aware factorization machine on the exported FFM files, predict
# the test split, and write a submission file.
MODEL_PATH = './ffm_model.out'
PREDICT_PATH = './submission/ffm_predict.txt'

ffm_model = xl.create_ffm()
ffm_model.setTrain(DATA_PATH+'df_ffm_train.csv')
ffm_model.setValidate(DATA_PATH+'df_ffm_val.csv')
ffm_model.setTest(DATA_PATH+'df_ffm_test.csv')

params = {'task':'reg','lr':0.01,'metric':'rmse','epoch':200,'fold':5,'opt':'ftrl','k':5}
# Cross-validation run.
ffm_model.cv(params)
# predict() below reads MODEL_PATH, so fit() has to run first to write it;
# otherwise the prediction uses a missing or stale model file.
ffm_model.fit(params, MODEL_PATH)

# Make sure the output directory exists before anything is written into it.
os.makedirs('./submission', exist_ok=True)
ffm_model.predict(MODEL_PATH, PREDICT_PATH)
ffm_model.show()

# create submission file
# NOTE(review): predictions are attached by position, so this assumes the test
# rows were exported in the same order as sample_submission.csv -- confirm.
df_sub = pd.read_csv('./datasets/sample_submission.csv')
ffm_predictions = np.loadtxt(PREDICT_PATH)
if len(ffm_predictions) != len(df_sub):
    raise ValueError('got {} predictions for {} submission rows'.format(len(ffm_predictions), len(df_sub)))
df_sub['target'] = ffm_predictions
# Forward-fill any missing values; ffill() replaces the deprecated
# fillna(method='ffill').
df_sub = df_sub.ffill()
df_sub.to_csv('./submission/df_ffm_submission.csv', index=False)

In [None]:
# from sklearn import preprocessing
# import xlearn as xl
# def df_to_ffm(df,tr_features):
#     field_dict = dict(zip(tr_features,range(len(tr_features))))
#     ffm = pd.DataFrame()
#     idx = 0
#     t = df.dtypes.to_dict()
#     for col in tr_features:
#         col_type = t[col]
#         if col_type.kind ==  'O':  ##category数据
#             col_value = df[col].unique()
#             feat_dict = dict(zip(col_value,range(idx,idx+len(col_value))))
#             se = df[col].apply(lambda x: (field_dict[col],feat_dict[x],1))
#             ffm = pd.concat([ffm,se],axis=1)
#             idx += len(col_value)
#         else:               ##数值型数据
#             min_max_scaler = preprocessing.MinMaxScaler()   ##归一化处理
#             df[col] = min_max_scaler.fit_transform(df[col].values.reshape(-1,1))
            
#             si = df[col].apply(lambda x: (field_dict[col],field_dict[col],x))
#             ffm = pd.concat([ffm,si],axis=1)
#     return ffm

# def dtype_to_str(df,categoryCols):
#     for col in categoryCols:
#         df[col] = df[col].map(str)
#     return df
# print("start FFM...")
# len_train = df_data[df_data.is_test==0].shape[0]
# len_test = df_data[df_data.is_test==1].shape[0]

# df_data_ffm = df_data.copy()
# df_data_ffm.drop(columns=['target'],inplace=True)
# df_train_y = df_data[df_data.is_test==0]['target'].values

# dropCols = ['card_id','first_active_month','is_test','targFet','is_outlier']
# tr_features = [_f for _f in df_data_ffm.columns if _f not in dropCols]
# categoryCols = ['feature_1','feature_2','feature_3','start_year','start_month','feature_1_2','merchant_category_id_frequenceMax_byCardId'
#                ,'category_3_frequenceMax_byCardId','category_2_frequenceMax_byCardId','city_id_frequenceMax_byCardId',
#                'category_1_frequenceMax_byCardId','authorized_flag_frequenceMax_byCardId','monthLageFrequenceMax_byCardId']
# df_data_ffm = dtype_to_str(df_data_ffm,categoryCols)
# df_data_ffm = df_to_ffm(df_data_ffm,tr_features)