In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from datetime import datetime
from scipy import sparse 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [2]:
# Load the train and test session tables, using the session id column as index.
train_df = pd.read_csv('train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('test_sessions.csv', index_col='session_id')

In [3]:
# Preview the first three training sessions.
train_df.head(3)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,user_id
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,23713,2014-03-24 15:22:40,23720.0,2014-03-24 15:22:48,23713.0,2014-03-24 15:22:48,23713.0,2014-03-24 15:22:54,23720.0,2014-03-24 15:22:54,...,2014-03-24 15:22:55,23713.0,2014-03-24 15:23:01,23713.0,2014-03-24 15:23:03,23713.0,2014-03-24 15:23:04,23713.0,2014-03-24 15:23:05,653
2,8726,2014-04-17 14:25:58,8725.0,2014-04-17 14:25:59,665.0,2014-04-17 14:25:59,8727.0,2014-04-17 14:25:59,45.0,2014-04-17 14:25:59,...,2014-04-17 14:26:01,45.0,2014-04-17 14:26:01,5320.0,2014-04-17 14:26:18,5320.0,2014-04-17 14:26:47,5320.0,2014-04-17 14:26:48,198
3,303,2014-03-21 10:12:24,19.0,2014-03-21 10:12:36,303.0,2014-03-21 10:12:54,303.0,2014-03-21 10:13:01,303.0,2014-03-21 10:13:24,...,2014-03-21 10:13:36,303.0,2014-03-21 10:13:54,309.0,2014-03-21 10:14:01,303.0,2014-03-21 10:14:06,303.0,2014-03-21 10:14:24,34


In [4]:
def convert_to_sparse(ar):
    """Turn a 2-D collection of sessions into a sparse count matrix.

    Every row of ``ar`` is treated as one document whose tokens are the row's
    own elements, so the result has one row per input row and one column per
    distinct value seen in ``ar``; each cell counts how often that value occurs
    in the row.

    Parameters
    ----------
    ar : iterable of iterables
        Presumably the integer site-id array of shape (n_sessions, n_sites)
        built by the callers -- confirm if used elsewhere.

    Returns
    -------
    scipy sparse matrix
        The counts produced by ``CountVectorizer.fit_transform``.
    """
    # A callable analyzer replaces text preprocessing/tokenisation entirely;
    # `list` simply yields the row's elements as tokens.
    return CountVectorizer(analyzer=list).fit_transform(ar)
def get_score(clf,X,y):
    """Print a 3-fold CV accuracy and a hold-out accuracy for ``clf``.

    The data is split 70/30 (stratified by ``y``, fixed seed 17). The mean
    3-fold cross-validation score is computed on the 70% part, then ``clf`` is
    fitted on that part and scored on the remaining 30%.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``score``; it is left fitted on the
        70% part when the function returns.
    X : feature matrix accepted by ``train_test_split``.
    y : target labels aligned with the rows of ``X``.

    Returns
    -------
    None -- both scores are only printed, rounded to three decimals.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, test_size=0.3, random_state=17, stratify=y)

    fold_scores = cross_val_score(clf, X_fit, y_fit, cv=3)
    mean_cv = fold_scores.mean()

    clf.fit(X_fit, y_fit)
    holdout = clf.score(X_hold, y_hold)

    print ("CV_Score: ",round(mean_cv,3))
    print ("Holdout : ",round(holdout,3))
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    """Write predictions to ``out_file`` as a CSV submission.

    Parameters
    ----------
    predicted_labels : array exposing ``.shape`` (e.g. a numpy array); one
        prediction per row.
    out_file : path or buffer handed to ``DataFrame.to_csv``.
    target : name of the prediction column.
    index_label : header of the index column; the index runs 1..n.
    """
    n_rows = predicted_labels.shape[0]
    # Row ids are 1-based, not the 0-based default.
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)
def read_ftr(filename):
    """Return the object stored in the pickle file ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files produced by this project.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

## baseline

In [5]:
def run(train_df,test_df,clf,result_file):
    """Train ``clf`` on bag-of-sites counts and write test predictions.

    Steps: stack train and test sessions, build one sparse count matrix over
    the ten site columns, print CV/hold-out scores on the train part, then
    refit on all train rows and save the test predictions.

    Parameters
    ----------
    train_df : DataFrame with columns ``site1``..``site10`` and ``user_id``.
    test_df : DataFrame with columns ``site1``..``site10``.
    clf : estimator exposing ``fit``/``predict``/``score``; it is refitted here.
    result_file : destination passed to ``write_to_submission_file``.
    """
    site_cols = ['site%d' % i for i in range(1, 11)]

    print ('Загрузка данных')
    train_test_df = pd.concat([train_df, test_df])
    # Missing site slots become the placeholder id 0.
    train_test_df_sites = train_test_df[site_cols].fillna(0).astype('int')

    print('Конвертация в sparse')
    X_train_test_sparse = convert_to_sparse(train_test_df_sites.values)
    # CountVectorizer orders columns by token, so the placeholder 0 (when
    # present) sits in column 0 and must not be used as a feature. Only drop
    # that column if a 0 really occurs; otherwise column 0 is a real site.
    # Assumes real site ids are positive -- TODO confirm.
    if (train_test_df_sites.values == 0).any():
        X_train_test_sparse = X_train_test_sparse[:, 1:]

    n_train = train_df.shape[0]
    X_train_sparse = X_train_test_sparse[:n_train, :]
    X_test_sparse = X_train_test_sparse[n_train:, :]
    y = train_df.user_id.values

    print('Получаем оценку модели на train')
    get_score(clf,X_train_sparse,y)

    print('Записываем предсказания')
    # get_score leaves clf fitted on a 70% split; refit on every train row.
    clf.fit(X_train_sparse,y)
    y_pred = clf.predict(X_test_sparse)
    write_to_submission_file(y_pred,result_file)

In [6]:
# Baseline: logistic-loss SGD on the sessions loaded from train_sessions.csv.
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
run(train_df, test_df, clf, 'result_baseline.csv')

Загрузка данных
Конвертация в sparse
Получаем оценку модели на train
CV_Score:  0.291
Holdout :  0.302
Записываем предсказания


### window_size = 3
base=0.18192

In [7]:
# Same model, trained on the pickled train frame for this window size.
train_df_new = read_ftr('train_3_session.pkl')
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
run(train_df_new, test_df, clf, 'result_3.csv')

Загрузка данных
Конвертация в sparse
Получаем оценку модели на train
CV_Score:  0.343
Holdout :  0.347
Записываем предсказания


### window_size = 5
base=0.17787

In [8]:
# Same model, trained on the pickled train frame for this window size.
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
train_df_new = read_ftr('train_5_session.pkl')
run(train_df_new, test_df, clf, 'result_5.csv')

Загрузка данных
Конвертация в sparse
Получаем оценку модели на train
CV_Score:  0.327
Holdout :  0.339
Записываем предсказания


### window_size = 7

In [9]:
# Same model, trained on the pickled train frame for this window size.
train_df_new = read_ftr('train_7_session.pkl')
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
run(train_df_new, test_df, clf, 'result_7.csv')

Загрузка данных
Конвертация в sparse
Получаем оценку модели на train
CV_Score:  0.32
Holdout :  0.327
Записываем предсказания


### window_size = 10

In [39]:
# Same model, trained on the pickled train frame for this window size.
train_df_new = read_ftr('train_10_session.pkl')
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
run(train_df_new, test_df, clf, 'result_10.csv')

Загрузка данных
Конвертация в sparse
Получаем оценку модели на train
CV_Score:  0.299
Holdout :  0.309
Записываем предсказания


## Feature 1

Функции для генерации новых признаков

In [56]:
def ftr_unique_sites(row):
    """Count the distinct sites in one session row, ignoring empty slots.

    Parameters
    ----------
    row : pandas Series holding ``site1``..``site10``; unused slots are
        expected to be NaN/None.

    Returns
    -------
    int
        Number of different non-missing site values in the row.

    NOTE(review): if a data source pads short sessions with 0 instead of NaN,
    that 0 is still counted as a site -- confirm against the inputs.
    """
    sites=row[['site1', 'site2', 'site3', 'site4','site5', 'site6','site7', 'site8','site9', 'site10']]
    # NaN never equals itself, so a plain unique-count treats every empty slot
    # as its own "site" and inflates short sessions. Drop missing slots first;
    # a set also merges 5 and 5.0, which mixed int/float rows contain.
    return len({site for site in sites if pd.notnull(site)})

def ftr_session_timespan(row):
    """Return the session length in whole seconds.

    The length is the difference between ``time1`` and the last filled
    timestamp, searched from ``time10`` down to ``time1``.

    Parameters
    ----------
    row : mapping/Series with ``time1``..``time10`` holding
        ``'%Y-%m-%d %H:%M:%S'`` strings, or NaN for unused slots.

    Returns
    -------
    int
        Seconds between the first and the last recorded visit.
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    started = datetime.strptime(row['time1'], stamp_format)

    # Walk backwards to the latest slot that holds a value.
    for slot in range(10, 0, -1):
        stamp = row['time' + str(slot)]
        if stamp == stamp:  # only NaN differs from itself
            finished = datetime.strptime(stamp, stamp_format)
            break

    elapsed = finished - started
    return int(elapsed.total_seconds())

def ftr_start_year(row):
    """Return the calendar year of the session start (``row['time1']``)."""
    started = datetime.strptime(row['time1'], '%Y-%m-%d %H:%M:%S')
    return int(started.year)
def ftr_start_month(row):
    """Return the month (1-12) of the session start (``row['time1']``)."""
    started = datetime.strptime(row['time1'], '%Y-%m-%d %H:%M:%S')
    return int(started.month)
def ftr_start_day(row):
    """Return the day of month of the session start (``row['time1']``)."""
    started = datetime.strptime(row['time1'], '%Y-%m-%d %H:%M:%S')
    return int(started.day)
def ftr_start_hour(row):
    """Return the hour (0-23) of the session start (``row['time1']``)."""
    started = datetime.strptime(row['time1'], '%Y-%m-%d %H:%M:%S')
    return int(started.hour)

def ftr_weekday(row):
    """Return the weekday of the session start, Monday=0 .. Sunday=6."""
    started = datetime.strptime(row['time1'], '%Y-%m-%d %H:%M:%S')
    return int(started.weekday())

# Build all engineered features as a DataFrame
def gen_features(df):
    """Return a frame of engineered features, one row per row of ``df``.

    Each ``ftr_*`` function is applied row-wise and stored under its feature
    name; the result shares the index of ``df``.
    """
    feature_builders = [
        ('unique_sites', ftr_unique_sites),
        ('session_timespan', ftr_session_timespan),
        ('start_year', ftr_start_year),
        ('start_month', ftr_start_month),
        ('start_day', ftr_start_day),
        ('start_hour', ftr_start_hour),
        ('weekday', ftr_weekday),
    ]
    newdf = pd.DataFrame(index=df.index)
    for column, builder in feature_builders:
        newdf[column] = df.apply(builder, axis=1)
    return newdf

# Save features to a file
def save_ftr(ftr_df,filename):
    """Pickle ``ftr_df`` into ``filename`` using pickle protocol 2."""
    with open(filename, 'wb') as target:
        pickle.dump(ftr_df, target, protocol=2)

# Read features from a file
# NOTE(review): this redefines the identical read_ftr helper declared earlier
# in the notebook; one of the two definitions can be dropped.
def read_ftr(filename):
    """Return the object unpickled from ``filename``.

    Unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(filename, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

def myfunc1 (row):
    """Compute every engineered feature for one session row in a single pass.

    Parameters
    ----------
    row : pandas Series with ``site1``..``site10`` and ``time1``..``time10``;
        timestamps are ``'%Y-%m-%d %H:%M:%S'`` strings and unused slots are
        NaN.

    Returns
    -------
    pandas.Series
        Indexed by ``unique_sites``, ``session_timespan`` (whole seconds from
        ``time1`` to the last filled timestamp), ``start_year``,
        ``start_month``, ``start_day``, ``start_hour`` and ``weekday``
        (Monday=0).
    """
    stamp_format = '%Y-%m-%d %H:%M:%S'
    time_start = datetime.strptime(row['time1'], stamp_format)

    sites=row[['site1', 'site2', 'site3', 'site4','site5', 'site6','site7', 'site8','site9', 'site10']]
    # NaN never equals itself, so a plain unique-count treats every empty slot
    # as its own "site" and inflates short sessions. Count only filled slots;
    # a set also merges 5 and 5.0, which mixed int/float rows contain.
    n_unique_sites = len({site for site in sites if pd.notnull(site)})

    # The session ends at the latest slot that holds a timestamp.
    for slot in range(10, 0, -1):
        stamp = row['time' + str(slot)]
        if stamp == stamp:  # only NaN differs from itself
            time_end = datetime.strptime(stamp, stamp_format)
            break
    timespan_seconds = int((time_end - time_start).total_seconds())

    # Local names deliberately differ from the module-level ftr_* functions.
    return pd.Series([n_unique_sites, timespan_seconds, time_start.year, time_start.month,
                      time_start.day, time_start.hour, time_start.weekday()],
                     index=['unique_sites','session_timespan','start_year','start_month','start_day','start_hour','weekday'])

# Build all engineered features as a DataFrame
def gen_features2(df):
    """Apply ``myfunc1`` to every row of ``df`` and return the result."""
    return df.apply(myfunc1, axis=1)



Генерация признаков и сохранение их в файл

In [62]:
# NOTE: from here on train_df is the frame loaded from train_3_session.pkl,
# not the one read from train_sessions.csv.
train_df=read_ftr('train_3_session.pkl')
new_train_ftr=gen_features(train_df)
save_ftr(new_train_ftr,'train_features.pkl')

# The next cell reads 'test_features.pkl', which nothing else in the notebook
# writes; generate it here so a fresh Restart & Run All works.
# NOTE(review): assumes the test features come from gen_features(test_df) -- confirm.
new_test_ftr=gen_features(test_df)
save_ftr(new_test_ftr,'test_features.pkl')

Чтение новых признаков из файла

In [63]:
# Reload both cached feature tables and preview the training one.
new_train_ftr, new_test_ftr = (
    read_ftr(path) for path in ('train_features.pkl', 'test_features.pkl')
)
new_train_ftr.head()

Unnamed: 0,unique_sites,session_timespan,start_year,start_month,start_day,start_hour,weekday
272112,5,2630546,2013,3,12,21,1
272115,6,2630255,2013,3,12,21,1
272118,9,1209,2013,4,12,8,4
272121,7,11,2013,4,12,8,4
272124,6,60,2013,4,12,8,4


Добавление признаков к матрице

In [64]:
# Bag-of-sites count matrix for train and test, built the same way as in run().
train_test_df = pd.concat([train_df, test_df])
# Missing site slots become the placeholder id 0.
train_test_df_sites = train_test_df[['site%d' % i for i in range(1, 11)]].fillna(0).astype('int')
X_train_test_sparse=convert_to_sparse(train_test_df_sites.values)
# CountVectorizer orders columns by token, so the placeholder 0 (when present)
# sits in column 0 and must not be used as a feature. Only drop that column if
# a 0 really occurs; otherwise column 0 belongs to a real site.
# Assumes real site ids are positive -- TODO confirm.
if (train_test_df_sites.values == 0).any():
    X_train_test_sparse=X_train_test_sparse[:,1:]
X_train_sparse=X_train_test_sparse[:train_df.shape[0],:]
X_test_sparse = X_train_test_sparse[train_df.shape[0]:,:]
y = train_df.user_id.values

In [77]:
# One-hot encode the calendar features and append them to the site counts.
# The encoder is fitted on train only, so a year/month/day value that occurs
# only in test would make transform() raise with the default settings;
# handle_unknown='ignore' encodes such values as all-zero columns instead.
enc = OneHotEncoder(handle_unknown='ignore')
cols=['start_year','start_month','start_day']
ohe_ftr_train=enc.fit_transform(new_train_ftr[cols].values)
ohe_ftr_test=enc.transform(new_test_ftr[cols].values)
X_train_sparse_ftr=sparse.hstack([X_train_sparse, ohe_ftr_train])
X_test_sparse_ftr=sparse.hstack([X_test_sparse, ohe_ftr_test])

In [78]:
# Score logistic-loss SGD on the current X_train_sparse_ftr matrix.
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
get_score(clf, X_train_sparse_ftr, y)
# Scores recorded for the feature sets tried in this cell:
#   full                                      CV_Score 0.526 / Holdout 0.533
#   'unique_sites','start_hour','weekday'     CV_Score 0.434 / Holdout 0.439
#   'start_year','start_month','start_day'    CV_Score 0.464 / Holdout 0.471

CV_Score:  0.464
Holdout :  0.471


In [74]:
# Refit on every training row, then predict the test set and save the submission.
clf.fit(X_train_sparse_ftr, y)
y_pred = clf.predict(X_test_sparse_ftr)
write_to_submission_file(y_pred, out_file='result_3_ohe.csv')

In [70]:
# Standardise session_timespan (statistics learned on train, reused on test)
# and append it to the site counts as one extra column.
scalar = StandardScaler()
scalar_ftr_train = scalar.fit_transform(new_train_ftr[['session_timespan']].values)
scalar_ftr_test = scalar.transform(new_test_ftr[['session_timespan']].values)
X_train_sparse_ftr = sparse.hstack([X_train_sparse, scalar_ftr_train])
X_test_sparse_ftr = sparse.hstack([X_test_sparse, scalar_ftr_test])



In [71]:
# Score logistic-loss SGD on the current X_train_sparse_ftr matrix.
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
)
get_score(clf, X_train_sparse_ftr, y)

CV_Score:  0.345
Holdout :  0.348


In [81]:
# Combine site counts, one-hot calendar features and the scaled timespan,
# then score an L2-regularised SGD model with alpha=1e-5.
train_blocks = [X_train_sparse, ohe_ftr_train, scalar_ftr_train]
test_blocks = [X_test_sparse, ohe_ftr_test, scalar_ftr_test]
X_train_sparse_ftr = sparse.hstack(train_blocks)
X_test_sparse_ftr = sparse.hstack(test_blocks)
clf = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
    alpha=1e-5,
    penalty='l2',
)
get_score(clf, X_train_sparse_ftr, y)
# Recorded baseline for comparison:
#   baseline    CV_Score 0.526 / Holdout 0.534

CV_Score:  0.528
Holdout :  0.554


In [82]:
# Refit on every training row, then predict the test set and save the submission.
clf.fit(X_train_sparse_ftr, y)
y_pred = clf.predict(X_test_sparse_ftr)
write_to_submission_file(y_pred, out_file='result_3_ohesparce.csv')

Подбор гиперпараметров

In [None]:
# Parameter grid for the GridSearchCV cell below.
# SGDClassifier only uses l1_ratio when penalty='elasticnet', so the penalty
# is pinned inside the grid; with the estimator's default l2 penalty every
# l1_ratio candidate would train the same model.
tuned_parameters = [{'penalty': ['elasticnet'],
                     'l1_ratio': np.linspace(0.1,1,10).tolist()}]

# Alternative grids (uncomment one to search a different hyperparameter):
#tuned_parameters= [{'alpha':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}]

#tuned_parameters= [{'penalty':['none', 'l1', 'l2', 'elasticnet']}]

#tuned_parameters= [{'alpha':10.0**-np.arange(1,7)}]
tuned_parameters

In [None]:
# 3-fold grid search over `tuned_parameters`, followed by a per-candidate report
# of the mean test score and twice its standard deviation across folds.
sgd_logit = SGDClassifier(
    loss='log',
    random_state=17,
    n_jobs=-1,
    alpha=1e-5,
)
clf = GridSearchCV(sgd_logit, tuned_parameters, cv=3)
clf.fit(X_train_sparse_ftr, y)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()

print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
candidates = clf.cv_results_['params']
for mean, std, params in zip(means, stds, candidates):
    spread = std * 2
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, spread, params))
    print()