In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

#
# Dataset fields
#
numeric_features = ["if"+str(i) for i in range(1,14)]
#categorical_features = ["cf"+str(i) for i in range(1,27)] + ["day_number"]

#fields = ["id", "label"] + numeric_features + categorical_features
fields = ["id", "label"] + numeric_features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
#    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ]
 )

# Now we have a full prediction pipeline.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('logisticregression', LogisticRegression())
])

In [32]:
import os, sys
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from joblib import dump

#
# Read dataset
#
train_path = '/home/users/datasets/criteo/criteo_train1'

read_table_opts = dict(sep="\t", names=fields, index_col=False)
df = pd.read_table(train_path, **read_table_opts)

print(list(df.columns))

['id', 'label', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12', 'if13']


In [11]:
df.head()

Unnamed: 0,id,label,if1,if2,if3,if4,if5,if6,if7,if8,if9,if10,if11,if12,if13
0,0,0,,600.0,1.0,342.0,9.0,0.0,0.0,18,1,0.0,4.0,27648.0,1.0
1,2,0,3.0,239.0,3.0,,8.0,0.0,0.0,139,0,0.0,2.0,27.0,3.0
2,4,0,3.0,124.0,39.0,398.0,3.0,0.0,0.0,1,40,0.0,1.0,21612.0,39.0
3,6,0,12.0,85.0,4.0,59.0,6.0,0.0,0.0,7,1,0.0,2.0,5878.0,4.0
4,8,0,,2.0,,,,,,-1,0,,,6042.0,


In [12]:
df.tail()

Unnamed: 0,id,label,if1,if2,if3,if4,if5,if6,if7,if8,if9,if10,if11,if12,if13
5014585,10029170,0,19.0,80.0,6.0,20.0,,1.0,0.0,57,4,1.0,,1016.0,6.0
5014586,10029172,0,16.0,379.0,3.0,,,0.0,0.0,93,7,0.0,,110954.0,3.0
5014587,10029174,0,,914.0,2.0,157.0,1.0,0.0,0.0,0,30,0.0,1.0,4289.0,2.0
5014588,10029176,0,7.0,175.0,8.0,8.0,1.0,2.0,2.0,185,7,1.0,1.0,54.0,8.0
5014589,10029178,0,,499.0,13.0,613.0,22.0,0.0,0.0,1,41,0.0,10.0,1172.0,13.0


In [13]:
5014589*2

10029178

In [33]:
from joblib import load

In [34]:
model = load("/home/users/Daniil-Selikhanovych/1.joblib")

#fields = """doc_id,hotel_name,hotel_url,street,city,state,country,zip,class,price,
#num_reviews,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,COMFORT,overall_ratingsource""".replace("\n",'').split(",")

#read and infere
read_opts=dict(
        sep='\t', names=fields, index_col=False, header=None,
        iterator=True, chunksize=100
)

In [20]:
val_path = '/home/users/datasets/criteo/criteo_valid1_labels'

In [21]:
df = pd.read_csv(val_path, **read_opts)

In [27]:
numeric_features = ["if"+str(i) for i in range(1,14)]
valid_fields = ["id"] + numeric_features

In [30]:
#from model import valid_fields

#
# Init the logger
#
#logging.basicConfig(level=logging.DEBUG)
#logging.info("CURRENT_DIR {}".format(os.getcwd()))
#logging.info("SCRIPT CALLED AS {}".format(sys.argv[0]))
#logging.info("ARGS {}".format(sys.argv[1:]))

#load the model
model = load("/home/users/Daniil-Selikhanovych/ozon-masters-bigdata/1.joblib")

#fields = """doc_id,hotel_name,hotel_url,street,city,state,country,zip,class,price,
#num_reviews,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,COMFORT,overall_ratingsource""".replace("\n",'').split(",")

#read and infere
read_opts=dict(
        sep='\t', names=valid_fields, index_col=False, header=None,
        iterator=True, chunksize=100
)

for df in pd.read_csv(train_path, **read_opts):
    print(df.head())
    pred = model.predict_proba(df)
    out = zip(df.id, pred[:, 1])
    print("\n".join(["{0},{1}".format(*i) for i in out]))



   id  if1   if2    if3   if4    if5  if6  if7  if8  if9  if10  if11  if12  \
0   0    0   NaN  600.0   1.0  342.0  9.0  0.0  0.0   18     1   0.0   4.0   
1   2    0   3.0  239.0   3.0    NaN  8.0  0.0  0.0  139     0   0.0   2.0   
2   4    0   3.0  124.0  39.0  398.0  3.0  0.0  0.0    1    40   0.0   1.0   
3   6    0  12.0   85.0   4.0   59.0  6.0  0.0  0.0    7     1   0.0   2.0   
4   8    0   NaN    2.0   NaN    NaN  NaN  NaN  NaN   -1     0   NaN   NaN   

      if13  
0  27648.0  
1     27.0  
2  21612.0  
3   5878.0  
4   6042.0  
0,0.0
2,0.07997970868964606
4,0.0
6,6.562319458582365e-109
8,2.275425527980783e-112
10,1.8432066690698835e-156
12,0.7600686403241614
14,2.186197771210604e-75
16,0.0590321385474242
18,0.0
20,0.0
22,8.96410411966104e-27
24,1.795941001668492e-125
26,2.3685015245600493e-12
28,3.3756902261917714e-55
30,0.0
32,0.0
34,0.0
36,1.3234416627924655e-73
38,7.268015816100891e-127
40,1.5192865532348336e-06
42,0.7604775608080774
44,0.006128025029576899
46,3.1466012

        id  if1    if2     if3   if4   if5  if6  if7  if8  if9  if10  if11  \
1100  2200    0   29.0  5478.0   0.0   NaN  NaN  0.0  0.0   40    20   0.0   
1101  2202    0    9.0    48.0   1.0  18.0  6.0  0.0  0.0   81     0   0.0   
1102  2204    0    NaN   585.0   2.0  50.0  NaN  7.0  0.0    1    41   1.0   
1103  2206    0    8.0   203.0   NaN   NaN  3.0  0.0  0.0  106     1   0.0   
1104  2208    0  269.0   657.0  26.0   NaN  NaN  0.0  0.0    7     0   0.0   

      if12     if13  
1100   NaN      4.0  
1101   1.0   4350.0  
1102   NaN    851.0  
1103   3.0  16236.0  
1104   NaN   7109.0  
2200,1.0
2202,2.143369990221809e-81
2204,1.9650250726865013e-08
2206,1.7683594557252516e-297
2208,3.389610290059806e-129
2210,1.40479138521269e-20
2212,1.1226200658600185e-13
2214,0.9302063363067565
2216,3.7031674620701666e-249
2218,0.0
2220,3.504107109103777e-179
2222,0.0
2224,2.921456659430227e-27
2226,8.13568454654276e-37
2228,0.0
2230,0.9997314880005101
2232,0.8642734707484215
2234,0.0
2236,0

        id  if1  if2    if3  if4   if5  if6  if7  if8   if9  if10  if11  if12  \
2200  4400    0  8.0  113.0  8.0  20.0  1.0  1.0  0.0    35    23   1.0   1.0   
2201  4402    0  4.0  245.0  2.0  72.0  3.0  0.0  0.0     0     2   0.0   2.0   
2202  4404    0  NaN  401.0  3.0  70.0  1.0  0.0  0.0  1824    11   0.0   1.0   
2203  4406    0  2.0   94.0  1.0  45.0  1.0  0.0  0.0     6     2   0.0   1.0   
2204  4408    0  9.0  187.0  0.0  43.0  1.0  0.0  0.0     0    16   0.0   1.0   

         if13  
2200    887.0  
2201   5132.0  
2202   2689.0  
2203  18650.0  
2204  11656.0  
4400,4.897829933661272e-14
4402,1.7780334685899142e-94
4404,2.198158321585917e-49
4406,0.0
4408,2.213375639844281e-211
4410,0.04988383228960045
4412,0.0
4414,0.9999969734576952
4416,0.43242047721986115
4418,1.156634680840818e-131
4420,0.03669010354319092
4422,0.0
4424,4.966369380041537e-108
4426,2.4825537285681986e-05
4428,1.1090529719007034e-86
4430,0.7513512563845199
4432,2.8234137764148375e-08
4434,0.9999637556

        id  if1   if2     if3   if4   if5   if6  if7  if8  if9  if10  if11  \
3300  6600    0  20.0  1831.0  13.0   NaN   NaN  0.0  0.0    0    17   0.0   
3301  6602    0   6.0  1145.0   NaN   NaN   5.0  0.0  0.0  138     2   0.0   
3302  6604    0  14.0    67.0   1.0   NaN   NaN  0.0  0.0  144     1   0.0   
3303  6606    0   NaN   342.0   NaN  64.0  34.0  NaN  0.0    1     0   NaN   
3304  6608    0  19.0    60.0  17.0  43.0   NaN  4.0  0.0    2    19   1.0   

      if12     if13  
3300   NaN   1235.0  
3301   3.0   7354.0  
3302   NaN  63544.0  
3303  12.0   8619.0  
3304   NaN    915.0  
6600,2.3434481684532834e-14
6602,1.9407763323120888e-131
6604,0.0
6606,5.058959200288516e-158
6608,1.9938829353552527e-15
6610,0.9848757644776374
6612,8.463779830108551e-295
6614,5.433169207076545e-124
6616,0.9999886516277776
6618,9.467140558672622e-37
6620,5.17136594346733e-103
6622,3.6684157634762816e-57
6624,1.2149053526475133e-36
6626,0.0
6628,4.815569778487203e-136
6630,1.7400627007363679e-1

        id  if1   if2    if3  if4    if5  if6  if7  if8  if9  if10  if11  \
4400  8800    0   2.0  148.0  NaN    NaN  NaN  0.0  0.0   14     7   0.0   
4401  8802    0   7.0    4.0  1.0    1.0  1.0  1.0  0.0    1     2   1.0   
4402  8804    0  18.0  608.0  7.0  582.0  2.0  0.0  0.0   14    22   0.0   
4403  8806    0   5.0   21.0  NaN   25.0  2.0  NaN  NaN    1     0   NaN   
4404  8808    0  23.0  118.0  4.0    NaN  NaN  0.0  0.0   42     0   0.0   

      if12     if13  
4400   NaN  29276.0  
4401   1.0    586.0  
4402   2.0  13325.0  
4403   1.0  41214.0  
4404   NaN  10419.0  
8800,0.0
8802,1.7888688845763426e-12
8804,2.2409675651268584e-239
8806,0.0
8808,8.336949193324453e-192
8810,4.2797982658211716e-26
8812,2.8338841746951637e-28
8814,0.0
8816,0.9999579428836965
8818,1.1692804964586833e-05
8820,0.03494822544326728
8822,1.8334978955125886e-110
8824,0.0
8826,1.8949943204519964e-51
8828,2.368102869134669e-123
8830,1.5636335868994542e-22
8832,0.9997038750248761
8834,0.0
8836,2.2016

         id  if1    if2     if3  if4    if5    if6  if7  if8   if9  if10  \
5500  11000    0    NaN   166.0  NaN   18.0    8.0  NaN  0.0    -1     0   
5501  11002    0    9.0   191.0  NaN   83.0    1.0  0.0  0.0  1320     5   
5502  11004    0    6.0     9.0  1.0    4.0   14.0  0.0  0.0    36     4   
5503  11006    0  139.0  1705.0  5.0  386.0  163.0  0.0  0.0     0     6   
5504  11008    0    2.0  1905.0  1.0    NaN    NaN  0.0  0.0    77     1   

      if11  if12    if13  
5500   NaN   2.0  3981.0  
5501   0.0   1.0  8917.0  
5502   0.0   1.0  1565.0  
5503   0.0  16.0  6402.0  
5504   0.0   NaN  2004.0  
11000,4.260825752922526e-74
11002,1.9542857869525492e-164
11004,5.0455328620470666e-30
11006,3.63334557208694e-111
11008,7.773608585985622e-31
11010,0.0004593505025965247
11012,5.034492216478019e-78
11014,0.00034190125669744767
11016,0.9991935740666158
11018,6.367540853852985e-153
11020,0.0
11022,0.0
11024,2.2374331554838412e-85
11026,1.8869087924735328e-265
11028,0.0
11030,0.0


         id  if1   if2    if3   if4   if5  if6   if7  if8  if9  if10  if11  \
6600  13200    0  39.0    NaN  40.0  50.0  NaN   3.0  0.0    2    45   1.0   
6601  13202    0  13.0    NaN   0.0   0.0  NaN  14.0  0.0  967    23   3.0   
6602  13204    0  28.0  276.0  12.0   NaN  7.0   0.0  0.0   77     4   0.0   
6603  13206    0  58.0    NaN   3.0   NaN  NaN   0.0  0.0    2     0   0.0   
6604  13208    0   9.0   18.0   NaN  56.0  6.0   NaN  0.0   25     0   NaN   

      if12     if13  
6600   NaN    640.0  
6601   NaN     17.0  
6602   2.0   5751.0  
6603   NaN   1473.0  
6604   3.0  18149.0  
13200,2.7856486205826314e-06
13202,0.953089915003696
13204,2.3913269502124775e-105
13206,5.139603021605601e-29
13208,0.0
13210,0.11420487123136737
13212,4.0373216850491997e-56
13214,0.0
13216,0.0
13218,0.0
13220,1.7362942695233146e-115
13222,2.1445882718993252e-102
13224,0.003033632723858193
13226,1.3119401086937967e-118
13228,9.145565052845413e-94
13230,4.857915459313331e-215
13232,0.0
13234,0.0

         id  if1   if2    if3   if4    if5    if6  if7  if8  if9  if10  if11  \
7700  15400    0   NaN  181.0   NaN  113.0    5.0  0.0  0.0   -1     1   0.0   
7701  15402    0   4.0    NaN   NaN    NaN    NaN  0.0  0.0    0     0   0.0   
7702  15404    0   1.0   87.0   NaN  108.0    3.0  NaN  0.0    0     0   NaN   
7703  15406    0  82.0  137.0  27.0    NaN    NaN  0.0  0.0    8    35   0.0   
7704  15408    0   6.0  873.0   NaN    NaN  100.0  0.0  0.0    4    29   0.0   

      if12     if13  
7700   1.0  15017.0  
7701   NaN   3067.0  
7702   1.0   3761.0  
7703   NaN   9390.0  
7704  17.0    410.0  
15400,2.7207004452234545e-275
15402,4.236338492486997e-58
15404,1.930453517030193e-70
15406,4.685247019365053e-167
15408,0.2505492397588687
15410,0.0
15412,0.0
15414,0.9889793806309979
15416,0.0
15418,8.924528481383357e-18
15420,5.229965716670612e-184
15422,0.9982221226271905
15424,5.051429257440527e-243
15426,0.00042771012709722915
15428,1.301658398702563e-153
15430,4.189852496438405

         id  if1   if2     if3   if4   if5   if6  if7  if8  if9  if10  if11  \
8800  17600    0   4.0  1084.0   1.0   8.0  11.0  1.0  0.0    0     1   1.0   
8801  17602    1   1.0    35.0   5.0  11.0   4.0  0.0  0.0    1     3   0.0   
8802  17604    0   NaN   134.0   0.0   NaN   NaN  NaN  NaN   39     0   NaN   
8803  17606    0   NaN     NaN   2.0   3.0   NaN  2.0  0.0  312     4   0.0   
8804  17608    0  78.0   405.0  26.0   NaN   2.0  0.0  0.0    3    28   0.0   

      if12   if13  
8800   5.0      4  
8801   1.0   7129  
8802   NaN     17  
8803   NaN   1270  
8804   2.0  21597  
17600,0.9992281670677441
17602,1.419599767522812e-131
17604,0.05854060325046553
17606,6.371142233439307e-25
17608,0.0
17610,0.00024892180338058226
17612,0.0
17614,0.9292145197501172
17616,1.8699852177569712e-14
17618,1.1148815798721658e-30
17620,5.446975496469527e-152
17622,6.300940560223284e-54
17624,3.836366685313546e-131
17626,1.2956664509998837e-08
17628,6.983096300987265e-81
17630,0.98435541988440

19600,4.156115027971526e-26
19602,7.212146883194845e-134
19604,4.346081249918888e-09
19606,3.0918221932281077e-50
19608,0.0
19610,1.6647826240351685e-38
19612,1.1764565777830432e-134
19614,2.8994972953707257e-13
19616,0.0
19618,6.6120687576719245e-211
19620,0.9999999952799403
19622,0.03875934519059703
19624,4.573182287471186e-11
19626,3.916494519575242e-33
19628,3.816380896222757e-224
19630,1.2344808098973666e-126
19632,6.665080862378717e-16
19634,0.5643957907717544
19636,0.03956416587961387
19638,3.689602033364242e-220
19640,1.1271936324353213e-144
19642,5.8945059685550186e-304
19644,0.0
19646,0.9734166205273791
19648,0.03698160435215608
19650,0.9999223858795542
19652,8.1417301027789e-23
19654,5.51101782432527e-21
19656,1.8084354090145257e-24
19658,3.8868133892461545e-45
19660,3.3854221713563635e-54
19662,5.861284833756341e-26
19664,2.4319391321989582e-26
19666,3.839564300174405e-161
19668,0.00032332105999697164
19670,3.5507526482513236e-29
19672,0.39007551059983936
19674,2.1410093707

          id  if1   if2    if3  if4   if5  if6  if7  if8   if9  if10  if11  \
10900  21800    0  33.0   43.0  1.0   NaN  NaN  0.0  0.0  1675     0   0.0   
10901  21802    0   8.0   21.0  3.0  24.0  1.0  0.0  0.0     0     3   0.0   
10902  21804    0  35.0    NaN  1.0   NaN  NaN  0.0  0.0     0     1   0.0   
10903  21806    0  10.0  257.0  1.0   NaN  NaN  0.0  0.0    93    14   0.0   
10904  21808    0  19.0    NaN  9.0   NaN  NaN  0.0  0.0   283     2   0.0   

       if12      if13  
10900   NaN    9207.0  
10901   1.0    4284.0  
10902   NaN     169.0  
10903   NaN  198995.0  
10904   NaN    4245.0  
21800,1.8917255208695014e-171
21802,9.936683670731032e-80
21804,4.716381189205655e-05
21806,0.0
21808,1.667215842564787e-79
21810,2.345870866935106e-209
21812,0.0
21814,0.099388958927505
21816,9.442297759620425e-105
21818,6.613602954684299e-51
21820,4.817485911407729e-59
21822,0.06427384458068362
21824,0.0
21826,3.887727778376076e-279
21828,0.9683481685326495
21830,0.0
21832,2.4483164

          id  if1   if2    if3   if4   if5   if6  if7  if8  if9  if10  if11  \
12000  24000    0   NaN    3.0   NaN   NaN   NaN  NaN  NaN   -1    10   NaN   
12001  24002    0   3.0   83.0   1.0   NaN   3.0  0.0  0.0    0     1   0.0   
12002  24004    0  81.0  437.0   2.0   NaN   NaN  0.0  0.0    0     0   0.0   
12003  24006    0  53.0  787.0  36.0   NaN   4.0  0.0  0.0   21    48   0.0   
12004  24008    0   2.0   33.0   1.0  37.0  23.0  0.0  0.0   54     1   0.0   

       if12   if13  
12000   NaN   1352  
12001   2.0  24984  
12002   NaN   7629  
12003   3.0   4638  
12004   8.0   3548  
24000,4.0680077458249098e-25
24002,0.0
24004,1.4552489937242633e-139
24006,1.5910414190952808e-75
24008,1.2187852391618997e-66
24010,6.238547914330113e-09
24012,0.0
24014,0.0
24016,2.2633664951226168e-110
24018,0.0
24020,0.6828857614139731
24022,2.966502075779923e-215
24024,6.275039598318053e-116
24026,1.0912596451440828e-10
24028,0.9939407009689691
24030,0.9999884346693607
24032,2.29815015505893

          id  if1   if2     if3   if4   if5  if6  if7  if8  if9  if10  if11  \
13100  26200    0   NaN   359.0   2.0   NaN  NaN  0.0  0.0    0     3   0.0   
13101  26202    0   1.0  1073.0  12.0  26.0  NaN  0.0  0.0   17    27   0.0   
13102  26204    0   1.0     4.0   3.0   1.0  4.0  2.0  0.0   21     4   1.0   
13103  26206    0   NaN   250.0   NaN   NaN  NaN  0.0  0.0    1     0   0.0   
13104  26208    0  56.0     NaN  26.0  26.0  NaN  5.0  0.0   60     0   1.0   

       if12     if13  
13100   NaN  57186.0  
13101   NaN   1297.0  
13102   1.0      0.0  
13103   NaN   5669.0  
13104   NaN     50.0  
26200,0.0
26202,6.276227665766954e-17
26204,0.15430482408932783
26206,1.4073354993390672e-104
26208,0.004118734553238371
26210,0.0
26212,2.0862477015134325e-165
26214,0.48576749633363064
26216,0.0
26218,4.5075393701797425e-93
26220,2.768960468887267e-76
26222,0.47823231189822446
26224,1.6727894688963116e-182
26226,2.2971876460133948e-20
26228,6.881565516898846e-08
26230,0.0
26232,3.02

          id  if1   if2    if3   if4   if5   if6  if7  if8   if9  if10  if11  \
14200  28400    0   5.0   17.0   2.0  25.0   1.0  0.0  0.0  2108     3   0.0   
14201  28402    0   1.0  321.0   NaN   NaN   NaN  0.0  0.0    11    10   0.0   
14202  28404    0   NaN  388.0   2.0   0.0  67.0  6.0  3.0   206    15   2.0   
14203  28406    0  45.0    NaN  30.0  32.0   NaN  3.0  0.0    82    37   0.0   
14204  28408    0   7.0   31.0   2.0  31.0   1.0  0.0  0.0     0     4   0.0   

       if12     if13  
14200   1.0  14349.0  
14201   NaN  40163.0  
14202  31.0      2.0  
14203   NaN   1268.0  
14204   1.0  10416.0  
28400,2.950360597655342e-265
28402,0.0
28404,0.9966752561353333
28406,3.721253718333628e-19
28408,2.105621485331345e-191
28410,3.575589745587556e-18
28412,0.0
28414,0.6978317409912428
28416,1.637190426833776e-75
28418,2.7706563543187913e-51
28420,4.6589237004871625e-160
28422,9.114870964594592e-125
28424,3.606839240989249e-07
28426,3.280886109624337e-05
28428,5.0064414500462985e

          id  if1   if2     if3   if4     if5   if6  if7  if8  if9  if10  \
15300  30600    0   5.0   713.0   2.0  2685.0  11.0  0.0  0.0    0    29   
15301  30602    0  49.0  1189.0  12.0    21.0  28.0  0.0  0.0   46    12   
15302  30604    1   2.0     NaN   1.0     NaN   NaN  0.0  0.0  172     0   
15303  30606    0  39.0   401.0   2.0     NaN   2.0  0.0  0.0   54     5   
15304  30608    0   2.0   271.0   2.0     NaN  66.0  0.0  0.0    5     0   

       if11  if12    if13  
15300   0.0   3.0   980.0  
15301   0.0   5.0  2234.0  
15302   0.0   NaN     1.0  
15303   0.0   1.0  9764.0  
15304   0.0   8.0     0.0  
30600,1.610122618919954e-13
30602,4.347936276950108e-36
30604,0.025605235164841735
30606,7.255146885586372e-178
30608,0.3251880283338953
30610,2.1735456195694e-184
30612,0.0
30614,5.275011785082715e-18
30616,0.0
30618,0.9999999999943645
30620,0.035391788453273225
30622,0.0
30624,2.774694730031342e-293
30626,6.527440867239687e-10
30628,0.0
30630,0.9986766834324806
30632,0.9

          id  if1   if2     if3  if4   if5   if6  if7  if8  if9  if10  if11  \
16400  32800    0  29.0     3.0  2.0   2.0   1.0  1.0  0.0    4     5   1.0   
16401  32802    0   1.0   202.0  1.0   NaN   NaN  0.0  0.0    4     1   0.0   
16402  32804    0   1.0  1019.0  8.0  43.0  66.0  0.0  0.0    1    42   0.0   
16403  32806    0  42.0     2.0  2.0  37.0   1.0  0.0  0.0    3     6   0.0   
16404  32808    0  12.0     NaN  6.0   NaN   NaN  0.0  0.0   22     0   0.0   

       if12    if13  
16400   1.0       1  
16401   NaN  241400  
16402   9.0    5326  
16403   1.0    2232  
16404   NaN      21  
32800,0.20868028770768632
32802,0.0
32804,3.963725822107139e-88
32806,7.113205174376331e-42
32808,0.015138400624778236
32810,2.3162742496531715e-77
32812,1.26852070444086e-08
32814,0.028451868635687237
32816,9.384576872542452e-241
32818,7.990998331058893e-72
32820,0.6636942530119846
32822,1.2428185486296694e-24
32824,2.3183233432659316e-87
32826,6.849453915404275e-157
32828,0.03687946667185

KeyboardInterrupt: 

In [19]:
!ls /home/users/datasets/criteo

criteo_train0	criteo_train20a       criteo_valid_large_filtered_labels
criteo_train1	criteo_train500
criteo_train20	criteo_valid1_labels


In [35]:
valid_fields = ["id"] + numeric_features

In [39]:
outfields = valid_fields

path_to_valid_labels = '/home/users/datasets/criteo/criteo_valid1_labels'

with open(path_to_valid_labels) as f:
    line = f.readline()
    print(line)
    line = f.readline()
    print(line)
    line = f.readline()
    print(line)
    # skip header
    
    #if line.startswith(valid_fields[0]):
    #    continue

    #unpack into a tuple/dict
    #values = line.rstrip().split('\t')
    #record = dict(zip(valid_fields, values)) 

    #apply filter conditions
    #if filter_cond(record):
    #    output = "\t".join([record[x] for x in outfields])
    #    print(output)


0	0

1	0

2	0



In [40]:
path_to_valid_feat = '/home/users/datasets/criteo/criteo_valid_large_filtered_labels'

with open(path_to_valid_feat) as f:
    line = f.readline()
    print(line)
    line = f.readline()
    print(line)
    line = f.readline()
    print(line)
    # skip header

75779264	1

75779276	1

75779283	1



In [41]:
path_to_filter_feat = '/home/users/Daniil-Selikhanovych/hw1/pred_with_filter/part-00000'

In [42]:
model = load("/home/users/Daniil-Selikhanovych/hw1/1.joblib")

#fields = """doc_id,hotel_name,hotel_url,street,city,state,country,zip,class,price,
#num_reviews,CLEANLINESS,ROOM,SERVICE,LOCATION,VALUE,COMFORT,overall_ratingsource""".replace("\n",'').split(",")

#read and infere
read_opts=dict(
        sep='\t', names=valid_fields, index_col=False, header=None,
        iterator=True, chunksize=100
)

for df in pd.read_csv(path_to_filter_feat, **read_opts):
    print(df.head())
    pred = model.predict_proba(df)
    out = zip(df.id, pred[:, 1])
    print("\n".join(["{0},{1}".format(*i) for i in out]))
    break

         id  if1    if2   if3    if4   if5  if6  if7   if8  if9  if10  if11  \
0  75779264   37  433.0   0.0    NaN   3.0    0    0  1636    0     0   1.0   
1  75779276   23  576.0  15.0    NaN   NaN    0    0     1   48     0   NaN   
2  75779283   23    3.0   2.0   12.0   NaN    4    0    91    0     1   NaN   
3  75779291   37  323.0  28.0  185.0  76.0    0    0     1   32     0   7.0   
4  75779318   32  187.0   3.0    NaN   7.0    0    0     0    7     0   1.0   

    if12  if13  
0  24822   1.0  
1    106  15.0  
2    734   7.0  
3   5384  28.0  
4  38787   3.0  
75779264,0.03627507857075537
75779276,0.020328522197213685
75779283,0.04301123940843566
75779291,0.014003608672974264
75779318,0.02815183735410622
75779321,0.023333008627798323
75779341,0.04136225895209229
75779375,0.05324087566016542
75779396,0.05088089606459887
75779426,0.13123228300208625
75779435,0.020439751082645095
75779446,0.06170121697459085
75779449,0.08507306358437543
75779488,0.02036634489500542
75779498,0.02

In [None]:
df = read_opts=dict(
        sep=',', names=valid_fields, index_col=False, header=None,
        iterator=True, chunksize=100
)

In [None]:
path_to_filter_probas = '/home/users/Daniil-Selikhanovych/hw1/part-00000'
a = pd.read_csv(path_to_filter_probas, )