In [1]:
import numpy as np
import pandas as pd
import json
import math

In [2]:
item_metadata = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\item_metadata_all.csv',
                       sep=',')
item_metadata[:3]

Unnamed: 0,item_id,n_clicks,views,clicks_views_ratio,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,5001,3.0,99,0.030303,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,48.0
1,5002,6.0,68,0.088235,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,55.0
2,5003,6.0,90,0.066667,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,73.0


In [3]:
train_set = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\data\\train.csv',
                       sep=',',
                      nrows=100000,
                      index_col=0)
train_set[:3]

Unnamed: 0_level_0,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,b6b4a3c02db0c,1541030408,4JK19KX9RU36,1,search for destination,"Londrina, Brazil",BR,"Londrina, Brazil",desktop,,,
1,b6b4a3c02db0c,1541030410,4JK19KX9RU36,2,search for destination,"Londrina, Brazil",BR,"Londrina, Brazil",desktop,,,
2,5b578bc20be9f,1541030412,KQ7YR1O2APO1,1,search for destination,"Vienna, Austria",RO,"Vienna, Austria",mobile,,,


In [4]:
def string_to_array(s):
    """Convert pipe separated string to array."""

    if isinstance(s, str):
        out = s.split("|")
    elif math.isnan(s):
        out = []
    else:
        raise ValueError("Value must be either string of nan")
    return out

def explode_impressions_and_prices(df_in):
    """Explode column col_expl of array type into multiple rows."""

    df = df_in.copy()
    df.loc[:, 'impressions'] = df['impressions'].apply(string_to_array)  # zamienia 1|2|3 na [1,2,3]
    df.loc[:, 'prices'] = df['prices'].apply(string_to_array)  # zamienia 1|2|3 na [1,2,3]

    df = df.sort_values(by=['session_id'])
    df['number_of_impressions'] = df['impressions'].apply(len)
    
    df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df['impressions'].str.len())
         for col in df.columns.drop('impressions')}
    )
    

    df_out.loc[:, 'impressions'] = np.concatenate(df['impressions'].values)
    df_out.loc[:, 'impressions'] = df_out['impressions'].apply(int)
    
    df_out.loc[:, 'prices'] = np.concatenate(df['prices'].values)
    df_out.loc[:, 'prices'] = df_out['prices'].apply(int)

    return df_out

In [5]:
train_set = explode_impressions_and_prices(train_set)
train_set[:3]

Unnamed: 0,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,prices,number_of_impressions,impressions
0,000cfc10b5baf,1541033232,A887V2Z1910U,2,clickout item,7993,US,"San Francisco, USA",desktop,Excellent Rating,208,25,746596
1,000cfc10b5baf,1541033232,A887V2Z1910U,2,clickout item,7993,US,"San Francisco, USA",desktop,Excellent Rating,159,25,7973040
2,000cfc10b5baf,1541033232,A887V2Z1910U,2,clickout item,7993,US,"San Francisco, USA",desktop,Excellent Rating,182,25,8019320


In [6]:
train_set = train_set[['reference', 'impressions']]
train_set[:3]

Unnamed: 0,reference,impressions
0,7993,746596
1,7993,7973040
2,7993,8019320


In [7]:
train_set = train_set.merge(item_metadata,
                           left_on='impressions',
                           right_on='item_id',
                           how='left')
train_set = train_set.drop(['impressions'], axis=1)
train_set = train_set.fillna(0)
train_set.iloc[:,4:] = train_set.iloc[:,4:].astype('int8')
train_set[:5]

Unnamed: 0,reference,item_id,n_clicks,views,clicks_views_ratio,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,7993,746596,2.0,25,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,43
1,7993,7973040,1.0,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
2,7993,8019320,0.0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,6
3,7993,10287780,1.0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
4,7993,7990,22.0,573,0,1,0,1,1,1,...,0,0,1,0,0,0,0,1,1,79


In [8]:
train_set.iloc[:,:2] = train_set.iloc[:,:2].astype(int)
train_set['clicked'] = np.where(train_set['reference'] == train_set['item_id'], 1, 0)
train_set = train_set.drop(columns=['reference','item_id'])
train_set[:3]

Unnamed: 0,n_clicks,views,clicks_views_ratio,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,...,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties,clicked
0,2.0,25,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,43,0
1,1.0,13,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,0
2,0.0,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0


In [9]:
# NAUKA

In [10]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [11]:
# out = out.dropna()
X = train_set.iloc[:, :-1]
y = train_set['clicked'].values

In [12]:
from sklearn.preprocessing import StandardScaler, normalize
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_train = normalize(X_train, norm='l2')
X_test = sc.transform(X_test)
X_test = normalize(X_test, norm='l2')

In [13]:
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state=1, n_jobs=-1)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

In [14]:
logreg = LogisticRegression(n_jobs=-1) #, class_weight={1.0:0.4, 0.0:0.6}
logreg.fit(X_train, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
# THRESHOLD = 0.7
# y_pred = np.where(logreg.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

Accuracy of logistic regression classifier on test set: 0.68


In [16]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, balanced_accuracy_score
print('balanced_accuracy_score: {0}'.format(balanced_accuracy_score(y_test, y_pred)))
print('accuracy_score: {0}'.format(accuracy_score(y_test, y_pred)))

print('Nieprawidłowo sklasyfikowane próbki: %d' % (y_test != y_pred).sum())

print('classification_report :\n', classification_report(y_test, y_pred))
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)

balanced_accuracy_score: 0.5967081078768046
accuracy_score: 0.6781990244472355
Nieprawidłowo sklasyfikowane próbki: 22035
classification_report :
               precision    recall  f1-score   support

           0       0.97      0.69      0.80     65362
           1       0.07      0.51      0.13      3112

    accuracy                           0.68     68474
   macro avg       0.52      0.60      0.46     68474
weighted avg       0.93      0.68      0.77     68474

[[44861 20501]
 [ 1534  1578]]


In [None]:
# TEST

In [71]:
test_set = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\additional_resources\\2019-master\data\\test.csv',
                       sep=',',
#                        nrows=1472418
                      skiprows=range(1,1472419)
                      )
mask = test_set["reference"].isnull() & (test_set["action_type"] == "clickout item")
test_set = test_set[mask]
test_set[:3]

Unnamed: 0,user_id,timestamp,session_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices
9,U9MAV55LZMOR,1541453209,0dd5544c71728,1,clickout item,,US,"Atlantic City, USA",mobile,,366186|56563|3741160|63568|3478738|3137008|678...,69|68|46|39|69|47|48|52|74|70|44|34|57|69|37|7...
15,VXIWZ3NF0WHF,1541453212,be1b5623b2e12,3,clickout item,,DK,"Copenhagen, Denmark",tablet,,4412788|53876|3147220|929429|148127|53835|1753...,122|144|195|100|92|117|72|107|268|166|171|657|...
16,2VZWKE7NWP8C,1541453213,7daf68934bfef,1,clickout item,,US,"Weston, USA",mobile,,57189|404846|164156|111309|148435|77409|103228...,115|166|115|97|111|87|200|101|114|160|98|97|10...


In [59]:
# test_set[1676230:]

In [72]:
test_set = explode_impressions_and_prices(test_set)
test_set[:3]

Unnamed: 0,user_id,timestamp,session_id,step,action_type,reference,platform,city,device,current_filters,prices,number_of_impressions,impressions
0,YEPNOM933RW5,1541456066,0005f3dacebc3,1,clickout item,,PL,"Wisla, Poland",mobile,,28,25,411991
1,YEPNOM933RW5,1541456066,0005f3dacebc3,1,clickout item,,PL,"Wisla, Poland",mobile,,34,25,412001
2,YEPNOM933RW5,1541456066,0005f3dacebc3,1,clickout item,,PL,"Wisla, Poland",mobile,,30,25,763721


In [73]:
test_set = test_set[['impressions', 'number_of_impressions', 'step', 'user_id','session_id', 'timestamp']]
test_set[:3]

Unnamed: 0,impressions,number_of_impressions,step,user_id,session_id,timestamp
0,411991,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066
1,412001,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066
2,763721,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066


In [74]:
test_set = test_set.merge(item_metadata,
                           left_on='impressions',
                           right_on='item_id',
                           how='left')

In [75]:
test_set = test_set.drop(['impressions'], axis=1)
test_set = test_set.fillna(0)
test_set.iloc[:,8:] = test_set.iloc[:,8:].astype('int8')
test_set.iloc[:,6:7] = test_set.iloc[:,6:7].astype('int8')

test_set[:5]

Unnamed: 0,number_of_impressions,step,user_id,session_id,timestamp,item_id,n_clicks,views,clicks_views_ratio,Wheelchair Accessible,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,411991.0,5,134.0,0,1,...,1,0,0,0,0,0,0,1,0,58
1,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,412001.0,16,97.0,0,1,...,0,0,0,0,0,1,0,0,0,53
2,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,763721.0,4,21.0,0,0,...,0,0,0,0,0,0,0,0,0,41
3,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,950343.0,6,12.0,0,0,...,0,0,0,0,1,1,1,1,0,52
4,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,963611.0,27,110.0,0,1,...,0,0,1,0,0,1,1,1,0,64


In [76]:
item_ids = test_set[['item_id','number_of_impressions', 'step', 'user_id','session_id','timestamp']]
test_set = test_set.drop(columns=['item_id', 'number_of_impressions', 'step', 'user_id','session_id','timestamp'])
test_set[:3]

Unnamed: 0,n_clicks,views,clicks_views_ratio,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,5,134.0,0,1,1,1,1,0,0,1,...,1,0,0,0,0,0,0,1,0,58
1,16,97.0,0,1,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,53
2,4,21.0,0,0,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,41


In [77]:
X_test = sc.transform(test_set.values)
X_test = normalize(X_test, norm='l2')

In [78]:
results = logreg.predict_proba(X_test)

In [79]:
df_results = pd.DataFrame(results)
df_results[['item_id', 'number_of_impressions', 'step', 'user_id','session_id','timestamp']] = item_ids
df_results = df_results[['item_id', 'number_of_impressions', 'step', 'user_id','session_id', 'timestamp', 0, 1]]
df_results['item_id'] = df_results['item_id'].astype(int)
df_results[:3]

Unnamed: 0,item_id,number_of_impressions,step,user_id,session_id,timestamp,0,1
0,411991,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,0.445864,0.554136
1,412001,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,0.411735,0.588265
2,763721,25,1,YEPNOM933RW5,0005f3dacebc3,1541456066,0.393657,0.606343


In [80]:
def ddd(df):
    return df[1].max()

In [81]:
df = pd.DataFrame(df_results.groupby(['item_id','number_of_impressions','step','user_id','session_id','timestamp']).apply(ddd)).reset_index()

In [82]:
df = df.sort_values(['user_id','session_id','timestamp',0], ascending=[True, True, True, False])
df = df.drop(columns=['number_of_impressions'])
df = df[['user_id', 'session_id','timestamp','step',0]]
df = df.reset_index()
df = df.rename(columns={'index':'item_id'})
df[:5]

Unnamed: 0,item_id,user_id,session_id,timestamp,step,0
0,52910,002Z4VWD9803,19e3090dddb3e,1541454912,27,0.854479
1,88462,002Z4VWD9803,19e3090dddb3e,1541454912,27,0.725543
2,14797,002Z4VWD9803,19e3090dddb3e,1541454912,27,0.654991
3,6845,002Z4VWD9803,19e3090dddb3e,1541454912,27,0.646986
4,64489,002Z4VWD9803,19e3090dddb3e,1541454912,27,0.628869


In [83]:
def group_concat(df, gr_cols, col_concat):
    """Concatenate multiple rows into one."""

    df_out = (
        df
        .groupby(gr_cols)[col_concat]
        .apply(lambda x: ' '.join(x))
        .to_frame()
        .reset_index()
    )

    
    return df_out

In [84]:
df['item_id'] = df['item_id'].astype(str)
df = group_concat(df, ["user_id", "session_id", "timestamp", "step"], 'item_id')
df = df.rename(columns={'item_id':'item_recommendations'})
df[:5]

Unnamed: 0,user_id,session_id,timestamp,step,item_recommendations
0,002Z4VWD9803,19e3090dddb3e,1541454912,27,52910 88462 14797 6845 64489 102661 15293 1493...
1,0035NH370RO8,1aa18f4d38589,1541454808,1,92249 66254 72097 75913 60284 71808 71804 7229...
2,0084CG10K9HT,a26df1847b07b,1541453674,2,8261 3209 25657 3211 88637 86984 58456 29771 9...
3,00DW4Z3A86VB,27f534ad5758e,1541454748,3,16582 84239 16532 70936 16631
4,00JXM8HMS7X7,f30eff281b5b5,1541453860,1,4172 14118 43154 14223 14082 14160 32861 36542...
5,00VY50EJCRYM,7f0ebfe559f38,1541453445,1,99694 70798 92565 81841 60507 72070 80189 9157...
6,018JW2JBUWT1,df14bb52bb6f8,1541455174,1,93724 86267 80203 70050 102796 101077 101500 9...
7,01D5HJIPWQ52,d04e66f1ec3d3,1541454979,23,4171 14117 4158 48365 14100 4168 28520 28534 4...
8,01M1SAK7D1FR,05ed38ab2a2b9,1541455200,56,60003 63481 97854 35425 47235 67184 97636 3897...
9,01NV3TUAN038,61c6bc65c55ab,1541454513,6,97277 70808 104076 92948 90051 63780 37032 922...


In [None]:
df.to_csv("D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\submission_ML2.csv", sep=',', index=False)

In [None]:
df1 = df.copy()

In [4]:
df1 = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\submission_ML1.csv',
                       sep=',')
df1[:3]

Unnamed: 0,user_id,session_id,timestamp,step,item_id
0,000I430EXZC0,ddabdf53a4d38,1541501642,1,1375003 1055227 587122 1538328 907820 890293 1...
1,000O8CPC6T0W,846b712ae8e1f,1541480658,1,33823 1134325 2588485 132119 2741649 132108 14...
2,000REY2YYH1D,4a313f2eb8954,1541465923,1,1329316 1157652 473333 432677 1283368 449078 1...


In [5]:
df2 = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\submission_ML2.csv',
                       sep=',')
df2[:3]

Unnamed: 0,user_id,session_id,timestamp,step,item_id
0,0010PD3FH7SV,26b9e7e18cc87,1541535143,1,2870821 2201933 3174150 1196679 2145009 120367...
1,0010PD3FH7SV,26b9e7e18cc87,1541535269,2,2870824 2201936 3174153 1196682 2145012 120367...
2,0015CRBC6OH9,8bcbd478ac2cf,1541518796,1,2863149 551500 738614 146830 734448 553932 946...


In [6]:
print(df1.shape)
print(df2.shape)

(141830, 5)
(164591, 5)


In [7]:
df1 = pd.concat([df1, df2])
df1.shape

(306421, 5)

In [10]:
df1.to_csv("D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\submission_ML.csv", sep=',', index=False)