In [1]:
import pandas as pd
import numpy as np
import collections
import time
import math
import plotly.express
import plotly.express as px

# Tworzenie plików ze zliczonymi wystąpieniami properties - podzielenie, z powodu braku RAMu

In [2]:
item_metadata = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\item_metadata_separated_properties.csv',
                           sep=',',
                           dtype='int64')
item_metadata[:3]

Unnamed: 0,item_id,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,Microwave,Childcare,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,5001,0,0,0,1,0,0,1,0,0,...,0,1,1,0,0,0,0,1,0,48
1,5002,0,0,1,0,0,0,1,0,1,...,0,1,0,0,0,0,0,1,0,55
2,5003,1,0,1,1,0,0,1,0,0,...,0,1,1,0,0,0,1,1,0,73


In [3]:
train = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\train.csv',
                       sep=',',
                    nrows=1000000,
                    skiprows=range(1, 9000000)
#                    nrows=1000000
                   )
train = train[train["action_type"] == "clickout item"]
print(train.shape)
train[:3]

(90681, 13)


Unnamed: 0,index,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices
1,9000000,a95b4ae431032,1541350004,K0H63GJLHJF1,2,clickout item,1754159,BR,"Guarujá, Brazil",mobile,,104632|104902|4504514|1754159|2083424|5008004|...,183|62|64|30|34|59|66|29|61|45|46|29|56|75|31|...
3,9000002,05db4bedc5fdf,1541350004,Q1903KD1O4HR,3,clickout item,2214170,TR,"Altınoluk, Turkey",mobile,,3211510|2064426|2065330|3385290|9259266|697658...,54|38|44|36|35|66|29|39|32|25|31|66|50|71|57|7...
11,9000010,91f8f88e996b5,1541350004,Z3YJR81C3PBS,2,clickout item,1571411,RO,"Mamaia, Romania",mobile,,895667|5954310|128612|3251378|121419|7174848|1...,45|49|48|31|55|60|48|66|48|69|21|64|46|34|49|2...


In [4]:
def string_to_array(s):
    """Convert pipe separated string to array."""

    if isinstance(s, str):
        out = s.split("|")
    elif math.isnan(s):
        out = []
    else:
        raise ValueError("Value must be either string of nan")
    return out

def explode_impressions(df_in):
    """Explode column col_expl of array type into multiple rows."""

    df = df_in.copy()
    df.loc[:, 'impressions'] = df['impressions'].apply(string_to_array)  # zamienia 1|2|3 na [1,2,3]

    df = df.sort_values(by=['session_id'])
    
    df_out = pd.DataFrame(
        {col: np.repeat(df[col].values,
                        df['impressions'].str.len())
         for col in df.columns.drop('impressions')}
    )
    

    df_out.loc[:, 'impressions'] = np.concatenate(df['impressions'].values)
    df_out.loc[:, 'impressions'] = df_out['impressions'].apply(int)
    
    return df_out

In [5]:
train = explode_impressions(train)
train[:3]

Unnamed: 0,index,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,prices,impressions
0,9551403,0002e7a47e7da,1541361558,074UTL8MKSG4,22,clickout item,923009,DE,"Münster, Germany",desktop,Sort by Price,136|323|333|352|354|357|368|371|386|407|411|41...,11128746
1,9551403,0002e7a47e7da,1541361558,074UTL8MKSG4,22,clickout item,923009,DE,"Münster, Germany",desktop,Sort by Price,136|323|333|352|354|357|368|371|386|407|411|41...,923007
2,9551403,0002e7a47e7da,1541361558,074UTL8MKSG4,22,clickout item,923009,DE,"Münster, Germany",desktop,Sort by Price,136|323|333|352|354|357|368|371|386|407|411|41...,923009


In [6]:
train = train.drop(columns=['index','timestamp', 'user_id', 'step','action_type','platform','city','device','session_id','prices','current_filters'], axis=1)
train[:3]

Unnamed: 0,reference,impressions
0,923009,11128746
1,923009,923007
2,923009,923009


In [7]:
item_metadata = item_metadata.drop(columns=['sum_of_properties'], axis=1)
item_metadata.iloc[:3,1:] = item_metadata.iloc[:3,1:].astype('int8')
item_metadata['item_id'] = item_metadata['item_id'].astype(str)

In [8]:
train = train.merge(item_metadata,
                   left_on='reference',
                   right_on='item_id',
                   how='left')
train[:3]

Unnamed: 0,reference,impressions,item_id,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,...,Country Hotel,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking
0,923009,11128746,923009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,923009,923007,923009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,923009,923009,923009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
train = train.drop(columns=['item_id'], axis=1)
train = train.drop(columns=['reference'], axis=1)
train[:3]

Unnamed: 0,impressions,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,Microwave,Childcare,...,Country Hotel,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking
0,11128746,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,923007,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,923009,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
train = train.fillna(0)

In [11]:
# train.iloc[:,1:] = train.iloc[:,1:].astype('int8')
# train.dtypes

In [None]:
result1 = train.set_index('impressions').sum()

In [None]:
result1.to_csv("D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\prop7.csv")

# Scalanie plików

In [17]:
all_occurrences_df = pd.DataFrame()
prop = pd.Series.from_csv(f'D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\prop1.csv',
                           sep=',',
                  header=None)
all_occurrences_df['all_occurrences'] = prop.to_frame()[0]
for x in range(2, 11):
    prop = pd.Series.from_csv(f'D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\prop{x}.csv',
                               sep=',',
                      header=None)
    
    all_occurrences_df['all_occurrences'] = all_occurrences_df['all_occurrences'] + prop.to_frame()[0]
#     list_of_series.append(prop)
all_occurrences_df['all_occurrences'] = all_occurrences_df['all_occurrences'].astype(int)
all_occurrences_df[:3]


from_csv is deprecated. Please use read_csv(...) instead. Note that some of the default arguments are different, so please refer to the documentation for from_csv when changing your function calls



Unnamed: 0,all_occurrences
Wheelchair Accessible,12878379
Ski Resort,1168079
Hotel Bar,15544986


# Zliczanie kliknięć w dane properties

In [18]:
item_metadata = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\item_metadata_separated_properties.csv',
                           sep=',',
                           dtype='int64')
item_metadata['item_id'] = item_metadata['item_id'].astype(str)
item_metadata[:3]

Unnamed: 0,item_id,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,Microwave,Childcare,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,5001,0,0,0,1,0,0,1,0,0,...,0,1,1,0,0,0,0,1,0,48
1,5002,0,0,1,0,0,0,1,0,1,...,0,1,0,0,0,0,0,1,0,55
2,5003,1,0,1,1,0,0,1,0,0,...,0,1,1,0,0,0,1,1,0,73


In [19]:
train = pd.read_csv('D:\\Dokumenty\\Systemy_rekomendacyjne\\data\\train.csv',
                       sep=',',
                    nrows=1000000
#                     skiprows=range(1, 9000000)
#                    nrows=1000000
                   )
train = train[train["action_type"] == "clickout item"]
print(train.shape)
train[:3]

(107137, 13)


Unnamed: 0,index,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,current_filters,impressions,prices
5,5,90ccf1b651b92,1541030423,F0NEAMNR1WK0,1,clickout item,1669587,BR,"Canela, Brazil",mobile,,507861|2176280|8280296|1830637|1944129|7315132...,214|81|158|117|152|54|105|113|88|96|79|99|79|1...
8,8,d3aa2789c8973,1541030425,EA6D53DNLIU0,1,clickout item,2128208,MY,"Sungai Pelek, Malaysia",mobile,,2128208|2892128|1135230|8410646|2784325|135182...,83|18|31|67|175|78|112|31|26|93|19|64|16|68|38...
9,9,413d03546ec94,1541030425,QR0O86CIA0J6,1,clickout item,2832624,BR,"Holambra, Brazil",desktop,,2832624|2898616|7717092|9662278|7342374|324166...,56|81|56|38|63|38|51|54|35|40|37|36|41|46|42|2...


In [20]:
train = train.merge(item_metadata,
                   left_on='reference',
                   right_on='item_id',
                   how='left')
train[:3]

Unnamed: 0,index,session_id,timestamp,user_id,step,action_type,reference,platform,city,device,...,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking,sum_of_properties
0,5,90ccf1b651b92,1541030423,F0NEAMNR1WK0,1,clickout item,1669587,BR,"Canela, Brazil",mobile,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,41.0
1,8,d3aa2789c8973,1541030425,EA6D53DNLIU0,1,clickout item,2128208,MY,"Sungai Pelek, Malaysia",mobile,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,77.0
2,9,413d03546ec94,1541030425,QR0O86CIA0J6,1,clickout item,2832624,BR,"Holambra, Brazil",desktop,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,40.0


In [21]:
train = train.drop(columns=['index','session_id','timestamp','user_id','step','action_type','platform',
                            'city','device','sum_of_properties','current_filters','impressions','prices','item_id'], axis=1)
train = train.set_index('reference')
train[:3]

Unnamed: 0_level_0,Wheelchair Accessible,Ski Resort,Hotel Bar,Electric Kettle,Teleprinter,1 Star,Shower,Microwave,Childcare,WiFi (Rooms),...,Country Hotel,Flatscreen TV,Boat Rental,Swimming Pool (Combined Filter),Guest House,Water Slide,Convention Hotel,Swimming Pool (Indoor),WiFi (Public Areas),Accessible Parking
reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1669587,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2128208,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
2832624,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [22]:
clicked_occurrences_df = train.sum().to_frame('clicked_occurrences')
clicked_occurrences_df['clicked_occurrences'] = clicked_occurrences_df['clicked_occurrences'].astype('int64')
clicked_occurrences_df[:3]

Unnamed: 0,clicked_occurrences
Wheelchair Accessible,44330.0
Ski Resort,4042.0
Hotel Bar,52534.0


In [27]:
result_df = pd.concat([clicked_occurrences_df, all_occurrences_df], axis=1, sort=False)
result_df['clicked_to_all_ratio'] = result_df['clicked_occurrences']/result_df['all_occurrences']


In [35]:
result_df = result_df.sort_values(['clicked_to_all_ratio'], ascending=False)
result_df[:10]

Unnamed: 0,clicked_occurrences,all_occurrences,clicked_to_all_ratio
Motel,1237,189569,0.006525
Camping Site,247,45078,0.005479
Hostel,1892,458355,0.004128
Washing Machine,26978,6670651,0.004044
Casa Rural (ES),81,20036,0.004043
Guest House,1944,481678,0.004036
Fridge,39232,9733494,0.004031
Electric Kettle,49938,12950898,0.003856
Farmstay,390,101685,0.003835
1 Star,1119,291967,0.003833


In [45]:
result_df.loc[result_df['clicked_occurrences'] > 20000, :][:20]

Unnamed: 0,clicked_occurrences,all_occurrences,clicked_to_all_ratio
Washing Machine,26978,6670651,0.004044
Fridge,39232,9733494,0.004031
Electric Kettle,49938,12950898,0.003856
Massage,32095,8535900,0.00376
Ironing Board,42989,11516382,0.003733
3 Star,32568,8779991,0.003709
Air Conditioning,72705,19780836,0.003676
Car Park,80255,22086822,0.003634
Hotel,76524,21097639,0.003627
Singles,26444,7291205,0.003627


In [44]:
clicked_occurrences_df.sort_values(['clicked_occurrences'], ascending=False)[:20]

Unnamed: 0,clicked_occurrences
Satisfactory Rating,91099
Shower,83123
WiFi (Public Areas),82138
Car Park,80255
Television,80018
Good Rating,79218
WiFi (Rooms),78632
Hotel,76524
Luxury Hotel,74243
Business Hotel,73439
