In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import datetime

# Working directory: every CSV path used in the cells below is relative to it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = '/Volumes/Data/ML/onefactor/'
os.chdir(path)

# Flag forwarded to SOLUTION.predict as its `tq` argument; the class never reads it.
tq = True

In [2]:
class SOLUTION:
    """Greedy one-to-one matcher of ids based on the ``lacid`` values they share.

    Flow used by the notebook cells: ``fit`` -> ``get_value`` -> ``predict`` ->
    ``calc_solution`` -> ``save_solution`` -> ``check_solution``.
    """

    def __init__(self):
        # Array of 'hash_id' values, one per observation row (filled by get_value).
        self.value1 = None
        # Array of 'lacid' values aligned row-by-row with value1 (filled by get_value).
        self.value2 = None

    def get_value(self, df0):
        """Cache the 'hash_id' and 'lacid' columns of ``df0`` as numpy arrays.

        Returns:
            tuple: ``(self.value1, self.value2)`` — the two cached arrays.
        """
        self.value1 = np.array(list(tqdm(df0['hash_id'])))
        self.value2 = np.array(list(tqdm(df0['lacid'])))
        return self.value1, self.value2

    def fit(self, X):
        """Add the derived columns 'ts2' and 'lacid' to ``X`` and return it.

        ``X`` is modified in place (the same object is returned):
          * 'ts2'   – ``datetime.fromtimestamp`` of 'ts' (local time),
          * 'lac', 'cid' – converted to ``str``,
          * 'lacid' – ``lac + '_' + cid``.
        """
        X['ts2'] = [datetime.datetime.fromtimestamp(x) for x in tqdm(X['ts'])]
        X['lac'] = X['lac'].apply(str)
        X['cid'] = X['cid'].apply(str)
        # Vectorised string concatenation instead of a per-row Python loop.
        X['lacid'] = X['lac'] + '_' + X['cid']
        return X

    def cat(self, X, min_count=9):
        """Keep only rows whose 'lacid' occurs more than ``min_count`` times.

        Args:
            X: frame with a 'lacid' column.
            min_count: exclusive lower bound on the number of occurrences
                (default 9, i.e. a 'lacid' must appear at least 10 times).

        Returns:
            A new frame with a fresh 0..n-1 index; ``X`` itself is not modified.
        """
        counts = self.got_counter_res_df_speed_fast(X['lacid'])
        counts = counts.rename(columns={'event': 'lacid'})[['lacid', 'count']]
        merged = X.merge(counts, on='lacid', how='left')
        kept = merged[merged['count'] > min_count].reset_index(drop=True)
        return kept.drop(columns='count')

    def predict(self, value1, value2, df1, tq, logfile):
        """Build the score matrix between every 'id1' and every 'id2' of ``df1``.

        Row ``k`` holds the scores of ``df1['id1'][k]`` against all 'id2' values.
        Each finished row is appended to ``logfile`` as a ';'-joined line, so
        re-running appends to an existing file.

        Args:
            value1, value2, tq: accepted for backward compatibility only; the
                arrays cached on the instance by ``get_value`` are what is used.
            df1: frame with 'id1' and 'id2' columns.
            logfile: path of the text file the rows are appended to.

        Returns:
            numpy array built from the list of score rows.
        """
        matr = []
        id1_col = df1['id1']
        for k in tqdm(range(len(df1))):
            # Positional access: k is a row number, not an index label.
            id1 = id1_col.iloc[k]
            vec = self.mainfun(k, id1, self.value1, self.value2, df1, tq)
            matr.append(vec)
            self.write_to_log2(logfile, ';'.join(str(x) for x in vec))
        return np.array(matr)

    def calc_solution(self, df1, matr):
        """Greedily pick the best remaining (row, column) pair, ``len(df1)`` times.

        At every step the largest remaining score is taken (first occurrence in
        row-major order on ties) and its row and column are retired.  The search
        runs on a private float copy, so the caller's ``matr`` is left intact,
        and it stops early once every row or column has been used instead of
        emitting placeholder pairs.

        Returns:
            list of ``[row, column, score]`` entries.
        """
        vv = []
        work = np.array(matr, dtype=float)  # private copy
        if len(df1) == 0 or work.size == 0:
            return vv
        n_cols = work.shape[1]
        for _ in tqdm(range(len(df1))):
            # argmax returns the first maximum of the flattened (row-major) array.
            x, y = divmod(int(np.argmax(work)), n_cols)
            mx = work[x, y]
            if mx == -np.inf:
                break  # nothing left to assign
            vv.append([x, y, mx])
            work[x, :] = -np.inf
            work[:, y] = -np.inf
        return vv

    def save_solution(self, logfile, df3, vv):
        """Append the chosen pairs to ``logfile`` and return them as a frame.

        A header line 'id1;id2' is appended first, then one 'id1;id2' line per
        entry of ``vv``.  The file is opened in append mode, so re-running adds
        a second header and a second copy of the rows.

        Args:
            logfile: output path.
            df3: frame whose 'id1'/'id2' columns are addressed positionally by
                the row/column numbers stored in ``vv``.
            vv: list of ``[row, column, score]`` as returned by calc_solution.

        Returns:
            DataFrame with string columns 'id1' and 'id2'.
        """
        self.write_to_log2(logfile, 'id1;id2')
        id1_values = df3['id1'].to_numpy()
        id2_values = df3['id2'].to_numpy()
        ww = []
        for entry in vv:
            id1 = str(id1_values[entry[0]])
            id2 = str(id2_values[entry[1]])
            self.write_to_log2(logfile, ';'.join([id1, id2]))
            ww.append([id1, id2])
        return pd.DataFrame(ww, columns=['id1', 'id2'])

    def check_solution(self, sol2):
        """Return 'True' if no id repeats in 'id1' and none repeats in 'id2'.

        Every value is checked; looking only at the first counted value would
        miss duplicates further down.  The result is the *string* 'True' or
        'False'.
        """
        if sol2['id1'].is_unique and sol2['id2'].is_unique:
            return 'True'
        return 'False'

    def got_counter_res_df_speed_fast(self, db):
        """Count occurrences of each distinct value in ``db``.

        Returns:
            DataFrame with columns 'event' (distinct values in the sorted order
            produced by ``np.unique``), 'count' and '%' (share of the total,
            rounded to 2 decimals).  Empty input yields an empty frame with
            the same three columns.
        """
        unique, counts = np.unique(db, return_counts=True)
        res_df = pd.DataFrame({'event': unique, 'count': counts})
        n = counts.sum()
        res_df['%'] = (res_df['count'] * 100 / n).round(2)
        return res_df

    def mainfun(self, k, id1, value1, value2, df1, tq):
        """Score row ``k``'s 'id1' against every 'id2' in ``df1``.

        Args:
            k: positional row number in ``df1``.
            id1, value1, value2, tq: accepted for backward compatibility; the
                id is re-read from ``df1`` and the arrays cached on the
                instance are used.
            df1: frame with 'id1' and 'id2' columns.

        Returns:
            list with one score (see ``subfun``) per row of ``df1``.
        """
        id1 = df1['id1'].iloc[k]
        # Loop-invariant: the observations of id1 do not depend on id2.
        db1 = self.value2[np.where(self.value1 == id1)[0]]
        vec = []
        for id2 in df1['id2']:
            db2 = self.value2[np.where(self.value1 == id2)[0]]
            vec.append(self.subfun(id1, id2, db1, db2))
        return vec

    def subfun(self, id1, id2, db1, db2):
        """Similarity score of two arrays of values.

        With ``v`` the set of values present in both arrays:
          s0 = number of distinct shared values,
          s1 = percentage of ``db1`` entries that are in ``v``,
          s2 = percentage of ``db2`` entries that are in ``v``,
          s3 = s1 + s2,
        and the result is ``s0 + s1 + s2 + s3``.  If nothing is shared
        (including when either array is empty) both percentages are 0, so an
        id without observations scores 0 instead of dividing by zero.
        ``id1`` and ``id2`` are not used in the computation.
        """
        v = list(set(db1) & set(db2))
        s0 = len(v)
        if s0 == 0:
            s1 = s2 = 0.0
        else:
            s1 = int(np.count_nonzero(np.isin(db1, v))) / len(db1) * 100
            s2 = int(np.count_nonzero(np.isin(db2, v))) / len(db2) * 100
        s3 = s1 + s2
        return np.sum([s0, s1, s2, s3])

    def write_to_log2(self, log_file, string, enc = 'UTF-8'):
        """Append ``string`` plus a newline to ``log_file`` (created if missing)."""
        # The context manager closes the file; no explicit close() is needed.
        with open(log_file, mode='a', encoding=enc) as fileto:
            fileto.write(string + '\n')

In [3]:
# Load the two semicolon-separated inputs (paths are relative to the working directory).
# df0 is later passed to SOLUTION.fit / get_value; df1 supplies the 'id1'/'id2' columns
# read by predict and save_solution.
df0 = pd.read_csv('01_data_2.csv', sep = ';')
df1 = pd.read_csv('02_etalon_2.csv', sep = ';')

In [4]:
# Create the matcher, add the derived columns ('ts2', 'lacid') to df0, and cache
# df0's 'hash_id' / 'lacid' columns as arrays on the instance.
slt = SOLUTION()
df0 = slt.fit(df0)
value1, value2 = slt.get_value(df0)

100%|██████████| 335629/335629 [00:00<00:00, 1128324.25it/s]
100%|██████████| 335629/335629 [00:00<00:00, 1036871.27it/s]
100%|██████████| 335629/335629 [00:00<00:00, 2888652.80it/s]
100%|██████████| 335629/335629 [00:00<00:00, 3165572.42it/s]


In [5]:
# Run on the pairs in df1: score matrix -> greedy assignment -> saved pairs.
# Score rows are appended to train_0.csv and the chosen pairs to train_1.csv.
# NOTE(review): both files are opened in append mode by write_to_log2, so re-running
# this cell adds to existing files — delete them first for a clean run.
logfile = 'train_0.csv'
matr = slt.predict(value1, value2, df1, tq, logfile)
vv = slt.calc_solution(df1, matr)
logfile = 'train_1.csv'
ww = slt.save_solution(logfile, df1, vv)
# Prints the string returned by check_solution for the saved pairs.
print(slt.check_solution(ww))
sol1 = pd.read_csv('train_1.csv', sep = ';')

100%|██████████| 20/20 [00:00<00:00, 30.22it/s]
100%|██████████| 20/20 [00:00<00:00, 16750.42it/s]


True


In [6]:
##############################################################################################
# PREDICT
##############################################################################################

# Remove every observation whose hash_id already appears in the pairs of df1.
v = list(set(df1['id1'])) + list(set(df1['id2']))
df2 = df0[~df0['hash_id'].isin(v)].reset_index(drop=True)

# Split the remaining ids by 'fulldate' (string comparison against the cut-off).
# Sorting makes the row order of df3 reproducible between runs; plain set
# iteration order is arbitrary.
v0 = sorted(set(df2[df2['fulldate'] < '2018-09-15']['hash_id']))
v1 = sorted(set(df2[df2['fulldate'] > '2018-09-15']['hash_id']))
# zip stops at the shorter list, so a size mismatch no longer raises IndexError;
# surplus ids on the longer side are left out (as they already were when v1 was longer).
# The row pairing here is only a container: the actual matching is done by the
# score matrix built from these two columns.
v = [[a, b] for a, b in zip(v0, v1)]
df3 = pd.DataFrame(v, columns = ['id1', 'id2'])

In [7]:
# Same pipeline as the train cell, run on the candidate pairs in df3.
# get_value re-caches the arrays from df2, replacing the ones taken from df0.
# NOTE(review): test_0.csv / test_1.csv are appended to, not overwritten — delete
# them before re-running this cell.
logfile = 'test_0.csv'
value1, value2 = slt.get_value(df2)
matr = slt.predict(value1, value2, df3, tq, logfile)
vv = slt.calc_solution(df3, matr)
logfile = 'test_1.csv'
ww = slt.save_solution(logfile, df3, vv)
# Prints the string returned by check_solution for the saved pairs.
print(slt.check_solution(ww))
soltest = pd.read_csv('test_1.csv', sep = ';')

100%|██████████| 289401/289401 [00:00<00:00, 2842260.75it/s]
100%|██████████| 289401/289401 [00:00<00:00, 3076506.84it/s]
100%|██████████| 20/20 [00:04<00:00,  4.99it/s]
100%|██████████| 20/20 [00:00<00:00, 14685.94it/s]


True
