In [None]:
import csv
import numpy as np
from tqdm import notebook
import operator
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from scipy.sparse.linalg import svds

In [None]:
INPUT_FILE_NAME = 'log_sample.csv'
PREDICTED_TIME_MILLIS = 10 * 60 * 60 * 1000
TRAIN_TIME_DAYS = 30
FORGET_TIME_DAYS = 10

In [None]:
class ReadEvents:
    def __init__(self, input_file_name, predicted_time_millis, train_time_days):
        self.input_file_name = input_file_name
        self.predicted_time = predicted_time_millis
        self.train_time = train_time_days * 24 * 60 * 60 * 1000
    
    def read_events_from_file(self):
        def check_group_event_id(group_id, event_id):
            return group_id not in ('performance', 'vcs.change.reminder') and \
                   event_id not in ('ui.lagging',
                                    'ide.error',
                                    'ide.freeze',
                                    'ui.latency',
                                    'registered',
                                    'invoked',
                                    'TsLintLanguageService',
                                    'whitelist.updated',
                                    'logs.send',
                                    'notification.shown',
                                    'ESLintLanguageService')
        
        events = []
        event_types = {}
        users = {}
        with open(self.input_file_name, 'r') as fin:
            for row in notebook.tqdm(csv.reader(fin, delimiter=',')):
                count = row[12].split('.')[0]
                if count:
                    count = int(count)
                else:
                    count = 0
                
                group_id = row[5]
                event_id = row[10]
                timestamp = int(row[3])
                device_id = row[7]
                if count and check_group_event_id(group_id, event_id):
                    event_types[(group_id, event_id)] = True
                    users[device_id] = True
                    for i in range(count):
                        events.append((device_id, group_id, event_id, timestamp))
        
        print(str(len(events)) + " event read.")
        print(str(len(users)) + " users found.")
        print(str(len(list(event_types.keys()))) + " event types found.")
        
        self.users = list(users.keys())
        self.event_types = list(event_types.keys())
        self.events = events
    
    def split_events_to_train_test(self):
        user_to_min_time = {}
        user_to_max_time = {}
        
        for event in notebook.tqdm(self.events):
            device_id, group_id, event_id, timestamp = event
            
            if device_id not in user_to_min_time.keys():
                user_to_min_time[device_id] = timestamp
            else:
                user_to_min_time[device_id] = min(user_to_min_time[device_id], timestamp)
            
            if device_id not in user_to_max_time.keys():
                user_to_max_time[device_id] = timestamp
            else:
                user_to_max_time[device_id] = max(user_to_max_time[device_id], timestamp)
        
        train_events = []
        test_events = {}
    
        for event in notebook.tqdm(self.events):
            device_id, group_id, event_id, timestamp = event
            
            min_time = user_to_min_time[device_id]
            max_time = user_to_max_time[device_id]
            
            if (max_time - min_time) < self.predicted_time:
                continue
            
            threshold = max_time - self.predicted_time
            
            if timestamp < threshold:
                if threshold - self.train_time <= timestamp :
                    train_events.append((device_id, group_id, event_id, timestamp, threshold))
            else:
                test_events[(device_id, group_id, event_id)] = True
              
        print(str(len(train_events)) + " train events.")
        print(str(len(test_events.keys())) + " test events.")
              
        return train_events, test_events

    def __call__(self):
        self.read_events_from_file()
        return self.split_events_to_train_test()
        

In [None]:
events_reader = ReadEvents(INPUT_FILE_NAME, PREDICTED_TIME_MILLIS, TRAIN_TIME_DAYS)

In [None]:
train_events, test_events = events_reader()

In [None]:
PRODUCTIVITY = "productivity"

In [None]:
class Recommender:  
    def __init__(self, users, event_types, events, forget_time_days):
        self.users = users
        self.event_types = event_types
        self.events = events
        self.forget_time_millis = forget_time_days * 24 * 60 * 60 * 1000

    def recommend(self, user):
        pass
    
    def recommend_list(self, users=None):
        if not users:
            users = self.users
        
        user_to_recomendation = {}
        
        for user in notebook.tqdm(users):
            user_to_recomendation[user] = self.recommend(user)
        
        return user_to_recomendation

In [8]:
class RecommenderTopEvent(Recommender):
    def get_top_events(self):
        event_to_count = {}
        for (device_id, group_id, event_id, _, _) in notebook.tqdm(self.events):
            
            if group_id == PRODUCTIVITY:
                
                if (group_id, event_id) in event_to_count.keys():
                    event_to_count[(group_id, event_id)] = event_to_count[(group_id, event_id)] + 1
                else:
                    event_to_count[(group_id, event_id)] = 1
        
        sorted_by_count_events = sorted(event_to_count.items(), key=operator.itemgetter(1), reverse=True)
        
        all_count_sum = 0
        for event_count in sorted_by_count_events:
            all_count_sum += event_count[1]
        
        self.top_events = [(x[0], x[1] / all_count_sum) for x in sorted_by_count_events]  
        #print(self.top_events)

    def __init__(self, users, event_types, events, forget_time_days):
        super(RecommenderTopEvent, self).__init__(users, event_types, events, forget_time_days)
        self.get_top_events()
    
    def recommend(self, user):
        done_by_user_events = {}
        
        for (device_id, group_id, event_id, timestamp, threshold) in self.events:
            if device_id == user and \
              threshold - timestamp < self.forget_time_millis:
                
                done_by_user_events[(group_id, event_id)] = True

        for i in range(len(self.top_events)):
            top_event = self.top_events[i][0]
            
            if not top_event in done_by_user_events.keys():
                #print(top_event)
                return top_event

        #print(self.top_events[i][0])
        return self.top_events[i][0]
    

In [9]:
class RecommenderTopEventWithProbability(RecommenderTopEvent):
    
    def recommend(self, user):
        done_by_user_events = {}
        
        for (device_id, group_id, event_id, timestamp, threshold) in self.events:
            if device_id == user and \
              threshold - timestamp < self.forget_time_millis:
                
                done_by_user_events[(group_id, event_id)] = True

                
        not_done_event_with_prob = {}
        all_not_done_top_sum = 0
        for i in range(len(self.top_events)):
            top_event = self.top_events[i]
            if not top_event[0] in done_by_user_events.keys():
                not_done_event_with_prob[top_event] = True
                all_not_done_top_sum += top_event[1]
            
        not_done_event_with_prob = list(not_done_event_with_prob.keys())
        probs = []
        for i in range(len(not_done_event_with_prob)):
            probs.append(not_done_event_with_prob[i][1] / all_not_done_top_sum)
        
        return not_done_event_with_prob[np.random.choice(len(not_done_event_with_prob), 1, probs)[0]][0]  
    

In [10]:
class Recommendations:
    def __init__(self, algorithm, users_to_recommend, test_events):
        self.algorithm = algorithm
        self.users_to_recommend = users_to_recommend
        self.test_events = test_events
    
    def check_recomendation(self):
        true_recommendations_cnt, all_recommendations_cnt = 0, len(list(self.user_to_recomendation.keys()))
        
        for user in self.user_to_recomendation.keys():
            group_id, event_id = self.user_to_recomendation[user]
            if group_id != PRODUCTIVITY:
                print(group_id)
            if (user, group_id, event_id) in self.test_events.keys():
                true_recommendations_cnt += 1
        return true_recommendations_cnt / all_recommendations_cnt
            
    def get_recommendation(self):
        self.user_to_recomendation = self.algorithm.recommend_list(self.users_to_recommend)
        result = self.check_recomendation()
        print(str(result) + " predictions were correct.")
        
        return self.user_to_recomendation
        
    def __call__(self):
        return self.get_recommendation()

In [11]:
algorithm = RecommenderTopEvent(events_reader.users, events_reader.event_types, train_events, FORGET_TIME_DAYS)

HBox(children=(IntProgress(value=0, max=3468843), HTML(value='')))




In [12]:
recommendations = Recommendations(algorithm, None, test_events)

In [13]:
user_to_recomendation = recommendations()

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


0.0925414364640884 predictions were correct.


In [68]:
algorithm = RecommenderTopEventWithProbability(events_reader.users, events_reader.event_types, train_events, FORGET_TIME_DAYS)

HBox(children=(IntProgress(value=0, max=3468843), HTML(value='')))




In [69]:
recommendations = Recommendations(algorithm, None, test_events)

In [70]:
user_to_recomendation = recommendations()

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))

KeyboardInterrupt: 

In [118]:
class MatrixFactorization(Recommender):  
    def __init__(self, users, event_types, events, forget_time_days):
        super(MatrixFactorization, self).__init__(users, event_types, events, forget_time_days)
        
        self.user_to_index = {}
        for i in range(len(self.users)):
            self.user_to_index[self.users[i]] = i
        
        #self.productivity_event_types = []
        #for event in self.event_types:
        #    group_id, event_id = event
            #if group_id == 'productivity':
        ##    self.productivity_event_types.append(event_id)
            
        self.event_to_index = {}
        for i in range(len(self.event_types)):
            #print(self.event_types[i])
            self.event_to_index[self.event_types[i]] = i
         
        data = []
        row_id = []
        col_id = []
        row_col = {}
        for event in events:
            (device_id, group_id, event_id, timestamp, threshold) = event
            #print(str(threshold - timestamp))
            if threshold - timestamp < self.forget_time_millis and\
            (self.user_to_index[device_id], self.event_to_index[(group_id, event_id)]) not in row_col.keys():
                data.append(1)
                row_id.append(self.user_to_index[device_id])
                col_id.append(self.event_to_index[(group_id, event_id)])
                row_col[(self.user_to_index[device_id], self.event_to_index[(group_id, event_id)])] = True
        
        self.matrix = csr_matrix((np.array(data), (np.array(row_id), np.array(col_id))), dtype=float, shape=(len(users), len(self.event_types)))
        #print(data)
        u, s, vt = svds(self.matrix, k=2)
        self.predicted_matrix = np.dot(u, vt)
        

    def recommend(self, user):
        user_ind = self.user_to_index[user]
        user_values = self.predicted_matrix[user_ind]
        user_old_values = self.matrix.toarray()[user_ind]
        diff_values = user_values - user_old_values
        for i in range(len(diff_values)):
            if user_old_values[i] > 0:
                diff_values[i] = 0
            if self.event_types[i][0] != PRODUCTIVITY:
                diff_values[i] = -1
            #print(str(self.event_types[i]) + " " + str(diff_values[i]))
        #print(diff_values)
        event_ind = np.argmax(diff_values)
        #print(diff_values[event_ind])
        #if diff_values[event_ind] < 0:
        #    print(diff_values[event_ind])
        #print(len(diff_values))
        #print(len(pro))
        #if diff_values[event_ind] < 0:
         #   print(diff_values[event_ind])
        #print(self.event_types[event_ind])
        return self.event_types[event_ind]
                
        
        

In [119]:
algorithm = MatrixFactorization(events_reader.users, events_reader.event_types, train_events, FORGET_TIME_DAYS)

In [120]:
recommendations = Recommendations(algorithm, None, test_events)

In [121]:
user_to_recomendation = recommendations()

HBox(children=(IntProgress(value=0, max=724), HTML(value='')))


0.10220994475138122 predictions were correct.


In [122]:
user_event = {}
for (device_id, group_id, event_id, timestamp, threshold) in train_events:
    user_event[(device_id, group_id, event_id)] = True

In [123]:
for user in user_to_recomendation.keys():
    group_id, event_id = user_to_recomendation[user]
    if (user, group_id, event_id) in user_event.keys():
        print((user, group_id, event_id))

('170419111546e84-014c-4dc5-8cf4-d6a139c2ebc2', 'productivity', 'editing.completion.basic')
('2803193f5aad39a-f56a-444f-a764-f260798994b7', 'productivity', 'navigation.goto.declaration')
('0708191802abdf4-d1e3-4a70-9ee2-0c73e6213743', 'productivity', 'editing.select.word')
('1308191e68abb69-2d9a-4b23-a9da-db8e381b9a22', 'productivity', 'SearchEverywhere')
('30071914b3ba8c3-4ae1-42a9-9893-c077ad4d9210', 'productivity', 'editing.completion.basic')
('070819171a61cfe-2ab5-4bb4-a2f5-f4377ceb323f', 'productivity', 'editing.completion.basic')
('0908191169f45a9-a7e3-480d-9803-81eb966fddad', 'productivity', 'editing.completion.basic')
('0907191cf2cbe7b-e315-453f-a376-9469fe5f5644', 'productivity', 'editing.completion.replace')
('28031915628df27-6609-4bf7-9426-2dd7c2301699', 'productivity', 'SearchEverywhere')
('06041912367056b-45ce-478e-8bab-13e653d5bad4', 'productivity', 'SearchEverywhere')
('290519274cbcbc2-ab5a-45de-9a94-acd7996a50a1', 'productivity', 'navigation.goto.declaration')
('0407191