# Implementation of a recommendation bot in MiWork
 - 2021-06-17, Hyoungjoon Lim @ jupyter notebook (python3, iOS)
 - Find tasks, jobs, and reports that are registered repeatedly on a regular cycle (2021 1H)

## 1. Environment setting
### 1-1. Package import

In [283]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from datetime import datetime

### 1-2. Function definition

In [260]:
def cos_sim(text1, text2):
    """Return the cosine similarity between two numeric vectors.

    Parameters
    ----------
    text1, text2 : array-like
        Numeric vectors of equal length (anything ``numpy.dot`` and
        ``numpy.linalg.norm`` accept).

    Returns
    -------
    float
        ``dot(text1, text2) / (norm(text1) * norm(text2))``, or ``0.0`` when
        either vector has zero norm.
    """
    denominator = norm(text1) * norm(text2)
    if denominator == 0:
        # A zero vector has no direction; dividing here would produce nan
        # together with a RuntimeWarning, so report "no similarity" instead.
        return 0.0
    return dot(text1, text2) / denominator

In [261]:
def matrix_to_ind(matrix, threshold=0.4):
    """Return the row indices that hold at least one score >= ``threshold``.

    Parameters
    ----------
    matrix : array-like
        Two-dimensional table of scores (e.g. a pairwise similarity matrix).
    threshold : float, optional
        Minimum score for a cell to count as a match. Defaults to 0.4, the
        cut-off this function has always used.

    Returns
    -------
    list of int
        Indices of the qualifying rows in ascending order; an empty list when
        no cell qualifies or ``matrix`` is empty.
    """
    scores = np.asarray(matrix)
    if scores.size == 0:
        # Nothing to scan; the previous implementation raised NameError here.
        return []

    # One vectorised pass instead of growing an array cell by cell.
    row_has_match = (scores >= threshold).any(axis=1)
    return np.flatnonzero(row_has_match).tolist()

In [293]:
def tfidf_to_related_txt(data, vectorizer=None):
    """Collect, per registrant, the texts that closely resemble another of their texts.

    Rows are grouped by the value in column position 9. Within each group the
    texts in column position 1 are TF-IDF vectorised and compared pairwise;
    rows selected by ``matrix_to_ind`` are kept, but only when more than two
    rows of the group are selected.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns at positions 1, 3 and 9 hold the text, the
        registration timestamp and the registrant id respectively.
    vectorizer : object, optional
        Object exposing ``fit_transform``; defaults to the module-level
        ``tfidf`` vectorizer.

    Returns
    -------
    pandas.DataFrame
        De-duplicated table with columns ``0`` (registrant id), ``1``
        (registration timestamp) and ``2`` (text), all stored as text.
        Groups appear in order of first appearance in ``data``. The frame is
        empty when no group qualifies.
    """
    if vectorizer is None:
        vectorizer = tfidf  # shared module-level vectorizer

    text_col = data.columns[1]
    date_col = data.columns[3]
    id_col = data.columns[9]

    records = []
    # groupby visits every row once instead of re-filtering the frame per id.
    for owner_id, group in data.groupby(id_col, sort=False):
        if len(group) < 3:
            # At most len(group) rows can be selected, so a group this small
            # can never pass the "more than two related rows" filter below.
            continue

        group = group.reset_index(drop=True)
        tfidf_mat = vectorizer.fit_transform(group[text_col])
        cosine_sim = linear_kernel(tfidf_mat, tfidf_mat)
        # Zero the diagonal so a text is never matched against itself.
        np.fill_diagonal(cosine_sim, 0)

        related_rows = matrix_to_ind(cosine_sim)
        if len(related_rows) <= 2:
            continue

        for row in related_rows:
            # int() tolerates float-valued positions from matrix_to_ind.
            entry = group.iloc[int(row)]
            # Values are stored as text, matching the all-string table the
            # earlier NumPy-based assembly produced.
            records.append([str(owner_id), str(entry[date_col]), str(entry[text_col])])

    # Explicit columns keep the 0/1/2 layout even when nothing qualified
    # (the previous implementation raised NameError in that case).
    result = pd.DataFrame(records, columns=range(3))
    return result.drop_duplicates()

## 2. Data preprocessing
### 2-1. Data upload

In [263]:
# Read the three CP949-encoded CSV exports from the local data directory.
working_dir = '/Users/hyoungjoonlim/Desktop/usingPython/data/'

# Task table: the '업무중요도' column is dropped right after loading.
task = pd.read_csv(
    working_dir + '2106업무_패턴찾기.csv',
    encoding='CP949',
).drop(columns=['업무중요도'])

# Job table: loaded as-is.
job = pd.read_csv(
    working_dir + '2106세부업무_패턴찾기.csv',
    encoding='CP949',
)

# Report table: only the twelve listed columns are kept, in this order.
report = pd.read_csv(
    working_dir + '2106간단보고_패턴찾기.csv',
    encoding='CP949',
)[[
    '액티비티번호',
    '액티비티내용',
    '액티비티등록년월',
    '액티비티등록일시',
    '액티비티등록일자',
    '액티비티등록자B레벨조직명',
    '액티비티등록자C레벨조직명',
    '액티비티등록자D레벨조직명',
    '액티비티등록자사원명',
    '액티비티등록자사원번호',
    '액티비티등록자직위명',
    '액티비티등록자직책명',
]]

### 2-2. Identify data structure

In [264]:
# Print the column names, non-null counts and dtypes of the task table.
task.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4846 entries, 0 to 4845
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   업무번호         4846 non-null   object
 1   업무명          4846 non-null   object
 2   업무등록년월       4846 non-null   int64 
 3   업무등록일시       4846 non-null   object
 4   업무등록일자       4846 non-null   object
 5   업무등록자B레벨조직명  4752 non-null   object
 6   업무등록자C레벨조직명  3260 non-null   object
 7   업무등록자D레벨조직명  4836 non-null   object
 8   업무등록자사원명     4836 non-null   object
 9   업무등록자사원번호    4846 non-null   int64 
 10  업무등록자직위명     4835 non-null   object
 11  업무등록자직책명     592 non-null    object
dtypes: int64(2), object(10)
memory usage: 454.4+ KB


In [265]:
# Print the column names, non-null counts and dtypes of the job table.
job.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21164 entries, 0 to 21163
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   세부업무번호         21164 non-null  int64 
 1   세부업무명          21164 non-null  object
 2   세부업무등록년월       21164 non-null  int64 
 3   세부업무등록일시       21164 non-null  object
 4   세부업무등록일자       21164 non-null  object
 5   세부업무등록자B레벨조직명  20028 non-null  object
 6   세부업무등록자C레벨조직명  16269 non-null  object
 7   세부업무등록자D레벨조직명  21127 non-null  object
 8   세부업무등록자사원명     21127 non-null  object
 9   세부업무등록자사원번호    21164 non-null  int64 
 10  세부업무등록자직위명     20898 non-null  object
 11  세부업무등록자직책명     2353 non-null   object
dtypes: int64(3), object(9)
memory usage: 1.9+ MB


In [266]:
# Print the column names, non-null counts and dtypes of the report table.
report.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5701 entries, 0 to 5700
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   액티비티번호         5701 non-null   int64 
 1   액티비티내용         5701 non-null   object
 2   액티비티등록년월       5701 non-null   int64 
 3   액티비티등록일시       5701 non-null   object
 4   액티비티등록일자       5701 non-null   object
 5   액티비티등록자B레벨조직명  5406 non-null   object
 6   액티비티등록자C레벨조직명  4579 non-null   object
 7   액티비티등록자D레벨조직명  5701 non-null   object
 8   액티비티등록자사원명     5701 non-null   object
 9   액티비티등록자사원번호    5701 non-null   int64 
 10  액티비티등록자직위명     5396 non-null   object
 11  액티비티등록자직책명     874 non-null    object
dtypes: int64(3), object(9)
memory usage: 534.6+ KB


In [267]:
# Count missing values in the text column of each table (task, job, report).
tuple(
    frame[column].isna().sum()
    for frame, column in (
        (task, '업무명'),
        (job, '세부업무명'),
        (report, '액티비티내용'),
    )
)

(0, 0, 0)

## 3. Evaluation of cosine similarity

In [268]:
# Punctuation marks and single-character Korean particles to exclude from the
# TF-IDF vocabulary.
# NOTE(review): every entry is a single character or punctuation. The default
# TfidfVectorizer token pattern appears to keep only tokens of two or more
# word characters (and lowercases text, so 'X' would not match either), which
# would make this list a no-op — confirm whether a custom token_pattern or
# tokenizer was intended.
stopwords = ['.', '(', ')', ',', "'", '%', '-', 'X', ').', '×','의','자','에','안','번',
                      '#','호','을','이','다','만','로','가','를']

# Shared vectorizer; tfidf_to_related_txt reads this module-level name.
tfidf = TfidfVectorizer(stop_words=stopwords)

### 3-1. TF-IDF construction & Extraction of related texts

In [294]:
# Run the related-text extraction on each of the three tables.
Result_task = tfidf_to_related_txt(task)

Result_job = tfidf_to_related_txt(job)

Result_report = tfidf_to_related_txt(report)

In [295]:
# Write each result table to a CP949-encoded CSV in the working directory,
# omitting the DataFrame index.
for frame, filename in (
    (Result_task, 'Result_task.csv'),
    (Result_job, 'Result_job.csv'),
    (Result_report, 'Result_report.csv'),
):
    frame.to_csv(filename, index=False, encoding='cp949')