In [1]:
# load package
# string 
import re

# date
from datetime import datetime

# math
import pandas as pd
import numpy as np
import scipy as sp
import random

# nlp
from pattern.de import parse, conjugate, singularize, pluralize

# sys
import sys
import os
import time
import warnings

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, gaussian_process, discriminant_analysis
from xgboost import XGBClassifier

# model utils
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection 
from sklearn import model_selection
from sklearn import metrics

# plot
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix #??
# = show plots in Jupyter Notebook browser
%matplotlib inline 
# Use matplotlib's built-in 'ggplot' style sheet for all figures.
mpl.style.use('ggplot')
# Then set seaborn's axes style to 'white'.
sns.set_style('white')
# Default figure size as (width, height).
pylab.rcParams['figure.figsize'] = 12,8

# Add '../../utils/' to the module search path so self-defined modules can be imported.
sys.path.append('../../utils/')

In [2]:
# Read the raw files: one ';'-separated CSV per site and year (2014-2018).
#   lb_wsp_<year>  <- leonberg_werkstattposten_<year>.csv
#   std_wsp_<year> <- weil_der_stadt_werkstattposten_<year>.csv
lb_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2014.csv', sep = ';')
lb_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2015.csv', sep = ';')
lb_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2016.csv', sep = ';')
lb_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2017.csv', sep = ';')
lb_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2018.csv', sep = ';')
std_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2014.csv', sep = ';')
std_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2015.csv', sep = ';')
std_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2016.csv', sep = ';')
std_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2017.csv', sep = ';')
std_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2018.csv', sep = ';')
# Take copies so that later processing does not modify the frames loaded above.
# d1-d5: Leonberg 2014-2018
d1 = lb_wsp_2014.copy()
d2 = lb_wsp_2015.copy()
d3 = lb_wsp_2016.copy()
d4 = lb_wsp_2017.copy()
d5 = lb_wsp_2018.copy()

# d6-d10: Weil der Stadt 2014-2018
d6 = std_wsp_2014.copy()
d7 = std_wsp_2015.copy()
d8 = std_wsp_2016.copy()
d9 = std_wsp_2017.copy()
d10 = std_wsp_2018.copy()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
# For training we use the lb_wsp data from 2014 - 2017 (d1-d4); d5 is used as the test set.
# `axis` is passed by keyword: newer pandas versions no longer accept it positionally.
# The original row labels are kept (no ignore_index), so index labels repeat across the
# concatenated frames - select rows with boolean masks rather than by index label.
train = pd.concat([d1, d2, d3, d4], axis = 0)
test = d5
data = [train, test]

In [4]:
# Convert the data to an "Auftrag" (order) table; this time the merged attribute is Teile-Nr.
# Every row of the given data is a single repair item. The idea is to merge the repair items
# that belong to the same Auftrag, to see which items are frequently repaired together
# within one Auftrag.
# TODO: the output of this function is a two-column dataframe, but usually we also want the
# other attributes of df, so consider adding an option that returns the whole df.
# In addition, the aggregation function (agg) could be passed in as a parameter with a default.

def toAuftragTable(df, att, auftn, clean = True):
    """
    Collapse item-level rows into one row per Auftragsnummer.

    input:
        df, DataFrame:
            the dataframe
        att, string:
            the column name of the target attribute
        auftn, string:
            the column name of the auftragsnummer attribute
        clean:
            when true, drop the rows whose auftn attribute is null.
    output:
        df_g, DataFrame:
            one row per distinct auftn value, built by applying `agg`
            to every group; the att values of a group are joined into
            a single string, separated with ';'
    """
    # keep only the target attribute and the order-number column
    df = df[[att, auftn]]
    # if clean is True, drop the rows without an order number
    if clean:
        print("Falls Null date exist, drop these dates directly")
        # Filter with a boolean mask instead of df.drop(<index labels>): when the
        # index contains repeated labels (e.g. after pd.concat without ignore_index),
        # dropping by label would also remove valid rows that merely share a label
        # with a null row.
        df = df[df[auftn].notnull()]
    # group by order number and aggregate every group with agg
    df_g = df.groupby([auftn], as_index = False).apply(agg)
    return df_g

# Aggregation helper for groupby(...).apply(agg).
# NOTE(review): the original remark here asked whether apply only sees single rows and
# whether axis should be 1. With axis = 0, DataFrame.apply calls the function once per
# column, which is what is needed to join all values of a column.
def agg(x):
    """
    Join the distinct values of every column of x into one string.

    input:
        x, DataFrame:
            the rows to merge (one group when called through groupby().apply)
    output:
        Series:
            indexed by the column names of x; every entry is the ';'-joined
            string of that column's distinct values (converted with str) in
            order of first appearance
    """
    # Open question from the original author: would ' ' be a better separator than ';'?
    # dict.fromkeys removes duplicates while keeping first-seen order, so the result is
    # reproducible (joining a set gives an order that can differ from run to run).
    return x.apply(lambda col: ';'.join(dict.fromkeys(str(i) for i in col)), axis = 0)

In [47]:
class FreDict:
    """
    Count the words of a text file and how often each of them occurs.

    Every word found in the file is recorded, so make sure the input file
    contains nothing but the target column.
    """
    # tags accepted by top_norm / top_verb
    _NOUN_TAGS = ('NN', 'NNP', 'NNS', 'NNPS')
    _VERB_TAGS = ('VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG')

    def __init__(self, path, header = True, sep = ';', clean = False, recover = False, singular = False):
        """
        input:
            path: string
                the path of the file
            header: boolean
                whether the file contains a header line (skipped if True)
            sep: string
                the separator between the items of a line
            clean: boolean
                if True, items of length <= 1 are not counted
            recover: boolean
                if True, apply the replacements in self.dict_re to every line
                ('ae' -> 'ä', 'oe' -> 'ö', 'ue' -> 'ü', 'ß' -> 'ss', plus some
                abbreviations)
            singular: boolean
                whether to turn the words into singular (and lower case) or not
        """
        # literal replacements applied to every line when recover is True
        self.dict_re = {'ae': 'ä', 'oe': 'ö', 'ue': 'ü', 'ß': 'ss', 'u.': 'und ', 'fzg': 'fahrzeug', ' f ': ' für '}
        self.words, self.dict_count = self._load_data(path, header, sep, clean, recover, singular) # list of Words
        self.ls_sorted = self._sort_dict(self.words) # list of list
        self.len = len(self.words)

    def get_word(self, w):
        """Return the Word stored for w, or the placeholder Word('x', 0, 'NAW') if w is unknown."""
        found = self.dict_count.get(w)
        if found is None:
            return Word('x', 0, 'NAW')
        return found

    def get_best(self, s):
        """
        input:
            s string:
                words separated by ' '
        output:
            out string:
                the word of s with the highest count; 'xxx' if no word of s
                has a count above 0
        """
        out = 'xxx'
        count = 0
        for word in s.split(' '):
            tmp = self.get_word(word.strip())
            if tmp.get_count() > count:
                out = tmp.get_word()
                count = tmp.get_count()
        return out

    def top(self, n = 10):
        """Return the n most frequent entries as [word, count, tag] lists; n == -1 returns all."""
        if n == -1:
            return self.ls_sorted
        return self.ls_sorted[:n]

    def top_norm(self, n = 10):
        """Return the n most frequent nouns (tags NN, NNP, NNS, NNPS); n == -1 returns all nouns."""
        return self._top_by_tags(self._NOUN_TAGS, n)

    def top_verb(self, n = 10):
        """Return the n most frequent verbs (tags VB, VBZ, VBP, VBD, VBN, VBG); n == -1 returns all verbs."""
        return self._top_by_tags(self._VERB_TAGS, n)

    def _top_by_tags(self, tags, n):
        # Walk the frequency-sorted list and collect entries whose tag is in `tags`.
        # Only matching entries count towards n, so the caller really gets n entries
        # when enough of them exist (previously every scanned entry was counted).
        out = []
        for entry in self.ls_sorted:
            if n != -1 and len(out) >= n:
                break
            if entry[2] in tags:
                out.append(entry)
        return out

    def _load_data(self, path, header, sep, clean, recover, singular):
        """
        return list of words and count dictionary
        """
        counts = self._count_items(path, header, sep, clean, recover)
        # transform to Word
        words = []
        for item, freq in counts.items():
            try:
                if singular:
                    item = singularize(item).lower()
            except Exception:
                # best effort: keep the item as it is, without a tag
                words.append(Word(item, freq, ''))
                continue
            words.append(Word(item, freq, self._tag(item)))
        # pack the dic (keyed by the original item, empty items are left out)
        dic = {item: Word(item, freq, self._tag(item)) for item, freq in counts.items() if len(item) > 0}
        return words, dic

    def _count_items(self, path, header, sep, clean, recover):
        # Read the file line by line and count how often every item occurs.
        counts = {}
        with open(path) as fi:
            for line_no, li in enumerate(fi):
                if header and line_no == 0:
                    # drop first line if header is true
                    continue
                # assume the line may be wrapped in double quotes;
                # startswith/endswith also cope with blank lines
                li = li.strip()
                if li.startswith('"'):
                    li = li[1:]
                if li.endswith('"'):
                    li = li[:-1]
                li = li.strip()
                if recover:
                    # The keys are literal text, not regular expressions: as a
                    # pattern, 'u.' would match 'u' followed by any character.
                    for old, new in self.dict_re.items():
                        li = li.replace(old, new)
                for item in li.split(sep):
                    item = item.strip()
                    if item in counts:
                        counts[item] += 1
                    elif not (clean and len(item) <= 1):
                        # with clean, items shorter than 2 characters are skipped
                        counts[item] = 1
        return counts

    @staticmethod
    def _tag(item):
        # Tag of the item taken from the parse() output, '' if it cannot be determined.
        try:
            return parse(item).split('/')[1]
        except Exception:
            return ''

    def _sort_dict(self, words):
        """Return the words as [word, count, tag] lists, sorted by descending count."""
        li_sorted = sorted(words, key = lambda x: x.get_count(), reverse = True)
        return [w.to_list() for w in li_sorted]
    
class Word:
    """
    A word together with its count and its tag.

    TODO: use @property to rewrite the class
    """
    def __init__(self, word = 'NN', count = 1, tag = 'NN'):
        self.word = word
        self.count = count
        self.tag = tag

    def to_list(self):
        """Return [word, count, tag]."""
        out = [self.word, self.count, self.tag]
        return out

    def __str__(self):
        # format() converts every part to text; concatenating with '+' raised a
        # TypeError because count is a number
        return "{} {} {}".format(self.word, self.count, self.tag)

    def get_count(self):
        return self.count

    def get_tag(self):
        return self.tag

    def get_word(self):
        return self.word

    def set_count(self, c):
        self.count = c

    def set_tag(self, t):
        self.tag = t

    def set_word(self, w):
        self.word = w

In [16]:
# Build the order table for the training data: rows are grouped by 'Auftragsnummer'
# and the 'Teile-Nr' values of every order are merged into one string.
df_train = toAuftragTable(train, 'Teile-Nr', 'Auftragsnummer')

Falls Null date exist, drop these dates directly


In [72]:
# The nan value has a special meaning here: it stands for a manual service.
df_train.head(20)

Unnamed: 0,Teile-Nr,Auftragsnummer
0,JLU;HDV,103K
1,JCR;GQQ,77KW
2,,WSAU000839
3,ZZ MET;LMW030000Z0;ZZ UBS;nan;D 330KD2A1,WSAU195981
4,W-AG-EM;N 10648301;8K0698451A;N 90813202;8K0...,WSAU198256
5,N 90813202;071115562C;5K6955427A;1Q1998002;na...,WSAU200129
6,N 0141565;SKN 0177254;1C0945511A RDW;N 9082...,WSAU200223
7,8E0012619;N 90813202;071115562C;03L903137A;G ...,WSAU200258
8,N 90813202;B 000DS;06A115561B;8E0819439;nan;...,WSAU200259
9,N 0209022;8P0881361G;8P0963555G;nan;W-AN-GAR,WSAU200260


In [7]:
# In these 'Vertrag' is the Teile-Nr empty
#nun_auftrag_list = df_train[df_train['Teile-Nr'] == 'nan']['Auftragsnummer'].tolist()
#num_bool = train['Auftragsnummer'].map(lambda x: True if x in nun_auftrag_list else False)

In [61]:
# Look at how often each individual Teile-Nr entry occurs.
# header = True is passed to to_csv explicitly: FreDict(header = True) always skips the
# first line of the file, so that line has to be a header row whatever the default of
# Series.to_csv is in the installed pandas version.
df_train['Teile-Nr'].to_csv('/tmp/tmp.csv', sep = ';', index = False, header = True)
dic = FreDict('/tmp/tmp.csv', header = True, sep = ';', clean = True, recover = True, singular = False)

In [62]:
# The 15 most frequent entries, each shown as a [word, count, tag] list.
dic.top(15)

[['nan', 72437, 'NN'],
 ['N  90813202', 11571, 'NN'],
 ['nan"', 7103, 'NN'],
 ['G  052164DS', 6566, 'NN'],
 ['B  000DS', 6385, 'NN'],
 ['WHT000729A', 4552, 'NNP'],
 ['03L115562', 4020, 'NN'],
 ['1K1819653B', 3969, 'NN'],
 ['W-AG-EM', 3926, 'NN'],
 ['N  10648301', 3771, 'NN'],
 ['OEL02', 3758, 'NN'],
 ['N  0138157', 3155, 'NN'],
 ['SKBV18', 3115, 'NN'],
 ['000010006"', 2837, 'CD'],
 ['OEL02"', 2764, 'NN']]

In [57]:
# Mask of the orders whose merged Teile-Nr string contains both part numbers
# (plain substring test), followed by the number of such orders.
conbime = df_train['Teile-Nr'].map(lambda parts: ('N  90813202' in parts) and ('G  052164DS' in parts))
df_train[conbime].count()

Teile-Nr          3546
Auftragsnummer    3546
dtype: int64

In [58]:
# Mask of the orders whose merged Teile-Nr string contains 'N  90813202', and their number.
x = df_train['Teile-Nr'].map(lambda parts: 'N  90813202' in parts)
df_train[x].count()

Teile-Nr          14755
Auftragsnummer    14755
dtype: int64

In [59]:
# Mask of the orders whose merged Teile-Nr string contains 'G  052164DS', and their number.
y = df_train['Teile-Nr'].map(lambda parts: 'G  052164DS' in parts)
df_train[y].count()

Teile-Nr          8239
Auftragsnummer    8239
dtype: int64

In [60]:
# Confidence of the rule y -> x:
#   P(y -> x) = P(x|y) = P(x, y) / P(y) = num_(x,y) / num_(y)
# i.e. the number of orders matching both masks divided by the number of orders matching y.
df_train[conbime].count().div(df_train[y].count())

Teile-Nr          0.430392
Auftragsnummer    0.430392
dtype: float64

#### 还存在的问题是：
- 上述做法有意义的前提条件是，如果在相同vertrag中存在某个关联关系的两个物体，那么在相邻Vertrag中，也存在类似的关系。
- 应该咨询一下，Teile-Nr能不能进行分层归类。
- 现在这些相关关系是在同一个vertrag中出现的，但是更关心的应该是在相邻两个vertrag之间的关联关系。

### 以上是用于zwischen的例子，正式的应该通过association rule来获得，这部分内容已经在code里面实现了，所以有空直接copy过来测试就行了