In [1]:
# load package
# string 
import re

# math
import pandas as pd
import numpy as np
import scipy as sp
import random

# sys
import sys
import os
import time
import warnings

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, gaussian_process, discriminant_analysis
from xgboost import XGBClassifier

# model utils
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection 
from sklearn import model_selection
from sklearn import metrics

# plot
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix #??
# = show plots in Jupyter Notebook browser
%matplotlib inline 
mpl.style.use('ggplot') #??
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8 #??

# show all columns
from IPython.display import display
pd.options.display.max_columns = None

# self define
sys.path.append('../../utils/')

In [2]:
# read files
lb_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2014.csv', sep = ';')
lb_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2015.csv', sep = ';')
lb_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2016.csv', sep = ';')
lb_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2017.csv', sep = ';')
lb_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2018.csv', sep = ';')
std_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2014.csv', sep = ';')
std_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2015.csv', sep = ';')
std_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2016.csv', sep = ';')
std_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2017.csv', sep = ';')
std_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2018.csv', sep = ';')
# cat
d1 = lb_wsp_2014.copy()
d2 = lb_wsp_2015.copy()
d3 = lb_wsp_2016.copy()
d4 = lb_wsp_2017.copy()
d5 = lb_wsp_2018.copy()
data1 = pd.concat([d1,d2,d3,d4], 0)
d6 = std_wsp_2014.copy()
d7 = std_wsp_2015.copy()
d8 = std_wsp_2016.copy()
d9 = std_wsp_2017.copy()
d10 = std_wsp_2018.copy()
data2 = pd.concat([d6,d7,d8,d9,d10], 0)
data = pd.concat([data1, data2], 0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
df = lb_wsp_2014[['Auftragsnummer', 'Beschreibung', 'Fahrgestellnummer']]

In [15]:
# bs operate and transfer
# operate
from pattern.de import parse, conjugate, singularize, pluralize

def bs_operate(li):
    """
    input string:
        'describe' attribute 
    """
    # to string
    li = str(li).lower()
    # recover: problem with '.' in string
    #dict_re = {'ae': 'ä', 'oe': 'ö', 'ue': 'ü', 'ß': 'ss', 'u.': 'und ', 'fzg': 'fahrzeug', ' f ': ' für '}
    dict_re = {'ae': 'ä', 'oe': 'ö', 'ue': 'ü', 'ß': 'ss', 'fzg': 'fahrzeug', ' f ': ' '}
    for i, j in dict_re.items():
        if i in li:
            li = re.sub(i, j, li)
    #print(li,0)
    # replace punctuation with ' '
    li = re.sub("[\s+\.\!_,$%^*(+\"\')-:]", " ", li)
    #print(li,1)
    # remove the number in the string
    li = re.sub('\d', '', li)
    #print(li,2)
    # remove word with one ziffer
    li = re.sub(' [a-zA-Z] ', "", li)
    #print(li,3)
    # plural to odd
    words = li.strip().split(' ')
    #words_sin = [pluralize(word.strip()) if len(word) > 0 and parse(word).split('/')[1] == 'NN' and parse(word).split('/')[1] != 'IN' else word.strip() for word in words]
    words_sin = [pluralize(word.strip()) if len(word) > 0 and parse(word).split('/')[1] != 'IN' else word.strip() for word in words]
    li = ' '.join(words_sin)
    #print(li,4)
    # upper
    li = li.upper()
    #print(li,5)
    return li

# 给的数据的每一行都是一个维修项，初衷是，把属于同一个auftrag的维修项合并到一起，看一下，在同一个Auftrag中，经常一起修的是那些内容

def toAuftragTable(df, att, auftn, clean = True):
    """
    input:
        df, DataFrame:
            the dataframe
        att, string:
            the column name of the target attribute
        auftn, string:
            the column name of the aftragsnummer attribute
    output:
        df_g, DataFrame:
            dataframe contrains two columns auftn and att
            type of item in att is string, separate with ';'
    """
    # assert: make sure the type of the attributes inputted
    
    # extract the att and date columns
    df = df[[att, auftn]]
    # if clean is True, drop the fake data, like the null data
    if clean:
        print("Null date exist, drop these dates directly")
        df = df.drop(df[df[att].isnull()].index)
        df = df.drop(df[df[auftn].isnull()].index)
    # group and sum 
    df_g = df.groupby([auftn], as_index = False).apply(agg)
    return df_g

# apply 只能对单行进行处理，而不是对整个分组进行处理，所以估计应该把axis换成1，比较好
def agg(x):
    # 是否用‘ ’分隔会比较好，这样就不用对初始的属性，
    #x = x.apply(lambda x: ';'.join(set(x)), axis = 0)
    x = x.apply(lambda x: ' '.join(set(x)), axis = 0)
    #print(x.columns.values)
    return x

In [10]:
df['Beschreibung2'] = df['Beschreibung'].map(bs_operate)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [82]:
df_new = toAuftragTable(df, 'Beschreibung2', 'Auftragsnummer', clean = True)

Null date exist, drop these dates directly


In [83]:
# merge the data to df_new
df_ori= lb_wsp_2014[['Auftragsnummer', 'KM-Stand', 'Auftragsdatum', 'Markencode', 'Adressanredecode', 'Fahrgestellnummer', 'Motorcode', 'Fahrzeugmodellnummer', 'Modell', 'Typ', 'Getriebecode']].drop_duplicates()

In [84]:
tmp = df_new.copy()
tmp = pd.merge(tmp, df_ori, how = 'left', on = 'Auftragsnummer')

In [85]:
tmp = tmp.sort_values(by = 'Auftragsnummer', ascending=False)

In [86]:
df_new = tmp.reset_index().drop('index', axis = 1)

In [129]:
df_new[df_new['Beschreibung2'].map(lambda x: ('SCHRAUB' in str(x)) and ('LACK' in str(x)))].count()

Beschreibung2           308
Auftragsnummer          308
KM-Stand                308
Auftragsdatum           308
Markencode              307
Adressanredecode        305
Fahrgestellnummer       308
Motorcode               301
Fahrzeugmodellnummer    307
Modell                  306
Typ                     206
Getriebecode            299
dtype: int64

In [131]:
df_new[df_new['Beschreibung2'].map(lambda x: ('LACK' in str(x)))].count()

Beschreibung2           1435
Auftragsnummer          1435
KM-Stand                1435
Auftragsdatum           1435
Markencode              1425
Adressanredecode        1403
Fahrgestellnummer       1431
Motorcode               1403
Fahrzeugmodellnummer    1425
Modell                  1422
Typ                      912
Getriebecode            1391
dtype: int64

In [132]:
305/1435

0.21254355400696864

- 可以看到有些车（Fahrgestellnummer）重复出现了13次。 这些重复出现的数据，可以计算他们的频率，从而计算出他们下次出现的时候
- 另外一种方法是，通过计算前后两次出现的时候KM-stand的改变，从而通过km_stand的状况来提出，维修请求

In [96]:
df_new[['Auftragsnummer', 'Beschreibung2']].head()

Unnamed: 0,Auftragsnummer,Beschreibung2
0,WSAU330854,SCHALTHEBEL AM GETRIEBE AUS EINGEBAUTE GETRIEB...
1,WSAU256082,BELEUCHTUNGSE CHECKE DURCHGEFÜHRTEN RÄDER AUS ...
2,WSAU236371,SELBSTBETEILIGUNGEN AUF DIEN ENTSCHÄDIGUNGSSUMMEN
3,WSAU229930,STANDGELDKOSTEN VOM
4,WSAU229914,PROVISIONEN LREN KENNZEICHEN


In [97]:
df_new.describe()

Unnamed: 0,Beschreibung2,Auftragsnummer,KM-Stand,Auftragsdatum,Markencode,Adressanredecode,Fahrgestellnummer,Motorcode,Fahrzeugmodellnummer,Modell,Typ,Getriebecode
count,21631,21631,21628,21631,21195,21121,21241,20900,21193,21149,14410,20830
unique,15115,21630,15489,317,19,12,10194,579,2676,2824,54,1396
top,ZULASSUNGEN ABMELDUNGEN GWE LREN GEBÜHRE,77KW,1500,11.11.2014,VW,Firma,TMBJJ7NE6D0034366,CFFB,36535Y,Passat Variant Comfortline BM,X0A,LHW
freq,279,2,909,158,11119,9024,13,1388,309,337,12768,313


In [126]:
tmp = lb_wsp_2014[['Auftragsdatum', 'Fahrgestellnummer', 'KM-Stand', 'Beschreibung']].drop_duplicates()

In [127]:
tmp.sort_values(by = 'Auftragsdatum')

Unnamed: 0,Auftragsdatum,Fahrgestellnummer,KM-Stand,Beschreibung
14067,0,,,000
235649,01.01.1753,WV1ZZZ7HZ8H024400,000,SCHALTHEBE
235642,01.01.1753,WV1ZZZ7HZ8H024400,000,BELEUCHTUNGS-CHECK .
235643,01.01.1753,WV1ZZZ7HZ8H024400,000,FLÜSSIGKEITSSTÄNDE GEPRÜFT
235644,01.01.1753,WV1ZZZ7HZ8H024400,000,SCHALTHEBEL AM GETRIEBE AUS- U.EINGEBAUT
235650,01.01.1753,WV1ZZZ7HZ8H024400,000,SCHALTUNG
235646,01.01.1753,WV1ZZZ7HZ8H024400,000,GETRIEBETRÄGER AUS- U.EINGEBAUT
235647,01.01.1753,WV1ZZZ7HZ8H024400,000,SCHRAUBE
235648,01.01.1753,WV1ZZZ7HZ8H024400,000,BUNDMUTTER
235645,01.01.1753,WV1ZZZ7HZ8H024400,000,SCHALTWELLE AUS- U.EINGEBAUT


In [105]:
tmp.describe()

Unnamed: 0,Auftragsdatum,Fahrgestellnummer,KM-Stand
count,19806,19610,19805
unique,317,10194,15489
top,11.11.2014,TMBJJ7NE6D0034366,1500
freq,147,13,898


In [124]:
tmp1 = tmp[tmp['Fahrgestellnummer'] == 'TMBJJ7NE6D0034366']

In [125]:
tmp1

Unnamed: 0,Auftragsdatum,Fahrgestellnummer,KM-Stand,count
25423,10.02.2014,TMBJJ7NE6D0034366,1852000,1
45562,13.03.2014,TMBJJ7NE6D0034366,2133200,1
62259,07.04.2014,TMBJJ7NE6D0034366,2535800,1
64253,09.04.2014,TMBJJ7NE6D0034366,2546700,1
88541,20.05.2014,TMBJJ7NE6D0034366,2604200,1
88585,20.05.2014,TMBJJ7NE6D0034366,2535800,1
97164,27.05.2014,TMBJJ7NE6D0034366,2535800,1
99066,02.06.2014,TMBJJ7NE6D0034366,2986500,1
127896,14.07.2014,TMBJJ7NE6D0034366,3243100,1
127925,21.07.2014,TMBJJ7NE6D0034366,2986500,1


In [108]:
def to_count_table(df, cols):
    df['count'] = '1'
    df = df[[cols[0], 'count']]
    return df.groupby(cols[0], as_index = False).count().sort_values(by = 'count', ascending = False)

In [109]:
to_count_table(tmp, ['Fahrgestellnummer'])

Unnamed: 0,Fahrgestellnummer,count
939,TMBJJ7NE6D0034366,13
2983,WAUZZZ8K6DA013386,10
8540,WVWZZZ3CZDE514391,10
1918,WAUZZZ4G1DN090225,10
9369,WVWZZZ7NZBV031839,10
9201,WVWZZZ6RZDY058325,10
8291,WVWZZZ3CZCE057210,10
1134,TMBKT61Z4C2187428,9
1883,WAUZZZ4G0DN045809,9
6147,WVGZZZ7PZBD037551,9
