In [1]:
# load package
# string 
import re

# math
import pandas as pd
import numpy as np
import scipy as sp
import random

# sys
import sys
import os
import time
import warnings

# date
from datetime import datetime, timedelta

# math
import math

# preprocessing
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, MinMaxScaler, QuantileTransformer, PowerTransformer

# machine learning
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, gaussian_process, discriminant_analysis
from xgboost import XGBClassifier

# model utils
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn import feature_selection 
from sklearn import model_selection
from sklearn import metrics

# apriori
from efficient_apriori import apriori

# plot
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix #??
# = show plots in Jupyter Notebook browser
%matplotlib inline 
mpl.style.use('ggplot') #??
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8 #??

# show all columns
from IPython.display import display
pd.options.display.max_columns = None

# memory manage
import gc

# logging
import logging 

# other
import tqdm as tqdm

# self define
sys.path.append('../../utils/')

  _nan_object_mask = _nan_object_array != _nan_object_array


In [2]:
# for network design
import torch
import argparse
import os

from scipy.stats import stats
from torch.utils.data import DataLoader
from torch.optim import lr_scheduler

In [3]:
logging.basicConfig(level = logging.INFO, format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.FileHandler('../log/extract_features.log')
handler.setLevel(logging.INFO)
formater = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formater)
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.info('This is a log info')

2018-11-27 09:18:34,354 - __main__ - INFO - This is a log info


In [4]:
# read files
lb_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2014.csv', sep = ';')
lb_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2015.csv', sep = ';')
lb_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2016.csv', sep = ';')
lb_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2017.csv', sep = ';')
lb_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/leonberg_werkstattposten_2018.csv', sep = ';')
std_wsp_2014 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2014.csv', sep = ';')
std_wsp_2015 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2015.csv', sep = ';')
std_wsp_2016 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2016.csv', sep = ';')
std_wsp_2017 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2017.csv', sep = ';')
std_wsp_2018 = pd.read_csv('../data/Autohaus_weeber/weil_der_stadt_werkstattposten_2018.csv', sep = ';')
# cat
d1 = lb_wsp_2014.copy()
d2 = lb_wsp_2015.copy()
d3 = lb_wsp_2016.copy()
d4 = lb_wsp_2017.copy()
d5 = lb_wsp_2018.copy()

d6 = std_wsp_2014.copy()
d7 = std_wsp_2015.copy()
d8 = std_wsp_2016.copy()
d9 = std_wsp_2017.copy()
d10 = std_wsp_2018.copy()

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# for training we use data from 14 - 17 in lb_wsp
train1 = pd.concat([d1,d2,d3,d4,d5], 0)
train2 = pd.concat([d6,d7,d8,d9,d10], 0)
train1['Autohaus'] = 'leonberg'
train2['Autohaus'] = 'weil'
#train = pd.concat([train1, train2], 0)
#train = train1.copy()

In [6]:
# because there exists reused Auftragsnummer in different Autohaus station. So we add some sign to the Auftragsnummer
# in each Autohaus station
train1['Auftragsnummer'] = 'A' + train1['Auftragsnummer']
train2['Auftragsnummer'] = 'B' + train2['Auftragsnummer']
train = pd.concat([train1, train2], 0)

In [7]:
# remove the outlier in the Teile-Nr
# remove the na wert
df = train[train['Teile-Nr'].isna().map(lambda x: not x)]
# remove the value short value
df = df[df['Teile-Nr'].map(lambda x: False if len(x) < 4 else True)]
# remove the value that doesn't contain number: 30593 of 593527 in train1(5%)
df = df[df['Teile-Nr'].map(lambda x: True if re.search('\d', x) else False)]
# remove the value that doesn't contrain adjoining number(min 2 number): 594 of 562934 in train1(0.1%)
df = df[df['Teile-Nr'].map(lambda x: True if re.search('\d\d', x) else False)]

In [8]:
# use findall instead of search, because, the df here should contain adjoining number, otherwise it's wrong
df['Gruppe-Nr'] = df['Teile-Nr'].map(lambda x: re.findall('\d\d', x)[0][0])

In [9]:
# clean the memory
del train1, train2, lb_wsp_2014, lb_wsp_2015, lb_wsp_2016, lb_wsp_2017, lb_wsp_2018
del std_wsp_2014, std_wsp_2015, std_wsp_2016, std_wsp_2017, std_wsp_2018
del d1, d2, d3, d4, d5, d6, d7, d8, d9, d10
gc.collect()

91

In [11]:
cf = df.copy()
# drop duplicate
cf = cf.drop_duplicates()
# find out the confused data. For the same Auftragsnummer, exists more than one value in the other attribute.
# here is AWSAU310019, BWSAU386471, BWSAU435051
cf = cf.drop(cf[cf['Auftragsnummer'] == 'AWSAU310019'].index, axis= 0) # 8 items
cf = cf.drop(cf[cf['Auftragsnummer'] == 'BWSAU386471'].index, axis= 0) # 3 items
cf = cf.drop(cf[cf['Auftragsnummer'] == 'BWSAU435051'].index, axis= 0) # 2 items
cf = cf.drop(cf[cf['Auftragsnummer'] == 'BWSAU271939'].index, axis= 0) # ? items
# remove wierd auftragsnummer
cf = cf.drop(cf[cf['Auftragsnummer'] == '103K'].index) #
cf = cf.drop(cf[cf['Auftragsnummer'] == '77KW'].index) #
# remove na value in Fahrgestellnummer
cf = cf.drop(cf[cf['Fahrgestellnummer'].isna()].index, axis = 0) #
# remove the Fahrgestellnumer that only contain number, not tested!!!!
cf = cf.drop(cf[cf['Fahrgestellnummer'].map(lambda x: False if re.search('[a-zA-Z]', x) else True)].index, axis = 0)
# remove the items, which day of the Auftragsdatum lareger than 31, not tested!!!!
cf = cf.drop(cf[cf['Auftragsdatum'].map(lambda x: int(x[0:2])) > 31].index, axis = 0)
# remove the items, which length of the Auftragsdatum shorter than 10, not tested!!!!
cf = cf.drop(cf[cf['Auftragsdatum'].map(lambda x:True if len(x) < 10 else False)].index, axis = 0)
# remove the items, which year of the Auftragsdatum smaller than 2013, not tested!!!!
cf = cf.drop(cf[cf['Auftragsdatum'].map(lambda x: int(x[6:]) < 2013)].index, axis = 0)
df = cf
del cf
gc.collect()

81

In [12]:
gn = df.copy()

In [13]:
gn = toAuftragTable(gn, 'Gruppe-Nr', 'Auftragsnummer') # number: 245018

Falls Null date exist, drop these dates directly


In [44]:
af = df.copy()

In [45]:
af = af[['Auftragsnummer', 'Fahrgestellnummer', 'Auftragsdatum']]
af = af.drop_duplicates()

In [46]:
agn = pd.merge(gn, af, how = 'left', on = 'Auftragsnummer')

In [47]:
agn['Auftragsdatum'] = pd.to_datetime(agn['Auftragsdatum'])

In [48]:
# extract feature: count
fc = agn[['Fahrgestellnummer']]
fc['count'] = 1
fc = fc.groupby('Fahrgestellnummer', as_index = False).sum()
agn = pd.merge(agn, fc, how = 'left', on = 'Fahrgestellnummer')
del fc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [131]:
# extract feature: time distance
ft = agn[['Fahrgestellnummer', 'Auftragsdatum']]
ft_max = ft.groupby('Fahrgestellnummer', as_index = False).max()
ft_min = ft.groupby('Fahrgestellnummer', as_index = False).min()
ft_com = pd.merge(ft_max, ft_min, how = 'inner', on = 'Fahrgestellnummer')
ft_com['period'] = ft_com['Auftragsdatum_x'] - ft_com['Auftragsdatum_y']
#ft = pd.merge(ft, ft_com[['Fahrgestellnummer', 'period']], on = 'Fahrgestellnummer', how = 'inner')
agn = pd.merge(agn, ft_com[['Fahrgestellnummer', 'period']], on = 'Fahrgestellnummer', how = 'inner')
agn['period'] = agn['period'].map(lambda x: x.days/120) # unit in 4 months
del ft, ft_max, ft_min, ft_com

KeyError: "['Auftragsdatum'] not in index"

In [65]:
# extract feature: frequent
def get_fre(x):
    #if x['period'] == 0 and x['period'] != 1:
    #    return 1
    if x['period'] == 0:
        return 0
    return x['count']*1.0/x['period']
agn['frequent'] = agn.apply(get_fre, axis = 1)

In [130]:
# combine Gruppe-Nr with the same Fahrgestellnummer and save the result as type of string
out = []
for name, group in agn[['Gruppe-Nr', 'Fahrgestellnummer']].groupby('Fahrgestellnummer', as_index = False):
    out.append(pd.DataFrame({'Fahrgestellnummer': [group.iloc[0, group.columns.get_loc('Fahrgestellnummer')]],
                            'Gruppe-Nr_concat': [';'.join(group['Gruppe-Nr'].tolist())]}))
out = pd.concat(out, axis = 0)
out = out.reset_index().iloc[:,1:]
agn = out
del out
gc.collect

KeyError: "['Gruppe-Nr'] not in index"

In [132]:
for nr in range(10):
    agn['count_group_'+str(nr)] = agn['Gruppe-Nr_concat'].map(lambda x: x.count(str(nr)))

In [133]:
# count the number of columns with value 0
def get_coverage(x):
    counter = 0
    for i in range(10):
        if x['count_group_'+str(i)] == 0:
            counter += 1
    return 10 - counter
agn['coverage'] = agn.apply(get_coverage, axis = 1)

In [134]:
# get deep
def get_deep(x):
    deepest = 0
    for i in range(10):
        if x['count_group_'+str(i)] > deepest:
            deepest = x['count_group_'+str(i)]
    return deepest
agn['deep'] = agn.apply(get_deep, axis = 1)

In [135]:
# get total count of teile
agn['count'] = agn['Gruppe-Nr_concat'].map(lambda x: len(x.split(';')))

In [137]:
# get proportion of group
for i in range(10):
    agn['proportion_group_'+str(i)] = agn['count_group_'+str(i)]/agn['count']

In [240]:
def get_rank(x):
    df_proportions = x[['proportion_group_0', 'proportion_group_1', 'proportion_group_2', 'proportion_group_3', 
                     'proportion_group_4','proportion_group_5', 'proportion_group_6', 'proportion_group_7', 
                     'proportion_group_8', 'proportion_group_9']].copy()
    sorted_df = df_proportions.sort_values(by = df_proportions.index[0], axis = 1)
    counter = 0
    value = 0
    clock = 0
    for index, i in enumerate(sorted_df.columns.tolist()):
        if sorted_df.iloc[0, index] > value:
            value = sorted_df.iloc[0, index]
            counter = clock
        x['rank_group_'+i[-1]] = counter
        clock += 1
    return x
out = []
for name, group in agn[:20].groupby('Fahrgestellnummer', as_index = False):
    out.append(get_rank(group))

In [241]:
pd.concat(out, axis = 0)

Unnamed: 0,proportion_group_0,proportion_group_1,proportion_group_2,proportion_group_3,proportion_group_4,proportion_group_5,proportion_group_6,proportion_group_7,proportion_group_8,proportion_group_9,rank_group_0,rank_group_1,rank_group_2,rank_group_3,rank_group_4,rank_group_5,rank_group_6,rank_group_7,rank_group_8,rank_group_9
0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,8,0,0,0,0,0,0,0,0
1,0.2,0.2,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.2,5,5,0,0,5,5,0,0,0,5
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,9,0,0,0,0,0,0
3,0.285714,0.285714,0.142857,0.095238,0.0,0.047619,0.0,0.047619,0.047619,0.047619,8,8,7,6,0,2,0,2,2,2
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,0,9
5,0.153846,0.230769,0.230769,0.0,0.0,0.153846,0.0,0.0,0.0,0.230769,5,7,7,0,0,5,0,0,0,7
6,0.166667,0.333333,0.166667,0.0,0.0,0.083333,0.0,0.0,0.0,0.25,6,9,6,0,0,5,0,0,0,8
7,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0,0,8,0,0,0,0,0,0,8
8,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,6,6,0,0,0,6,0,0,0,6
9,0.285714,0.285714,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.285714,7,7,0,0,0,6,0,0,0,7


In [191]:
a = pd.DataFrame({'A': [1,2], 'B':[2,2], 'C': [1,1]})

In [223]:
a = pd.DataFrame()

In [227]:
a

Unnamed: 0,D


In [10]:
# 转化为auftrag table， 但是这次合并的是Teile-Nr项
# 给的数据的每一行都是一个维修项，初衷是，把属于同一个auftrag的维修项合并到一起，看一下，在同一个Auftrag中，经常一起修的是那些内容

def toAuftragTable(df, att, auftn, clean = True):
    """
    input:
        df, DataFrame:
            the dataframe
        att, string:
            the column name of the target attribute
        auftn, string:
            the column name of the aftragsnummer attribute
        clean:
            when true, drop the null item in auftn attribute.
    output:
        df_g, DataFrame:
            dataframe contrains two columns auftn and att
            type of item in att is string, separate with ';'
    """
    # assert: make sure the type of the attributes inputted
    
    # extract the att and date columns
    df = df[[att, auftn]]
    # set type to object
    #df[att] = df[att].astype('object')
    #df[auftn] = df[auftn].astype('object')
    # if clean is True, drop the fake data, like the null data
    if clean:
        print("Falls Null date exist, drop these dates directly")
        #df = df.drop(df[df[att].isnull()].index)
        df = df.drop(df[df[auftn].isnull()].index)
    # group and sum 
    df_g = df.groupby([auftn], as_index = False).apply(agg)
    return df_g

# apply 只能对单行进行处理，而不是对整个分组进行处理，所以估计应该把axis换成1，比较好
def agg(x):
    # 是否用‘ ’分隔会比较好，这样就不用对初始的属性，
    # x 在这里是dataframe？？？
    #x = [str(i) for i in x]
    x = x.apply(lambda x: ';'.join(set([str(i) for i in x])), axis = 0)
    #x = x.apply(lambda x: ' '.join(set(x)), axis = 0)
    #print(x.columns.values)
    return x