In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics

from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

In [2]:
# wrangle the given data file and write the result into a new file
def wrangle(file_name = None):
    
    old_path = file_name + '.csv'
    new_path = file_name + '_wrangled.csv'

    old_data = pd.read_csv(old_path, encoding = 'gbk')

    # fetch the total consumption frequency of each customer
    frequency = old_data['USER_ID'].value_counts()
    frequency = frequency.reset_index()
    frequency.columns = ['USER_ID', 'frequency']

    # fetch the total consumption of each customer
    consumption = old_data[['number_consumers', "expenditure"]].groupby(old_data['USER_ID']).sum()
    consumption = consumption.reset_index()
    consumption.columns = ['USER_ID', 'total_number', 'total_expenditure']

    # merge this two tables above
    new_data = pd.merge(frequency, consumption, left_on = 'USER_ID', right_on = 'USER_ID', how = 'left')

    # fetch the label of each customer
    labels = old_data.iloc[:, :4]
    labels = labels.groupby(['USER_ID']).last()
    labels = labels.reset_index()

    # merge labels into new data
    new_data = pd.merge(new_data, labels, left_on = 'USER_ID', right_on = 'USER_ID', how = 'left')

    # drop the records which contain NaN or hold 0 value for 'total_number'
    new_data = new_data.dropna(axis = 0)
    new_data = new_data[new_data['total_number'] != 0]

    # calculate the per capita consumption of each customer
    new_data['aver_consumption'] = new_data['total_expenditure'] / new_data['total_number']
    new_data['aver_consumption'] = new_data['aver_consumption'].apply(lambda x: '%.2f' % x)

    # fetch the last consumption date by the end of observation period 
    new_data['last_visit'] = pd.to_datetime(new_data['LAST_VISITS'])
    end_date = pd.to_datetime('2016-7-31')
    gaps = end_date - new_data['last_visit']
    new_data['last_visit'] = gaps.apply(lambda x: x.days)

    # write the wrangled data into a new file
    new_data = new_data.loc[:, ['USER_ID', 'frequency', 'total_expenditure', 'aver_consumption', 'last_visit', 'type']]
    new_data.to_csv(new_path, index = False, encoding = 'gbk')

In [3]:
# wrangle the train data and the test data
wrangle('train')
wrangle('test')

In [4]:
# read data from the given file
def read_data(path = None):

    df = pd.read_csv(path, encoding = 'gbk')
    # drop the USER_ID column
    df = df.drop(['USER_ID'], axis = 1)
    # convert labels into digits
    df = df.replace('非流失', 0)
    df = df.replace('准流失', 1)
    # fetch data set and feature names
    dataset = df.values
    feature_name = df.columns.values

    return dataset, feature_name

In [5]:
# read train data
train_dataset, train_features = read_data(r"train_wrangled.csv")
train_data = pd.DataFrame(train_dataset, columns = train_features)

print(train_data)

      frequency  total_expenditure  aver_consumption  last_visit  type
0          37.0            33570.0            145.32         3.0   0.0
1          34.0            31903.0            142.42         4.0   0.0
2          33.0            30400.0            152.76         8.0   0.0
3          33.0            30849.0            155.80         7.0   0.0
4          32.0            28695.0            145.66         1.0   0.0
...         ...                ...               ...         ...   ...
1473        1.0             1432.0            159.11       106.0   1.0
1474        1.0              440.0            220.00        65.0   1.0
1475        1.0             1568.0            156.80        57.0   1.0
1476        1.0              785.0            112.14        49.0   1.0
1477        1.0             1012.0            126.50        98.0   1.0

[1478 rows x 5 columns]


In [6]:
# read test data
test_dataset, test_features = read_data(r"test_wrangled.csv")
test_data = pd.DataFrame(test_dataset, columns = test_features)

print(test_data)

     frequency  total_expenditure  aver_consumption  last_visit  type
0         41.0            34784.0            146.77         0.0   0.0
1         33.0            32699.0            157.97         2.0   0.0
2         33.0            30394.0            146.12         3.0   0.0
3         32.0            27088.0            141.08         5.0   0.0
4         25.0            18910.0            163.02         5.0   0.0
..         ...                ...               ...         ...   ...
429        1.0              358.0            119.33        20.0   1.0
430        1.0             1433.0            159.22        49.0   1.0
431        1.0             1259.0            179.86        42.0   1.0
432        1.0             1602.0            160.20        41.0   1.0
433        1.0              469.0             78.17       106.0   1.0

[434 rows x 5 columns]


In [7]:
# devide the data set into train set and test set
X_train = np.array(train_data.iloc[:, :-1])
Y_train = np.array(train_data.iloc[:, -1])

X_test = np.array(test_data.iloc[:, :-1])
Y_test = np.array(test_data.iloc[:, -1])

In [8]:
# try different kernel function to find the best one
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    SVM_classifier = SVC(C = 1.0, kernel = kernel, decision_function_shape = 'ovo')
    SVM_classifier.fit(X_train, Y_train)

    print("kernel function: ", kernel)
    print("score:", SVM_classifier.score(X_test, Y_test))

kernel function:  linear
score: 0.9285714285714286
kernel function:  poly
score: 0.8870967741935484
kernel function:  rbf
score: 0.8986175115207373
kernel function:  sigmoid
score: 0.7396313364055299


In [9]:
# research the effect that standardization takes at the model
NX_train = StandardScaler().fit_transform(X_train)
NX_test = StandardScaler().fit_transform(X_test)

for kernel in kernels:
    SVM_classifier = SVC(C = 1.0, kernel = kernel, decision_function_shape = 'ovo')
    SVM_classifier.fit(NX_train, Y_train)

    print("kernel function: ", kernel)
    print("score:", SVM_classifier.score(NX_test, Y_test))

kernel function:  linear
score: 0.9285714285714286
kernel function:  poly
score: 0.8778801843317973
kernel function:  rbf
score: 0.9193548387096774
kernel function:  sigmoid
score: 0.7764976958525346


In [10]:
# use grid search to find the best parameter for poly function
SVM_classifier = SVC(kernel = 'poly', decision_function_shape = 'ovo')
param_grid = {'C':[1.0], 'degree':np.arange(2, 6), 'coef0':np.arange(0, 10)}

algo = GridSearchCV(estimator = SVM_classifier, param_grid = param_grid, cv = 10)
algo.fit(X_train, Y_train)

print("score on train set: ", algo.score(X_train, Y_train))
print("score on test set:", algo.score(X_test, Y_test))
print("best parameter conbination: ", algo.best_params_)

score on train set:  0.9546684709066305
score on test set: 0.9147465437788018
best parameter conbination:  {'C': 1.0, 'coef0': 9, 'degree': 4}
