In [1]:
import pandas as pd

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import multiprocessing.popen_spawn_win32
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression


def data_cleaning (data_path):
    #start a local dask cluster
    cluster = LocalCluster(n_workers=4)
    client = Client(cluster)

    #read data
    sys_info = dd.read_csv(data_path,
                           delimiter ="\1",
                           assume_missing=True)
    print('read data successfully')

    #find used columns
    used_cols =['chassistype', 
                'chassistype_2in1_category',
                'countryname_normalized',
                'modelvendor_normalized', 
                'model_normalized', 
                'ram', 
                'os',
                '#ofcores', 
                'age_category', 
                'graphicsmanuf',
                'graphicscardclass', 
                'processornumber', 
                'cpuvendor', 
                'cpu_family', 
                'cpu_suffix',
                'screensize_category', 
                'persona',
                'processor_line', 
                'vpro_enabled',
                'discretegraphics']
    df = sys_info[used_cols]

    #cleaning
    df = df.dropna()
    df = df[df.persona!= 'Unknown'].reset_index(drop=True)
    df = df[df.processornumber!= 'Unknown'].reset_index(drop=True)
    df = df.compute()

    df['processornumber'] = df['processornumber'].apply(lambda x: x[:2] ).astype('int32',errors='raise')
    df['ram'] =df['ram'].astype('int32')
    df['#ofcores'] =df['#ofcores'].astype('int32',errors='raise')

    #define the columns with different type
    used_cols.remove('persona')
    int_cols = ['ram','#ofcores','processornumber']
    cat_cols = [i for i in used_cols if i not in int_cols]
    print('clean data successfully')

    #one hot encoding on cat_cols
    df = pd.get_dummies(df, columns =cat_cols).reset_index(drop=True)
    #get the x and y
    y = df['persona'].values
    temp = list(df.columns.values)
    temp.remove('persona')
    x = df[temp].values

    #apply label encoder on persona
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(y)

    #apply PCA on y
    pca = PCA(n_components=1000)
    x = pca.fit_transform(x,y)
    
    return x,y


In [2]:
def build_classifier(x,y, classifier_list, trail_num=5 ):
    
    knn_param = list(range(2,50,5))
    dt_param = list(range(5,100,10))
    rf_param = list(range(5,100,10))
    nn_param = list(range(300,1100,100))
    svm_param = [10**i for i in range(-3,4)]
    sgd_param = list(range(5,100,10))
    l_param = [10**i for i in range(-3,4)]
    

    print('Start to build the classifier and tune the parameter')

    result = pd.DataFrame(columns =['trail','classifier','parameter','train_acc','test_acc','train_f1','test_f1'])

    for t in range(trail_num):
        #train test split
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

        #KNN
        if 'knn' in classifier_list:
            print("Start KNN")
            #tune the parameter
            for i in knn_param:

                clf = KNeighborsClassifier(n_neighbors=i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='macro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='macro')

                temp = {'trail':t,
                        'classifier':'KNN',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)


        #DecisionTree
        if 'decision tree' in classifier_list:
            print("Start Decision Tree")
            for i in dt_param:
                clf = DecisionTreeClassifier(max_depth= i)
                clf.fit(x_train, y_train)

                 #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='micro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='micro')

                temp = {'trail':t,
                        'classifier':'Decision Tree',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)

        #Random Forest
        if 'random forest' in classifier_list:
            print("Start Random Forest")
            for i in rf_param:
                clf = RandomForestClassifier(max_depth =i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='macro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='macro')

                temp = {'trail':t,
                        'classifier':'Random Forest',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)


        #neural network
        if 'neural network' in classifier_list:
            print("Start Neural network")
            for i in nn_param:
                clf = MLPClassifier( max_iter=i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='micro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='micro')

                temp = {'trail':t,
                        'classifier':'Neural network',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)
                
                
        #SVM
        if 'svm' in classifier_list:
            print("Start SVM")
            for i in svm_param:
                clf = svm.SVC( C = i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='micro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='micro')

                temp = {'trail':t,
                        'classifier':'SVM',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)
                
                
        #SGD
        if 'sgd' in classifier_list:
            print("Start SGD")
            for i in sgd_param:
                clf = SGDClassifier( max_iter = i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='micro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='micro')

                temp = {'trail':t,
                        'classifier':'SGD',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)
                
        
        #logistic
        if 'logistic' in classifier_list:
            print("Start logistic")
            for i in l_param:
                clf = LogisticRegression( C = i)
                clf.fit(x_train, y_train)

                #get train score
                y_train_pred = clf.predict(x_train)
                train_sc = accuracy_score(y_train, y_train_pred)
                train_f1 = f1_score(y_train, y_train_pred,average='micro')

                #get test score
                y_test_pred = clf.predict(x_test)
                test_sc = accuracy_score(y_test, y_test_pred)
                test_f1 = f1_score(y_test, y_test_pred,average='micro')

                temp = {'trail':t,
                        'classifier':'logistic',
                        'parameter': i,
                        'train_acc':train_sc,
                        'test_acc':test_sc,
                        'train_f1':train_f1,
                        'test_f1':test_f1}
                print(temp)
                result = result.append(temp,ignore_index=True)

    print("All Done!")
    return result


In [None]:
file_path = '../data/system_sysinfo_unique_normalized.csv000.gz'
all_classifier_list = ['knn','random forest','decision tree','neural network','svm','sgd','logistic']


cl = ['svm','sgd','logistic']
x,y = data_cleaning(file_path)
r =  build_classifier(x,y,cl,trail_num =1)

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  warn(


read data successfully
clean data successfully
Start to build the classifier and tune the parameter
Start SVM


In [4]:
r.to_csv('../result/classifier_performance.csv',index=False)
r[['trail', 'classifier', 'parameter','train_f1','test_f1']]

Unnamed: 0,trail,classifier,parameter,train_f1,test_f1
0,0,KNN,2,0.541568,0.119518
1,0,KNN,7,0.296865,0.133593
2,0,KNN,12,0.221241,0.125909
3,0,KNN,17,0.183685,0.123441
4,0,KNN,22,0.161298,0.118938
...,...,...,...,...,...
185,4,Neural network,600,0.621034,0.256085
186,4,Neural network,700,0.621034,0.248744
187,4,Neural network,800,0.613301,0.258895
188,4,Neural network,900,0.622626,0.262091


In [7]:
# import dataframe_image as dfi
# dfi.export(result, 'classifier_result.png')

In [11]:
r = pd.read_csv('../result/classifier_performance.csv')
temp = r.groupby(['classifier', 'parameter']).mean().drop(columns=['trail'])
temp

Unnamed: 0_level_0,Unnamed: 1_level_0,train_acc,test_acc,train_f1,test_f1
classifier,parameter,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Decision Tree,5,0.369703,0.365741,0.369703,0.365741
Decision Tree,15,0.55832,0.301268,0.55832,0.301268
Decision Tree,25,0.817781,0.230255,0.817781,0.230255
Decision Tree,35,0.876295,0.213923,0.876295,0.213923
Decision Tree,45,0.879132,0.212272,0.879132,0.212272
Decision Tree,55,0.879153,0.212012,0.879153,0.212012
Decision Tree,65,0.879153,0.212378,0.879153,0.212378
Decision Tree,75,0.879153,0.211331,0.879153,0.211331
Decision Tree,85,0.879153,0.211809,0.879153,0.211809
Decision Tree,95,0.879153,0.211654,0.879153,0.211654


In [15]:
cond1 = r.classifier == 'KNN'
cond2 = r.parameter == 2

r[cond1&cond2][['trail', 'classifier', 'parameter','train_f1','test_f1']]


Unnamed: 0,trail,classifier,parameter,train_f1,test_f1
0,0,KNN,2,0.587921,0.171824
38,1,KNN,2,0.58889,0.173124
76,2,KNN,2,0.586364,0.1694
114,3,KNN,2,0.585412,0.171543
152,4,KNN,2,0.585205,0.172843
