In [49]:
import pandas as pd

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import multiprocessing.popen_spawn_win32
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import preprocessing

In [2]:
#start a local dask cluster
cluster = LocalCluster(n_workers=4)
client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 12369 instead


0,1
Client  Scheduler: tcp://127.0.0.1:12372  Dashboard: http://127.0.0.1:12369/status,Cluster  Workers: 4  Cores: 12  Memory: 17.14 GB


In [14]:
#read data
sys_info = dd.read_csv('../data/system_sysinfo_unique_normalized.csv000.gz',
                       delimiter ="\1",
                       assume_missing=True)
sys_info.head()

Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  warn(


Unnamed: 0,load_ts,guid,chassistype,chassistype_2in1_category,countryname,countryname_normalized,modelvendor,modelvendor_normalized,model,model_normalized,...,cpu_suffix,screensize_category,persona,processor_line,vpro_enabled,firstreportdate,lastreportdate,discretegraphics,cpu_stepping,engagement_id
0,2020-10-01 00:48:15,10351643a4ae4b2d9cbc8a90db184050,Notebook,Unknown,Brazil,Brazil,Lenovo,Lenovo,Lenovo G480,Lenovo G480,...,Core-M,14x,Web User,M-Processor,N,2020-01-12 20:31:03,2020-09-29 18:30:56,N,Intel64 Family 6 Model 42 Stepping 7,iduu
1,2020-10-01 00:48:15,4f40b2f38cd74f6ea1559cebbc924619,Notebook,Unknown,Sweden,Sweden,Asus,Asus,T300CHI,T300CHI,...,Core-Y,12x,Web User,Unknown,N,2018-11-15 17:24:52,2020-09-28 12:49:29,N,Intel64 Family 6 Model 61 Stepping 4,iduu
2,2020-10-01 00:48:15,04b81eb4d661410ebff356f27fc544ed,Desktop,Unknown,Viet Nam,Viet Nam,System manufacturer,System manufacturer,System Product Name,System Product Name,...,Other,23x,Office/Productivity,Unknown,Y,2020-05-09 12:24:37,2020-09-29 08:54:35,Y,Intel64 Family 6 Model 158 Stepping 12,intel.com
3,2020-10-01 00:48:15,afd55a587fe14fc1904d572b3230dd54,Notebook,Unknown,Mexico,Mexico,HP,HP,HP ProBook 640 G2,HP ProBook 640 G2,...,Core-U,14x,Office/Productivity,U-Processor,N,2019-10-22 13:56:27,2020-09-28 08:41:47,N,Intel64 Family 6 Model 78 Stepping 3,intel.com
4,2020-10-01 00:48:15,4b3d6747c5f541949f4c1bd127b00bd9,Notebook,Unknown,Romania,Romania,Acer,Acer,Nitro AN515-52,Nitro AN515-52,...,Core-H,15x,Web User,H-Processor,N,2020-04-01 17:37:46,2020-04-23 19:34:16,Y,Intel64 Family 6 Model 158 Stepping 10,intel.com


# EDA & Cleaning

In [26]:
sys_info.persona.value_counts().compute()

Web User                 35153
Casual User              12810
Gamer                    10611
Casual Gamer             10498
Office/Productivity       7519
Content Creator/IT        5883
Communication             5836
Win Store App User        4968
Entertainment             3768
File & Network Sharer     2231
Unknown                    723
Name: persona, dtype: int64

In [39]:
df= sys_info[['persona','modelvendor_normalized','cpuvendor','ram','os']].compute()
df

Unnamed: 0,persona,modelvendor_normalized,cpuvendor,ram,os
0,Web User,Lenovo,Intel,12.0,Win10
1,Web User,Asus,Intel,8.0,Win10
2,Office/Productivity,System manufacturer,Intel,8.0,Win10
3,Office/Productivity,HP,Intel,8.0,Win10
4,Web User,Acer,Intel,16.0,Win10
...,...,...,...,...,...
99995,Web User,Gigabyte,Intel,8.0,Win10
99996,Web User,System manufacturer,Intel,2.0,Win10
99997,Web User,Lenovo,Intel,8.0,Win10
99998,Casual Gamer,Acer,Intel,16.0,Win10


In [40]:
df = df[df.persona!= 'Unknown']
df

Unnamed: 0,persona,modelvendor_normalized,cpuvendor,ram,os
0,Web User,Lenovo,Intel,12.0,Win10
1,Web User,Asus,Intel,8.0,Win10
2,Office/Productivity,System manufacturer,Intel,8.0,Win10
3,Office/Productivity,HP,Intel,8.0,Win10
4,Web User,Acer,Intel,16.0,Win10
...,...,...,...,...,...
99995,Web User,Gigabyte,Intel,8.0,Win10
99996,Web User,System manufacturer,Intel,2.0,Win10
99997,Web User,Lenovo,Intel,8.0,Win10
99998,Casual Gamer,Acer,Intel,16.0,Win10


In [41]:
df.modelvendor_normalized.value_counts()

HP                       15386
Dell                     14729
Lenovo                   11815
Asus                     10624
System manufacturer       6454
Acer                      6123
MSI                       5327
Gigabyte                  5122
Unknown                   3460
Intel                     3317
To be filled by O.E.M     2952
Other                     2131
Toshiba                   1655
Samsung                   1650
Microsoft Corporation      941
Fujitsu                    877
Sony                       865
Alienware                  764
LG                         546
Medion                     470
Notebook                   410
Apple                      399
Mouse Computer             332
Razer                      297
NEC                        295
Positivo                   245
Wortmann_AG                179
ECS                        179
Panasonic Corporation      155
Timi                       144
Gateway                    143
Biostar                    121
Packard 

In [42]:
df.cpuvendor.value_counts()

Intel    97882
AMD       1395
Name: cpuvendor, dtype: int64

In [43]:
df.ram.value_counts()

8.000000     40019
16.000000    27404
4.000000     14492
32.000000     5656
12.000000     4808
             ...  
7.003906         1
6.007812         1
32.007812        1
3.953125         1
4.000488         1
Name: ram, Length: 99, dtype: int64

In [44]:
df.os.value_counts()

Win10         97077
Win8.1         1979
Win8            144
Win Server       65
Name: os, dtype: int64

# Feature Engineering

In [45]:
df = pd.get_dummies(df, columns =['cpuvendor','modelvendor_normalized','os']).reset_index(drop=True)
df

Unnamed: 0,persona,ram,cpuvendor_AMD,cpuvendor_Intel,modelvendor_normalized_Acer,modelvendor_normalized_Alienware,modelvendor_normalized_Apple,modelvendor_normalized_Asus,modelvendor_normalized_Biostar,modelvendor_normalized_Casper Bilgisayar,...,modelvendor_normalized_To be filled by O.E.M,modelvendor_normalized_Toshiba,modelvendor_normalized_Unknown,modelvendor_normalized_VAIO Corporation,modelvendor_normalized_Wortmann_AG,modelvendor_normalized_innotek GmbH,os_Win Server,os_Win10,os_Win8,os_Win8.1
0,Web User,12.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,Web User,8.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,Office/Productivity,8.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Office/Productivity,8.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Web User,16.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99272,Web User,8.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99273,Web User,2.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99274,Web User,8.0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
99275,Casual Gamer,16.0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [53]:
le = preprocessing.LabelEncoder()

y = df['persona'].values
x = df[df.columns[1:]].values
y = le.fit_transform(y)

# KNN Classifier

In [54]:
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(x, y)
y_pred = neigh.predict(x)
accuracy_score(y, y_pred)

0.19041671283378828

# Decision Tree Classifier

In [55]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(x, y)
y_pred = clf.predict(x)
accuracy_score(y, y_pred)

0.3596804899422827