In [19]:
# Clean the batt_info dataset, then merge it with the system_sysinfo dataset

import pandas as pd
import numpy as np

import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
import multiprocessing.popen_spawn_win32

#create DASK local cluster and client
cluster = LocalCluster(n_workers=4)
client = Client(cluster)

#read csv file
bat_info = dd.read_csv('../data/batt_info.csv000',
                       delimiter ="\1",
                       assume_missing=True)

#drop missing value
bat_info_clean = bat_info[bat_info.interval_start_utc <'2022']
bat_info_clean = bat_info_clean[(bat_info.interval_end_utc <'2022')& (bat_info.interval_end_utc >'2019')]
bat_info_clean = bat_info_clean[(bat_info_clean.dt <'2022')& (bat_info_clean.dt >'2019')]
bat_info_clean = bat_info_clean[bat_info_clean.battery_enum <10]

#Keep only one record for one guid
bat_info_clean = bat_info_clean.drop_duplicates(subset=['guid'])

#Normalize the chemistry
bat_info_clean['chemistry'] = bat_info_clean['chemistry'].map_partitions(lambda x: x.astype(str).str.lower())

def norm_chem(i):
    #find the value for lion
    
    s = i.chemistry
    if(s == 'lion'):
        return 'lion'
    elif(s == 'liio'):
        return 'lion'
    elif(s == 'li-i'):
        return 'lion'
    elif(s == 'li'):
        return 'lion'
    elif(s == 'lio'):
        return 'lion'
    
    #find the value for lip
    elif(s == 'lip'):
        return 'lip'
    elif(s == 'li p'):
        return 'lip'
    elif(s == 'lipo'):
        return 'lip'
    
    #find the value of pbac
    elif(s == 'pbac'):
        return 'pbac'
    
    #find the missing values
    elif(s == 'nan'):
        return np.nan
    elif(s == 'miss'):
        return np.nan
    else:
        return np.nan
    
bat_info_clean['chemistry'] = bat_info_clean.apply(norm_chem, axis=1, meta=(None, 'str'))

#drop missing value
bat_info_clean = bat_info_clean.dropna(subset=['chemistry','battery_count','designed_capacity'])
cond1 = bat_info_clean.designed_capacity != 0
cond2 = bat_info_clean.battery_count != 0

bat_info_clean = bat_info_clean[cond1& cond2]

#only keep useful features
bat_info_clean =bat_info_clean[['guid','chemistry','battery_count','designed_capacity']]
bat_info_clean = bat_info_clean.compute()

#read csv file
sys_info = dd.read_csv('../data/system_sysinfo_unique_normalized.csv000.gz',
                       delimiter ="\1",
                       assume_missing=True)
useful_columns = ['guid','chassistype_2in1_category','countryname_normalized','modelvendor_normalized','model_normalized',
                    'ram','os','#ofcores', 'age_category', 'graphicsmanuf', 'gfxcard',
                   'graphicscardclass', 'processornumber', 'cpuvendor', 'cpuname',
                   'cpucode', 'cpu_family', 'cpu_suffix', 'screensize_category', 'persona',
                   'processor_line', 'vpro_enabled','discretegraphics', 'cpu_stepping', 'engagement_id']

sys_info = sys_info[useful_columns]
merged = bat_info_clean.merge(sys_info.compute(), on=['guid'],how='inner')

merged.to_csv('../data/batt_info_and_system_info.csv',index = False)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 8451 instead
Please ensure that each individual file can fit in memory and
use the keyword ``blocksize=None to remove this message``
Setting ``blocksize=None``
  warn(


In [13]:
# Number of merged rows per distinct chassistype_2in1_category value.
merged['chassistype_2in1_category'].value_counts()

Unknown        29434
Convertible     2293
Detachable       904
Name: chassistype_2in1_category, dtype: int64

In [15]:
# Number of merged rows per distinct countryname_normalized value.
merged['countryname_normalized'].value_counts()

United States of America                                7226
Other                                                   2652
Germany                                                 2155
United Kingdom of Great Britain and Northern Ireland    1627
Brazil                                                  1491
Japan                                                   1404
Canada                                                  1053
India                                                   1004
China                                                    962
Korea, Republic of                                       937
Italy                                                    858
Netherlands                                              757
Turkey                                                   647
Mexico                                                   635
France                                                   600
Australia                                                577
Spain                   

In [16]:
# Number of merged rows per distinct modelvendor_normalized value.
merged['modelvendor_normalized'].value_counts()

HP                       6405
Dell                     6255
Lenovo                   5501
Asus                     4323
Acer                     2850
MSI                      1152
Toshiba                   894
Samsung                   864
Microsoft Corporation     524
Sony                      470
Other                     411
Fujitsu                   334
Alienware                 297
LG                        273
Notebook                  246
Medion                    189
Apple                     173
Razer                     168
NEC                       113
Gigabyte                  108
Panasonic Corporation     108
Timi                       86
Mouse Computer             76
Clevo                      66
Positivo                   65
Hasee                      64
Unknown                    60
MONSTER                    53
HUAWEI                     51
VAIO Corporation           50
Packard Bell               48
Intel                      45
Gateway                    43
System man

In [17]:
# Number of merged rows per distinct ram value.
merged['ram'].value_counts()

8.000000      14704
16.000000      7856
4.000000       5436
12.000000      1724
6.000000       1231
32.000000       913
24.000000       176
2.000000        120
20.000000       107
10.000000        70
64.000000        63
3.000000         62
8.015625         48
16.015625        21
4.015625         17
8.007812         14
5.000000         10
40.000000         9
48.000000         7
4.007812          5
0.000000          4
12.007812         3
22.000000         3
8.000977          3
8.003906          3
16.007812         2
128.000000        2
28.000000         2
4.000977          2
13.000000         1
80.000000         1
36.000000         1
8.005859          1
9.000000          1
48.001953         1
18.000000         1
12.015625         1
4.003906          1
34.547852         1
4.005859          1
56.000000         1
14.000000         1
4.005833          1
Name: ram, dtype: int64

In [18]:
# Number of merged rows per distinct os value.
merged['os'].value_counts()

Win10         31495
Win8.1          809
Win7            249
Win8             70
Win Server        8
Name: os, dtype: int64