In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [62]:
# Load the raw smartphone dataset from its relative location in the data folder.
raw_csv_path = '../data/Smartphone_chosse.csv'
phone_df = pd.read_csv(raw_csv_path)

In [63]:
# Summarise the loaded frame: number of rows, column names, non-null counts and dtypes.
phone_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [64]:
# Peek at five randomly chosen rows. The seed is fixed so that a fresh
# "Restart & Run All" displays the same rows every time.
phone_df.sample(5, random_state=42)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
1983,1262,0,1.8,1,12,0,34,0.1,149,5,...,223,737,3248,13,3,4,0,1,1,2
1670,1569,1,1.6,1,4,1,54,0.9,190,4,...,371,665,3905,6,2,4,1,0,0,3
822,839,0,2.0,1,0,0,14,0.4,175,7,...,564,1391,3835,15,8,11,1,0,0,3
1975,1157,1,0.8,0,7,0,27,0.1,88,8,...,1694,1798,2885,8,4,2,1,0,1,3
1468,1425,1,0.6,0,0,1,33,0.5,89,3,...,396,1648,3771,10,3,9,1,0,1,3


In [65]:
# Count the missing values in every column.
# Outcome: all counts are zero, so the dataset has no gaps to fill.
phone_df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [66]:
# Split the columns into a categorical group and a numeric group.

# Columns treated as categorical: the 0/1 flags plus n_cores
# (an integer count that is grouped with the flags here).
CAT_FEATURES = ['n_cores', 'blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

# Every column of the dataset, in the order it appears in the frame.
# NOTE: price_range is listed as well (presumably the target - confirm),
# so it also ends up in CONT_FEATURES below.
FEATURES = [
    'battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
    'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
    'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
    'touch_screen', 'wifi', 'price_range',
]

# Anything that is not categorical is handled as numeric; order follows FEATURES.
CONT_FEATURES = [name for name in FEATURES if name not in CAT_FEATURES]

In [67]:
# Preview the columns that are treated as categorical, before they are encoded.
phone_df.loc[:, CAT_FEATURES].head()

Unnamed: 0,n_cores,blue,dual_sim,four_g,three_g,touch_screen,wifi
0,2,0,0,0,0,0,1
1,3,1,1,1,1,1,0
2,5,1,1,1,1,1,0
3,6,1,0,0,1,0,0
4,2,1,0,1,1,1,0


In [68]:
# Encode the categorical columns as consecutive integer codes.
# The fitted encoder of every column is stored so the original values can be
# looked up later through its `classes_` attribute.
# NOTE: phone_df is overwritten in place, so re-running this cell fits the
# encoders on the already-encoded values instead of the raw ones.
cat_encoding_dict = {}
for cat_col in CAT_FEATURES:
    encoder = LabelEncoder()
    encoder.fit(phone_df[cat_col])
    phone_df[cat_col] = encoder.transform(phone_df[cat_col])
    cat_encoding_dict[cat_col] = encoder

In [69]:
# Original n_cores values learned by the encoder; the position of a value
# in this array is the integer code it was replaced with.
n_cores_encoder = cat_encoding_dict['n_cores']
n_cores_encoder.classes_

array([1, 2, 3, 4, 5, 6, 7, 8], dtype=int64)

In [70]:
# Same preview after encoding. Every column here is a binary flag, except n_cores.
phone_df.loc[:, CAT_FEATURES].head()

Unnamed: 0,n_cores,blue,dual_sim,four_g,three_g,touch_screen,wifi
0,1,0,0,0,0,0,1
1,2,1,1,1,1,1,0
2,4,1,1,1,1,1,0
3,5,1,0,0,1,0,0
4,1,1,0,1,1,1,0


In [71]:
# Encoding of ordinal features could look like the following.
# First, the non-categorical columns as they are before any mapping is applied.
phone_df.loc[:, CONT_FEATURES].head()

Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep,mobile_wt,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,price_range
0,842,2.2,1,7,0.6,188,2,20,756,2549,9,7,19,1
1,1021,0.5,0,53,0.7,136,6,905,1988,2631,17,3,7,2
2,563,0.5,2,41,0.9,145,6,1263,1716,2603,11,2,9,2
3,615,2.5,0,10,0.8,131,9,1216,1786,2769,16,8,11,2
4,1821,1.2,13,44,0.6,141,14,1208,1212,1411,8,2,15,1


In [72]:
# Ordinal mapping applied to price_range: each level 0..3 maps to itself,
# i.e. the result is {0: 0, 1: 1, 2: 2, 3: 3}.
mapping = {level: level for level in range(4)}

In [73]:
# Replace every price_range value with its code from `mapping`.
# Series.map yields NaN for any value that has no key in the dict.
price_codes = phone_df['price_range'].map(mapping)
phone_df['price_range'] = price_codes

# Show the non-categorical columns again; price_range is the last one.
phone_df.loc[:, CONT_FEATURES].head()


Unnamed: 0,battery_power,clock_speed,fc,int_memory,m_dep,mobile_wt,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,price_range
0,842,2.2,1,7,0.6,188,2,20,756,2549,9,7,19,1
1,1021,0.5,0,53,0.7,136,6,905,1988,2631,17,3,7,2
2,563,0.5,2,41,0.9,145,6,1263,1716,2603,11,2,9,2
3,615,2.5,0,10,0.8,131,9,1216,1786,2769,16,8,11,2
4,1821,1.2,13,44,0.6,141,14,1208,1212,1411,8,2,15,1


In [74]:
# Inspect the mapped price_range column on its own.
phone_df.loc[:, 'price_range']

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

In [75]:
# Save the preprocessed frame to the data folder.
# index=False keeps the RangeIndex out of the CSV; with the default it is
# written as an extra unnamed first column, which reappears as 'Unnamed: 0'
# when the file is read back with pd.read_csv.
phone_df.to_csv('../data/Smartphone_chosse_preprocessed.csv', index=False)
