### Imports

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Read the tab-separated dataset file into the `phones` DataFrame.
phones = pd.read_csv('datasets/clean_full_mobiles_dataset.tsv', sep='\t')


In [15]:
# Print the column names, non-null counts and dtypes of the loaded frame.
phones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   model               495 non-null    object 
 1   price(₽)            493 non-null    float64
 2   old_price(₽)        316 non-null    float64
 3   rating              330 non-null    float64
 4   rating_count        330 non-null    float64
 5   company             495 non-null    object 
 6   color               486 non-null    object 
 7   release_date        457 non-null    object 
 8   country             431 non-null    object 
 9   weight(g)           486 non-null    float64
 10  diagonal(inches)    495 non-null    float64
 11  display             495 non-null    object 
 12  refresh_rate(Hz)    279 non-null    float64
 13  memory(Gb)          495 non-null    int64  
 14  ram(Gb)             495 non-null    int64  
 15  battery(mAh)        495 non-null    int64  
 16  processo

In [16]:
# Summary statistics for every column; include="all" also covers the non-numeric ones.
phones.describe(include="all")

Unnamed: 0,model,price(₽),old_price(₽),rating,rating_count,company,color,release_date,country,weight(g),diagonal(inches),display,refresh_rate(Hz),memory(Gb),ram(Gb),battery(mAh),processor,front_camera(Mpix),main_camera
count,495,493.0,316.0,330.0,330.0,495,486,457,431,486.0,495.0,495,279.0,495.0,495.0,495.0,495,495.0,495
unique,493,,,,,10,16,33,2,,,21,,,,,83,,70
top,Смартфон Samsung Galaxy A25 6/128GB Голубой EAC,,,,,TECNO,Черный,2024-01,Китай,,,IPS,,,,,8-ядерный Qualcomm Snapdragon 680 2.40 ГГц,,Два модуля (50 + 2 Mpix)
freq,2,,,,,99,148,63,408,,,155,,,,,35,,39
mean,,29052.918864,28017.544304,4.789394,7.536364,,,,,193.236626,6.67503,,103.698925,209.850505,7.373737,5032.525253,,20.446465,
std,,33955.483258,22716.370224,0.501587,11.53323,,,,,13.354777,0.16879,,18.831963,126.390476,2.799602,443.826624,,73.022906,
min,,5299.0,7999.0,3.0,1.0,,,,,166.0,6.1,,60.0,1.0,2.0,3700.0,,5.0,
25%,,11999.0,14999.0,4.9,2.0,,,,,186.0,6.6,,90.0,128.0,4.0,5000.0,,8.0,
50%,,16999.0,21499.0,5.0,3.0,,,,,192.0,6.7,,120.0,128.0,8.0,5000.0,,13.0,
75%,,27999.0,31249.0,5.0,9.0,,,,,199.0,6.74,,120.0,256.0,8.0,5000.0,,16.0,


In [17]:
# List the column names as a plain Python list.
phones.columns.tolist()

['model',
 'price(₽)',
 'old_price(₽)',
 'rating',
 'rating_count',
 'company',
 'color',
 'release_date',
 'country',
 'weight(g)',
 'diagonal(inches)',
 'display',
 'refresh_rate(Hz)',
 'memory(Gb)',
 'ram(Gb)',
 'battery(mAh)',
 'processor',
 'front_camera(Mpix)',
 'main_camera']

In [18]:
# Count the missing values in each column.
phones.isnull().sum()

model                   0
price(₽)                2
old_price(₽)          179
rating                165
rating_count          165
company                 0
color                   9
release_date           38
country                64
weight(g)               9
diagonal(inches)        0
display                 0
refresh_rate(Hz)      216
memory(Gb)              0
ram(Gb)                 0
battery(mAh)            0
processor               0
front_camera(Mpix)      0
main_camera             0
dtype: int64

# Filling missing values

### Price

In [19]:
# Print the rows that have no current price.
missing_price_mask = phones['price(₽)'].isna()
print(phones.loc[missing_price_mask])

                                                 model  price(₽)  \
126  Смартфон Samsung Galaxy S23 Ultra 512GB Светло...       NaN   
370  Смартфон Samsung Galaxy S23 Ultra 1TB Зеленый EAC       NaN   

     old_price(₽)  rating  rating_count  company    color release_date  \
126           NaN     NaN           NaN  Samsung  Розовый      2023-02   
370           NaN     NaN           NaN  Samsung  Зеленый      2023-02   

     country  weight(g)  diagonal(inches)            display  \
126  Вьетнам      234.0               6.8  Dynamic AMOLED 2X   
370  Вьетнам      234.0               6.8  Dynamic AMOLED 2X   

     refresh_rate(Hz)  memory(Gb)  ram(Gb)  battery(mAh)  \
126               NaN         512       12          5000   
370               NaN           1       12          5000   

                                          processor  front_camera(Mpix)  \
126  8-ядерный Qualcomm Snapdragon 8 Gen 2 2.80 ГГц                  12   
370  8-ядерный Qualcomm Snapdragon 8 Gen 2 2.80 

Цена отсутствует у двух моделей Samsung Galaxy S23 Ultra (512 ГБ и 1 ТБ), так как их нет в наличии, поэтому я поставил цену аналогичных моделей других цветов.

In [20]:
# Manually chosen prices for the two rows (index labels 126 and 370) that had none.
manual_prices = {126: 109999, 370: 137999}
for row_label, manual_price in manual_prices.items():
    phones.at[row_label, 'price(₽)'] = manual_price

### Other columns

In [21]:
# Fill the remaining gaps column by column. Every result is assigned back to its
# column: calling fillna/replace with inplace=True on phones[...] is chained
# assignment, which raises a FutureWarning in pandas 2.x and leaves `phones`
# unchanged once Copy-on-Write is enabled (the default from pandas 3.0).
phones['rating'] = phones['rating'].fillna(0)
phones['rating_count'] = phones['rating_count'].fillna(0)
# No old price recorded -> use the current price, so the computed sale is 0.
phones['old_price(₽)'] = phones['old_price(₽)'].fillna(phones['price(₽)'])
# A memory value of 1 is rewritten as 1024 (presumably a 1 TB model stored as 1 — confirm).
phones['memory(Gb)'] = phones['memory(Gb)'].replace(1, 1024)
# NOTE(review): 60 is used as the refresh rate wherever none is listed — confirm this default.
phones['refresh_rate(Hz)'] = phones['refresh_rate(Hz)'].fillna(60)
phones['weight(g)'] = phones['weight(g)'].fillna(phones['weight(g)'].mean())
phones['color'] = phones['color'].fillna(phones['color'].mode()[0])


def _fill_with_mode(values):
    """Fill missing entries with the most frequent value of `values`.

    Returns `values` unchanged when it has no mode (every entry is missing),
    where indexing the empty mode result would otherwise raise.
    """
    modes = values.mode()
    return values if modes.empty else values.fillna(modes.iloc[0])


# Missing release dates take the most common release date of the same company.
phones['release_date'] = phones.groupby('company')['release_date'].transform(_fill_with_mode)

# Replace the old price with the discount amount (old price minus current price).
phones['sale(₽)'] = phones['old_price(₽)'] - phones['price(₽)']
phones = phones.drop(columns='old_price(₽)')

# Move 'sale(₽)' to position 2, right after 'price(₽)'.
cols = list(phones.columns)
cols.insert(2, cols.pop(cols.index('sale(₽)')))
phones = phones[cols]

### Country

In [22]:
# Print the company of every row whose country is missing.
missing_country_mask = phones['country'].isna()
print(phones.loc[missing_country_mask, 'company'].tolist())

['Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung', 'Samsung']


Страна не указана только у телефонов Samsung. Известно, что они производятся в Южной Корее.

In [23]:
# Assign the result back instead of using inplace=True on a column selection:
# that chained form warns in pandas 2.x and does not modify `phones` under
# Copy-on-Write (the default from pandas 3.0).
# NOTE(review): existing country values are in Russian (e.g. 'Китай') while this
# label is English — confirm the mix is intended.
phones['country'] = phones['country'].fillna('South Korea')

In [24]:
# Print any rows that still lack a refresh rate.
missing_refresh_mask = phones['refresh_rate(Hz)'].isna()
print(phones.loc[missing_refresh_mask])

Empty DataFrame
Columns: [model, price(₽), sale(₽), rating, rating_count, company, color, release_date, country, weight(g), diagonal(inches), display, refresh_rate(Hz), memory(Gb), ram(Gb), battery(mAh), processor, front_camera(Mpix), main_camera]
Index: []


### One-hot encoding, normalising, and writing to CSV file

In [25]:
# Columns that are rescaled with MinMaxScaler (default settings).
NUMERIC_COLUMNS = [
    'price(₽)',
    'sale(₽)',
    'rating',
    'rating_count',
    'weight(g)',
    'diagonal(inches)',
    'refresh_rate(Hz)',
    'memory(Gb)',
    'ram(Gb)',
    'battery(mAh)',
    'front_camera(Mpix)',
]
# Columns expanded into dummy indicators; the first level of each is dropped.
categorical_columns = ['color', 'country', 'display', 'processor']

min_max_scaler = MinMaxScaler()
phones_encoded = pd.get_dummies(phones, columns=categorical_columns, drop_first=True)

# Fit the scaler on the numeric columns and overwrite them with the scaled values.
scaled_values = min_max_scaler.fit_transform(phones_encoded[NUMERIC_COLUMNS])
phones_encoded[NUMERIC_COLUMNS] = scaled_values

# Write the encoded frame, including its index, to disk.
phones_encoded.to_csv('datasets/full_mobiles_dataset.csv', index=True)