In [301]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [302]:
# Load the raw retailer-A smartphone listings; the path is relative to the working directory.
df = pd.read_csv('./Data/A_smart_phones.csv')

In [303]:
# Preview the first rows to see which columns the CSV provides.
df.head()

Unnamed: 0.1,Unnamed: 0,Product_Name,Price,Rating,Memory,Color
0,0,Xiaomi Redmi Note 9 Pro,252.0,4.9,128 GB,Interstellar Grey
1,1,Xiaomi Redmi Note 9 Pro,249.0,4.8,128 GB,Green
2,2,Huawei P40 Lite,199.0,4.6,128GB,Midnight Black
3,3,Xiaomi Redmi Note 9 Pro,239.0,5.0,128 GB,White
4,4,Samsung Galaxy M31s,317.0,5.0,128 GB,Mirage black


Clean Data:

In [304]:
# Keep only the analysis columns (this discards the two leftover 'Unnamed' index columns).
# .copy() makes the result an independent frame, so the column assignments in the
# following cells modify it directly instead of triggering SettingWithCopyWarning.
df = df[['Product_Name', 'Price', 'Rating', 'Memory', 'Color']].copy()

Convert the 'Price' values to float (rows without a price keep the '-' placeholder):

In [305]:
# Check which Python type a sample 'Price' entry (row label 20) currently has.
type(df.loc[20, 'Price'])

str

In [306]:
def to_float_num(s):
    """Convert a raw price entry to ``float``, passing the '-' placeholder through.

    Parameters
    ----------
    s : the raw entry; anything other than the '-' placeholder is handed to float().

    Returns
    -------
    float for a convertible entry, or the unchanged '-' placeholder.

    Raises
    ------
    Whatever float() raises for an entry it cannot convert.
    """
    if s == '-':
        return s
    return float(s)

In [307]:
# Run every 'Price' entry through to_float_num; the '-' placeholder is left unchanged.
df['Price'] = df['Price'].apply(to_float_num)

In [325]:
# Re-check the Python type of the sample 'Price' entry (row label 20).
type(df.loc[20, 'Price'])

float

Normalize the 'Memory' column to a whole number of gigabytes:

In [308]:
def normalize_memory(x):
    """Parse a storage label such as '128 GB' or '128GB' into a whole number of gigabytes.

    The label is lower-cased, every 'gb' is removed and the remainder is
    converted with int(), which ignores surrounding whitespace.

    Parameters
    ----------
    x : the raw label, normally a string.

    Returns
    -------
    int when the remainder is a plain integer, otherwise the '-' placeholder
    (also for non-string input such as NaN or None).
    """
    try:
        return int(x.lower().replace('gb', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: x is not a string (no .lower()).
        # ValueError: the text left after removing 'gb' is not an integer.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate
        # instead of being silently turned into '-'.
        return '-'
        

In [309]:
# Derive 'Memory_GB' from the free-text 'Memory' labels (unparseable labels become '-').
df['Memory_GB'] = df['Memory'].apply(normalize_memory)

In [310]:
# Distribution of the parsed sizes; the '-' bucket counts labels that could not be parsed.
df['Memory_GB'].value_counts()

-      256
128    200
64     151
32      88
256     69
16      42
512     17
8       12
4        2
24       1
12       1
3        1
Name: Memory_GB, dtype: int64

In [311]:
# The raw 'Memory' text has been replaced by 'Memory_GB', so remove it.
df = df.drop(columns=['Memory'])

Normalize the 'Color' column (lower-case the names, inspect rare values, translate Lithuanian colour names):

In [312]:
# Lower-case the colour names so that differently capitalised spellings compare equal.
df['Color'] = df['Color'].map(str.lower)

In [313]:
# Colours that occur fewer than `threshhold` times are treated as rare.
threshhold = 10
# Compare against the variable; the cutoff used to be a second, hard-coded 10
# that silently ignored `threshhold`.
is_rare = df['Color'].value_counts() < threshhold
# Index of the colour names whose count is below the cutoff.
rare = is_rare[is_rare].index

In [314]:
# Boolean mask over df: True where the row's colour is one of the rare ones.
are_rare_rows = df["Color"].isin(rare)
# Show the index labels of the flagged rows.
are_rare_rows.index[are_rare_rows.values]

Int64Index([  0,   4,   5,   8,  10,  11,  13,  15,  16,  17,
            ...
            823, 824, 825, 827, 828, 831, 832, 835, 837, 838],
           dtype='int64', length=263)

In [315]:
# Select the flagged rows with the boolean mask itself. The previous
# df.iloc[<index labels>] treated labels as positions, which is only correct
# while df still has its default 0..n-1 index.
rare_colors = df[are_rare_rows]

In [316]:
# Inspect the rows whose colour is rare.
rare_colors

Unnamed: 0,Product_Name,Price,Rating,Color,Memory_GB
0,Xiaomi Redmi Note 9 Pro,252,4.9,interstellar grey,128
4,Samsung Galaxy M31s,317,5,mirage black,128
5,Xiaomi Redmi 9,149.5,5,carbon grey,64
8,Xiaomi Redmi 9C,118.51,-,twilight blue,32
10,Xiaomi Mi 10T Lite 5G,319,5,pearl gray,128
...,...,...,...,...,...
831,Xiaomi Redmi Note 7,-,4.7,space black,-
832,Huawei Y5 (2019),-,4.7,modern black,16
835,Huawei Y5P,-,5,phantom blue,32
837,Išmanusis telefonas iPhone 6S,-,-,rožinio aukso spalvos,64


In [317]:
# Lithuanian colour words (both listed spellings of each) -> English colour name.
LT_COLOR_TO_EN = {
    'mėlyna': 'blue', 'mėlynas': 'blue',
    'geltonas': 'yellow', 'geltona': 'yellow',
    'juodas': 'black', 'juoda': 'black',
    'oranžinis': 'orange', 'oranžinė': 'orange',
    'pilkas': 'grey', 'pilka': 'grey',
    'auksinis': 'gold', 'auksinė': 'gold',
    'sidabrinis': 'silver', 'sidabrinė': 'silver',
    'baltas': 'white', 'balta': 'white',
    'rožinio aukso spalvos': 'pink gold',
    'žalias': 'green', 'žalia': 'green',
}


def normalize_color(x):
    """Map a lower-cased colour label to a normalized colour name.

    Rules:
      * a Lithuanian colour word listed in LT_COLOR_TO_EN is translated to English;
      * labels shorter than three characters and the label 'dual sim' are not
        colours and become the '-' placeholder;
      * a combined label containing '+' is reduced to its first part, with
        surrounding whitespace removed;
      * every other label is returned unchanged.

    The earlier implementation compared with ``x == ('a' or 'b')``, which
    evaluates to ``x == 'a'``, so the second spelling of each colour was
    never translated. The lookup table matches both spellings.

    Parameters
    ----------
    x : str, the colour label (expected to be lower-cased already).

    Returns
    -------
    str, the normalized colour name or '-'.
    """
    if x in LT_COLOR_TO_EN:
        return LT_COLOR_TO_EN[x]
    if len(x) < 3 or x == 'dual sim':
        return '-'
    if '+' in x:
        # strip() keeps 'black + blue' from yielding 'black ' with a trailing space.
        return x.split('+')[0].strip()
    return x

In [318]:
# Replace every colour label with its normalized form.
df['Color'] = df['Color'].apply(normalize_color)

In [319]:
# Frequency of each normalized colour; '-' counts the rows without a usable colour.
df['Color'].value_counts()

-                 430
black              64
blue               42
midnight black     19
white              14
                 ... 
coral red           1
dark blue           1
space blue          1
sky blue            1
frozen blue         1
Name: Color, Length: 144, dtype: int64

In [320]:
# Spot-check a slice of rows (positions 100 to 149) after the clean-up steps.
df.iloc[100:150]

Unnamed: 0,Product_Name,Price,Rating,Color,Memory_GB
100,Xiaomi Redmi 9,132.0,5,ocean green,32
101,Huawei P30 Lite,198.51,4.8,-,128
102,Xiaomi Redmi 9,132.4,5,sunset purple,32
103,OnePlus Nord,502.91,-,-,256
104,Samsung Galaxy S10,549.49,-,-,128
105,Blackview BV9900 PRO,522.0,-,black,128
106,Samsung Galaxy A71 128GB,411.0,5,-,-
107,Blackview A80,74.0,5,-,16
108,Apple iPhone XR,658.0,-,-,128
109,Xiaomi Redmi 9,135.0,5,carbon grey,32


In [321]:
# Frequency of each rating value, including the '-' placeholder.
df['Rating'].value_counts()

-      510
5      193
4.8     30
4.9     23
4       15
4.5     14
4.7     13
4.3     10
4.6      8
3        7
3.5      4
3.7      4
4.1      3
4.4      3
1        2
3.8      1
Name: Rating, dtype: int64

Normalize Product_Name:

In [322]:
def _strip_listing_words(name):
    """Remove every 'Išmanusis telefonas ' and any remaining 'Išmanusis' from a product name."""
    without_phrase = name.replace('Išmanusis telefonas ', '')
    return without_phrase.replace('Išmanusis', '')


# Clean the listing words out of every product name.
df['Product_Name'] = df['Product_Name'].apply(_strip_listing_words)

In [323]:
# Remove listings whose first word is 'laikrodis' (presumably watches rather
# than phones — TODO confirm). A boolean-mask filter replaces the previous
# inplace drop-by-index-labels, which mutated df in place and was only exact
# while the index labels are unique.
first_word = df['Product_Name'].apply(lambda name: name.strip().split(' ')[0])
df = df[first_word != 'laikrodis']

Save to .csv file:

In [324]:
# Write the cleaned table to CSV; index=False leaves the row index out of the file.
# NOTE(review): assumes the './Cleaned Data' directory already exists — confirm.
df.to_csv('./Cleaned Data/Retailer_A_phones.csv', index=False)