## Data Wrangling


In [21]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler

In [22]:
# Load the cleaned phone-price table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_mobile_data_prices.csv')
df.head(n=5)

Unnamed: 0,Name,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,RAM(GB),Internal_storage(GB),Operating system,Wi-Fi,Bluetooth,GPS,Number of SIMs,3G,4G/ LTE,Price,Resolution_width(px),Resolution_height(px),Rear_Camera(MP),Front_Camera(MP)
0,OnePlus 7T Pro McLaren Edition,OnePlus,4085,7,Yes,8,12,256,Android,Yes,Yes,Yes,2,Yes,Yes,58998,1440,3120,48,16
1,Realme X2 Pro,Realme,4000,7,Yes,8,6,64,Android,Yes,Yes,Yes,2,Yes,Yes,27999,1080,2400,64,16
2,iPhone 11 Pro Max,Apple,3969,7,Yes,6,4,64,iOS,Yes,Yes,Yes,2,Yes,Yes,106900,1242,2688,12,12
3,iPhone 11,Apple,3110,6,Yes,6,4,64,iOS,Yes,Yes,Yes,2,Yes,Yes,62900,828,1792,12,12
4,LG G8X ThinQ,LG,4000,6,Yes,8,6,128,Android,Yes,Yes,Yes,1,No,No,49990,1080,2340,12,32


In [23]:
# Rank the numeric columns of the raw frame by skewness (most right-skewed first)
# to decide which ones need a transform.
numberical_features = df.select_dtypes(include=[np.number]).columns
# sep='\n' stops print() from inserting a space between the heading and the
# Series, which otherwise pushes the first row out of alignment with the rest.
print(
    'The skewness of the numerical features are:\n',
    df[numberical_features].skew().sort_values(ascending=False),
    sep='\n',
)


The skewness of the numerical features are:

 Price                    4.648919
Internal_storage(GB)     4.171917
Rear_Camera(MP)          3.743875
Front_Camera(MP)         2.017653
RAM(GB)                  1.465799
Resolution_width(px)     0.747553
Resolution_height(px)    0.679858
Battery_capacity(mAh)    0.494983
Processor                0.030914
Screen_size(inches)     -0.387508
Number of SIMs          -1.755278
dtype: float64


In [24]:
# Empty frame that the following cells fill with the transformed columns.
df_skewed = pd.DataFrame(data=None)

In [25]:
# log(1 + x) on the price column; show the skewness after the transform.
price_log = np.log1p(df['Price'])
df_skewed['Price'] = price_log
df_skewed['Price'].skew()

np.float64(0.8096701999140155)

In [26]:
# log(1 + x) on internal storage; show the skewness after the transform.
storage_log = np.log1p(df['Internal_storage(GB)'])
df_skewed['Internal_storage(GB)'] = storage_log
df_skewed['Internal_storage(GB)'].skew()

np.float64(0.2924727209068648)

In [27]:
# Box-Cox transform of the rear-camera resolution.
# scipy's boxcox only accepts positive data, so the column is shifted by +1
# (presumably to cope with 0 MP entries - TODO confirm against the data).
# The fitted lambda is kept in a named variable instead of being thrown away
# into `_`: it is needed to apply the same transform to new data or to invert
# it, and assigning to `_` shadows IPython's last-output variable.
df_skewed['Rear_Camera(MP)'], rear_camera_lambda = boxcox(df['Rear_Camera(MP)'] + 1)
df_skewed['Rear_Camera(MP)'].skew()

np.float64(0.016773145480194786)

In [28]:
# Box-Cox transform of the front-camera resolution.
# scipy's boxcox only accepts positive data, so the column is shifted by +1
# (presumably to cope with 0 MP entries - TODO confirm against the data).
# The fitted lambda is kept in a named variable instead of being thrown away
# into `_`: it is needed to apply the same transform to new data or to invert
# it, and assigning to `_` shadows IPython's last-output variable.
df_skewed['Front_Camera(MP)'], front_camera_lambda = boxcox(df['Front_Camera(MP)'] + 1)
df_skewed['Front_Camera(MP)'].skew()

np.float64(0.005868111475893991)

In [29]:
# Box-Cox transform of the RAM size.
# scipy's boxcox only accepts positive data, so the column is shifted by +1
# (presumably to cope with 0 GB entries - TODO confirm against the data).
# The fitted lambda is kept in a named variable instead of being thrown away
# into `_`: it is needed to apply the same transform to new data or to invert
# it, and assigning to `_` shadows IPython's last-output variable.
df_skewed['RAM(GB)'], ram_lambda = boxcox(df['RAM(GB)'] + 1)
df_skewed['RAM(GB)'].skew()

np.float64(0.003949076310545883)

In [30]:
# Raise the SIM count to the 4th power (the raw column is left-skewed);
# show the skewness after the transform.
sims_pow4 = df['Number of SIMs'].pow(4)
df_skewed['Number of SIMs'] = sims_pow4
df_skewed['Number of SIMs'].skew()

np.float64(-0.43571941476143966)

In [31]:
# Preview the transformed columns collected so far.
df_skewed.head(n=5)

Unnamed: 0,Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,10.985276,5.549076,4.197814,3.696808,3.157549,16
1,10.23996,4.174387,4.527737,3.696808,2.275481,16
2,11.579658,4.174387,2.695576,3.260498,1.830773,16
3,11.049317,4.174387,2.695576,3.260498,1.830773,16
4,10.819598,4.859812,2.695576,4.870624,2.275481,1


In [32]:

# Swap the original versions of the transformed columns for their transformed
# counterparts, keeping every untouched column from `df`.
transformed_cols = [
    'Price',
    'Internal_storage(GB)',
    'Rear_Camera(MP)',
    'Front_Camera(MP)',
    'RAM(GB)',
    'Number of SIMs',
]
df_concat = df.drop(columns=transformed_cols)
# Only the transformed columns are taken from df_skewed, so re-running this
# cell rebuilds the same frame instead of stacking duplicate columns onto the
# already-merged result.
df_skewed = pd.concat([df_concat, df_skewed[transformed_cols]], axis=1)

In [33]:
# Preview the merged frame: untouched columns followed by the transformed ones.
df_skewed.head(n=5)

Unnamed: 0,Name,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,OnePlus 7T Pro McLaren Edition,OnePlus,4085,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1440,3120,10.985276,5.549076,4.197814,3.696808,3.157549,16
1,Realme X2 Pro,Realme,4000,7,Yes,8,Android,Yes,Yes,Yes,Yes,Yes,1080,2400,10.23996,4.174387,4.527737,3.696808,2.275481,16
2,iPhone 11 Pro Max,Apple,3969,7,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,1242,2688,11.579658,4.174387,2.695576,3.260498,1.830773,16
3,iPhone 11,Apple,3110,6,Yes,6,iOS,Yes,Yes,Yes,Yes,Yes,828,1792,11.049317,4.174387,2.695576,3.260498,1.830773,16
4,LG G8X ThinQ,LG,4000,6,Yes,8,Android,Yes,Yes,Yes,No,No,1080,2340,10.819598,4.859812,2.695576,4.870624,2.275481,1


In [34]:
# Column count of the original frame, for comparison with the merged frame.
df.shape[1]

20

In [35]:
# Column count of the merged frame, for comparison with the original frame.
df_skewed.shape[1]

20

In [36]:
# Re-rank the numeric columns by skewness, this time on the transformed frame.
numberical_features = df_skewed.select_dtypes(include=[np.number]).columns
# sep='\n' stops print() from inserting a space between the heading and the
# Series, which otherwise pushes the first row out of alignment with the rest.
print(
    'The skewness of the numerical features are:\n',
    df_skewed[numberical_features].skew().sort_values(ascending=False),
    sep='\n',
)


The skewness of the numerical features are:

 Price                    0.809670
Resolution_width(px)     0.747553
Resolution_height(px)    0.679858
Battery_capacity(mAh)    0.494983
Internal_storage(GB)     0.292473
Processor                0.030914
Rear_Camera(MP)          0.016773
Front_Camera(MP)         0.005868
RAM(GB)                  0.003949
Screen_size(inches)     -0.387508
Number of SIMs          -0.435719
dtype: float64


In [37]:
# Write the transformed (unscaled) frame to disk without the index column.
df_skewed.to_csv(path_or_buf='../data/skewed_data.csv', index=False)

In [38]:
# Standardise every non-object column except 'Price', which is left as-is.
# The scaler is fitted ONCE on all feature columns. StandardScaler works
# column-wise, so the scaled values match per-column fitting, but the fitted
# `scaler` now remembers the mean/scale of every feature (not just the last
# one visited) and can be reused for transform()/inverse_transform().
# NOTE(review): the scaler is fitted on the full frame; if a train/test split
# happens downstream, consider fitting on the training rows only - confirm.
feature_columns = (
    df_skewed.select_dtypes(exclude='object').columns.difference(['Price'], sort=False)
)

scaler = StandardScaler()
df_skewed_scaled = df_skewed.copy()
df_skewed_scaled[feature_columns] = pd.DataFrame(
    scaler.fit_transform(df_skewed[feature_columns]),
    columns=feature_columns,
    index=df_skewed.index,
)

In [39]:
# Preview the standardised frame.
df_skewed_scaled.head(n=5)

Unnamed: 0,Name,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,OnePlus 7T Pro McLaren Edition,OnePlus,1.320503,2.271877,Yes,1.129343,Android,Yes,Yes,Yes,Yes,Yes,2.329867,2.942193,10.985276,2.879772,2.808761,1.40736,3.306135,0.416592
1,Realme X2 Pro,Realme,1.22314,2.271877,Yes,1.129343,Android,Yes,Yes,Yes,Yes,Yes,1.00045,1.647257,10.23996,1.308767,3.36465,1.40736,1.779027,0.416592
2,iPhone 11 Pro Max,Apple,1.187632,2.271877,Yes,0.217181,iOS,Yes,Yes,Yes,Yes,Yes,1.598688,2.165231,11.579658,1.308767,0.277632,1.000871,1.009114,0.416592
3,iPhone 11,Apple,0.203694,0.858265,Yes,0.217181,iOS,Yes,Yes,Yes,Yes,Yes,0.069859,0.553756,11.049317,1.308767,0.277632,1.000871,1.009114,0.416592
4,LG G8X ThinQ,LG,1.22314,0.858265,Yes,1.129343,Android,Yes,Yes,Yes,No,No,1.00045,1.539346,10.819598,2.092076,0.277632,2.500947,1.779027,-2.132267


In [40]:
# Write the transformed and standardised frame to disk without the index column.
df_skewed_scaled.to_csv(path_or_buf='../data/scaled_skewed_data.csv', index=False)