## Data Wrangling


In [64]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox

In [65]:
# Load the cleaned mobile-phone dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer='../data/cleaned_mobile_data.csv')
df.head(n=5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,RAM(GB),Internal_storage(GB),Operating system,Wi-Fi,Bluetooth,GPS,Number of SIMs,3G,4G/ LTE,Price,Resolution_width(px),Resolution_height(px),Rear_Camera(MP),Front_Camera(MP)
0,oneplus,4085,7,Yes,8,12,256,android,Yes,Yes,Yes,2,Yes,Yes,58998,1440,3120,48,16
1,realme,4000,7,Yes,8,6,64,android,Yes,Yes,Yes,2,Yes,Yes,27999,1080,2400,64,16
2,apple,3969,7,Yes,6,4,64,ios,Yes,Yes,Yes,2,Yes,Yes,106900,1242,2688,12,12
3,apple,3110,6,Yes,6,4,64,ios,Yes,Yes,Yes,2,Yes,Yes,62900,828,1792,12,12
4,lg,4000,6,Yes,8,6,128,android,Yes,Yes,Yes,1,No,No,49990,1080,2340,12,32


In [66]:
# Rank every numeric column of the raw frame by skewness, most right-skewed first.
numberical_features = df.select_dtypes(include=[np.number]).columns
skew_by_feature = df[numberical_features].skew().sort_values(ascending=False)
print('The skewness of the numerical features are:\n\n', skew_by_feature)


The skewness of the numerical features are:

 Price                    4.607462
Internal_storage(GB)     4.066089
Rear_Camera(MP)          3.656900
Front_Camera(MP)         1.985368
RAM(GB)                  1.434858
Resolution_width(px)     0.723081
Resolution_height(px)    0.654431
Battery_capacity(mAh)    0.486553
Processor                0.008903
Screen_size(inches)     -0.394778
Number of SIMs          -1.752184
dtype: float64


In [67]:
# Empty frame that the following cells fill, one column per skew-corrected feature.
df_skewed = pd.DataFrame()

In [68]:
# log(1 + x) compresses the long right tail of Price; show the resulting skewness.
df_skewed['Price'] = df['Price'].pipe(np.log1p)
df_skewed['Price'].skew()

np.float64(0.7888100958452384)

In [69]:
# Same log(1 + x) treatment for internal storage; show the resulting skewness.
df_skewed['Internal_storage(GB)'] = df['Internal_storage(GB)'].pipe(np.log1p)
df_skewed['Internal_storage(GB)'].skew()

np.float64(0.2873599715962765)

In [70]:
# Box-Cox transform of the rear-camera resolution. Box-Cox only accepts strictly
# positive data; the +1 shift presumably guards against zero values (confirm
# against the data).
# The fitted lambda is kept under a descriptive name instead of being thrown
# away into `_`: it is needed to invert the transform or apply it to new data,
# and `_` is also IPython's last-output variable, which the assignment shadowed.
df_skewed['Rear_Camera(MP)'], rear_camera_lambda = boxcox(df['Rear_Camera(MP)'] + 1)
df_skewed['Rear_Camera(MP)'].skew()

np.float64(0.013748024658755456)

In [71]:
# Box-Cox transform of the front-camera resolution. Box-Cox only accepts strictly
# positive data; the +1 shift presumably guards against zero values (confirm
# against the data).
# The fitted lambda is kept under a descriptive name instead of being thrown
# away into `_`: it is needed to invert the transform or apply it to new data,
# and `_` is also IPython's last-output variable, which the assignment shadowed.
df_skewed['Front_Camera(MP)'], front_camera_lambda = boxcox(df['Front_Camera(MP)'] + 1)
df_skewed['Front_Camera(MP)'].skew()

np.float64(0.005052282405939675)

In [72]:
# Box-Cox transform of RAM. Box-Cox only accepts strictly positive data; the +1
# shift presumably guards against zero values (confirm against the data).
# The fitted lambda is kept under a descriptive name instead of being thrown
# away into `_`: it is needed to invert the transform or apply it to new data,
# and `_` is also IPython's last-output variable, which the assignment shadowed.
df_skewed['RAM(GB)'], ram_lambda = boxcox(df['RAM(GB)'] + 1)
df_skewed['RAM(GB)'].skew()

np.float64(0.0034520788606554782)

In [73]:
# The SIM count is left-skewed, so raise it to the 4th power to stretch the
# upper end; show the resulting skewness.
df_skewed['Number of SIMs'] = df['Number of SIMs'].pow(4)
df_skewed['Number of SIMs'].skew()

np.float64(-0.448242844309048)

In [74]:
# Preview the transformed columns collected so far.
df_skewed.head(n=5)

Unnamed: 0,Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,10.985276,5.549076,4.142627,3.689184,3.180631,16
1,10.23996,4.174387,4.463819,3.689184,2.287895,16
2,11.579658,4.174387,2.672344,3.254454,1.838961,16
3,11.049317,4.174387,2.672344,3.254454,1.838961,16
4,10.819598,4.859812,2.672344,4.858008,2.287895,1


In [75]:

# Columns whose skew-corrected versions live in df_skewed.
transformed_columns = [
    'Price',
    'Internal_storage(GB)',
    'Rear_Camera(MP)',
    'Front_Camera(MP)',
    'RAM(GB)',
    'Number of SIMs',
]

# Untransformed remainder of the original frame.
df_concat = df.drop(columns=transformed_columns)

# Join the untouched columns with ONLY the transformed columns of df_skewed.
# Selecting them explicitly makes this cell idempotent: df_skewed is rebuilt
# from itself here, so without the selection a second run of the cell would
# append the already-merged columns again and produce duplicates.
df_skewed = pd.concat([df_concat, df_skewed[transformed_columns]], axis=1)

In [76]:
# Preview the frame after the transformed columns were merged back in.
df_skewed.head(n=5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,4085,7,Yes,8,android,Yes,Yes,Yes,Yes,Yes,1440,3120,10.985276,5.549076,4.142627,3.689184,3.180631,16
1,realme,4000,7,Yes,8,android,Yes,Yes,Yes,Yes,Yes,1080,2400,10.23996,4.174387,4.463819,3.689184,2.287895,16
2,apple,3969,7,Yes,6,ios,Yes,Yes,Yes,Yes,Yes,1242,2688,11.579658,4.174387,2.672344,3.254454,1.838961,16
3,apple,3110,6,Yes,6,ios,Yes,Yes,Yes,Yes,Yes,828,1792,11.049317,4.174387,2.672344,3.254454,1.838961,16
4,lg,4000,6,Yes,8,android,Yes,Yes,Yes,No,No,1080,2340,10.819598,4.859812,2.672344,4.858008,2.287895,1


In [77]:
# Number of columns in the original frame (sanity check against the merged frame).
df.shape[1]

19

In [78]:
# Number of columns in the merged frame; should match the original frame's count.
df_skewed.shape[1]

19

In [79]:
# Re-rank the numeric columns by skewness now that the transforms are applied.
numberical_features = df_skewed.select_dtypes(include=[np.number]).columns
skew_by_feature = df_skewed[numberical_features].skew().sort_values(ascending=False)
print('The skewness of the numerical features are:\n\n', skew_by_feature)


The skewness of the numerical features are:

 Price                    0.788810
Resolution_width(px)     0.723081
Resolution_height(px)    0.654431
Battery_capacity(mAh)    0.486553
Internal_storage(GB)     0.287360
Rear_Camera(MP)          0.013748
Processor                0.008903
Front_Camera(MP)         0.005052
RAM(GB)                  0.003452
Screen_size(inches)     -0.394778
Number of SIMs          -0.448243
dtype: float64


In [80]:
# Persist the skew-corrected frame; the row index is not written to the file.
df_skewed.to_csv(path_or_buf='../data/skewed_data.csv', index=False)

In [81]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features (zero mean, unit variance) on a copy of the
# skew-corrected frame, leaving 'Price' unscaled.
scaler = StandardScaler()
df_skewed_scaled = df_skewed.copy()

# Pick numeric columns explicitly, with the same selector the skewness cells
# use, instead of "everything that is not object": text columns stored under a
# non-object dtype would otherwise be handed to the scaler.
columns_to_scale = [
    column
    for column in df_skewed_scaled.select_dtypes(include=[np.number]).columns
    if column != 'Price'
]

# StandardScaler standardises each column independently, so a single fit over
# all columns matches fitting them one by one, and it leaves `scaler` fitted on
# the whole feature set rather than only on the last column of a loop.
if columns_to_scale:
    scaled_values = scaler.fit_transform(df_skewed_scaled[columns_to_scale])
    df_skewed_scaled[columns_to_scale] = scaled_values

In [82]:
# Preview the standardised frame.
df_skewed_scaled.head(n=5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Touchscreen,Processor,Operating system,Wi-Fi,Bluetooth,GPS,3G,4G/ LTE,Resolution_width(px),Resolution_height(px),Price,Internal_storage(GB),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,1.31301,2.263844,Yes,1.11527,android,Yes,Yes,Yes,Yes,Yes,2.322392,2.921979,10.985276,2.854021,2.768793,1.389087,3.301137,0.417509
1,realme,1.215666,2.263844,Yes,1.11527,android,Yes,Yes,Yes,Yes,Yes,0.992052,1.630673,10.23996,1.291021,3.315529,1.389087,1.768011,0.417509
2,apple,1.180164,2.263844,Yes,0.204422,ios,Yes,Yes,Yes,Yes,Yes,1.590705,2.147195,11.579658,1.291021,0.266066,0.984988,0.99704,0.417509
3,apple,0.196418,0.849982,Yes,0.204422,ios,Yes,Yes,Yes,Yes,Yes,0.060815,0.540236,11.049317,1.291021,0.266066,0.984988,0.99704,0.417509
4,lg,1.215666,0.849982,Yes,1.11527,android,Yes,Yes,Yes,No,No,0.992052,1.523064,10.819598,2.070339,0.266066,2.475555,1.768011,-2.130673


In [83]:
# Persist the standardised frame; the row index is not written to the file.
df_skewed_scaled.to_csv(path_or_buf='../data/scaled_skewed_data.csv', index=False)