## Data Wrangling


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned handset dataset and preview its first rows.
cleaned_data_path = '../data/cleaned_mobile_data.csv'
df = pd.read_csv(cleaned_data_path)
df.head(5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Processor,RAM(GB),Internal_storage(GB),Operating system,Number of SIMs,Price,Resolution_width(px),Resolution_height(px),Rear_Camera(MP),Front_Camera(MP)
0,oneplus,4085,7,8,12,256,android,2,58998,1440.0,3120.0,48,16
1,realme,4000,7,8,6,64,android,2,27999,1080.0,2400.0,64,16
2,apple,3969,7,6,4,64,ios,2,106900,1242.0,2688.0,12,12
3,apple,3110,6,6,4,64,ios,2,62900,828.0,1792.0,12,12
4,lg,4000,6,8,6,128,android,1,49990,1080.0,2340.0,12,32


In [3]:
# Show the dtype of every column to see which ones are numeric.
df.dtypes

Brand                     object
Battery_capacity(mAh)      int64
Screen_size(inches)        int64
Processor                  int64
RAM(GB)                    int64
Internal_storage(GB)       int64
Operating system          object
Number of SIMs             int64
Price                      int64
Resolution_width(px)     float64
Resolution_height(px)    float64
Rear_Camera(MP)            int64
Front_Camera(MP)           int64
dtype: object

In [4]:
# Rank the numeric columns of the raw frame by skewness, most right-skewed first.
numberical_features = df.select_dtypes(include=[np.number]).columns
raw_skewness = df[numberical_features].skew()
raw_skewness = raw_skewness.sort_values(ascending=False)
print('The skewness of the numerical features are:\n\n', raw_skewness)


The skewness of the numerical features are:

 Price                    4.230431
Internal_storage(GB)     2.986253
Rear_Camera(MP)          2.296220
Front_Camera(MP)         1.378540
Resolution_width(px)     1.222954
RAM(GB)                  1.168316
Resolution_height(px)    0.812623
Battery_capacity(mAh)    0.154700
Processor               -0.509088
Screen_size(inches)     -0.695029
Number of SIMs          -2.233826
dtype: float64


In [5]:
# Empty container; the following cells add one transformed column at a time.
df_skewed = pd.DataFrame(data=None)

In [6]:
# Price has the largest positive skew in the report above; log1p (log(1 + x))
# compresses its right tail. The cell displays the skewness after the transform.
df_skewed['Price'] = df['Price'].pipe(np.log1p)
df_skewed['Price'].skew()

np.float64(0.3819802504679348)

In [7]:
# Same log1p treatment for internal storage, which is also right-skewed in the
# report above; the cell displays the skewness after the transform.
df_skewed['Internal_storage(GB)'] = df['Internal_storage(GB)'].pipe(np.log1p)
df_skewed['Internal_storage(GB)'].skew()

np.float64(0.096627562384248)

In [8]:
# log1p on the horizontal resolution; the cell displays the skewness after the
# transform.
df_skewed['Resolution_width(px)'] = df['Resolution_width(px)'].pipe(np.log1p)
df_skewed['Resolution_width(px)'].skew()

np.float64(0.35976141534850126)

In [9]:
# log1p on the rear camera megapixels; the cell displays the skewness after the
# transform.
df_skewed['Rear_Camera(MP)'] = df['Rear_Camera(MP)'].pipe(np.log1p)
df_skewed['Rear_Camera(MP)'].skew()

np.float64(0.4381939389264018)

In [10]:
# log1p on the front camera megapixels; the cell displays the skewness after the
# transform.
df_skewed['Front_Camera(MP)'] = df['Front_Camera(MP)'].pipe(np.log1p)
df_skewed['Front_Camera(MP)'].skew()

np.float64(-0.48026711434010666)

In [11]:
# log1p on RAM; the cell displays the skewness after the transform.
df_skewed['RAM(GB)'] = df['RAM(GB)'].pipe(np.log1p)
df_skewed['RAM(GB)'].skew()

np.float64(-0.29392846803760975)

In [12]:
# Number of SIMs is left-skewed (negative skewness in the report above), so it
# is raised to the 4th power instead of being log-transformed. The cell displays
# the skewness after the transform.
df_skewed['Number of SIMs'] = df['Number of SIMs'].pow(4)
df_skewed['Number of SIMs'].skew()

np.float64(-0.8191605145901111)

In [13]:
# Preview the transformed columns collected so far.
df_skewed.head(5)

Unnamed: 0,Price,Internal_storage(GB),Resolution_width(px),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,10.985276,5.549076,7.273093,3.89182,2.833213,2.564949,16
1,10.23996,4.174387,6.985642,4.174387,2.833213,1.94591,16
2,11.579658,4.174387,7.125283,2.564949,2.564949,1.609438,16
3,11.049317,4.174387,6.72022,2.564949,2.564949,1.609438,16
4,10.819598,4.859812,6.985642,2.564949,3.496508,1.94591,1


In [14]:

# Columns that were transformed in the cells above, listed in the order they
# were added to df_skewed so the merged frame keeps the same column order.
transformed_columns = [
    'Price',
    'Internal_storage(GB)',
    'Resolution_width(px)',
    'Rear_Camera(MP)',
    'Front_Camera(MP)',
    'RAM(GB)',
    'Number of SIMs',
]

# Untransformed columns come from the original frame.
df_concat = df.drop(columns=transformed_columns)

# Take only the transformed columns from df_skewed. On a first run that is the
# whole frame; on a re-run df_skewed already holds the merged result, and
# selecting the subset stops every column from being concatenated a second time.
df_skewed = pd.concat([df_concat, df_skewed[transformed_columns]], axis=1)

# The merge must neither duplicate nor lose columns relative to the input frame.
assert not df_skewed.columns.duplicated().any(), "duplicate columns after merge"
assert df_skewed.shape[1] == df.shape[1], "column count changed after merge"

In [15]:
# Preview the merged frame: untransformed columns followed by transformed ones.
df_skewed.head(5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Processor,Operating system,Resolution_height(px),Price,Internal_storage(GB),Resolution_width(px),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,4085,7,8,android,3120.0,10.985276,5.549076,7.273093,3.89182,2.833213,2.564949,16
1,realme,4000,7,8,android,2400.0,10.23996,4.174387,6.985642,4.174387,2.833213,1.94591,16
2,apple,3969,7,6,ios,2688.0,11.579658,4.174387,7.125283,2.564949,2.564949,1.609438,16
3,apple,3110,6,6,ios,1792.0,11.049317,4.174387,6.72022,2.564949,2.564949,1.609438,16
4,lg,4000,6,8,android,2340.0,10.819598,4.859812,6.985642,2.564949,3.496508,1.94591,1


In [16]:
# Column count of the original frame, for comparison with the transformed frame.
df.shape[1]

13

In [17]:
# Column count of the transformed frame; it should equal the original's.
df_skewed.shape[1]

13

In [18]:
# Re-run the skewness ranking on the transformed frame to compare it with the
# report on the raw data.
numberical_features = df_skewed.select_dtypes(include=[np.number]).columns
transformed_skewness = df_skewed[numberical_features].skew()
transformed_skewness = transformed_skewness.sort_values(ascending=False)
print('The skewness of the numerical features are:\n\n', transformed_skewness)


The skewness of the numerical features are:

 Resolution_height(px)    0.812623
Rear_Camera(MP)          0.438194
Price                    0.381980
Resolution_width(px)     0.359761
Battery_capacity(mAh)    0.154700
Internal_storage(GB)     0.096628
RAM(GB)                 -0.293928
Front_Camera(MP)        -0.480267
Processor               -0.509088
Screen_size(inches)     -0.695029
Number of SIMs          -0.819161
dtype: float64


In [19]:
# Persist the skew-corrected frame; the row index is not written to the file.
skewed_data_path = '../data/skewed_data.csv'
df_skewed.to_csv(skewed_data_path, index=False)

In [23]:
# Count missing values per column of the transformed frame.
df_skewed.isna().sum()

Brand                    0
Battery_capacity(mAh)    0
Screen_size(inches)      0
Processor                0
Operating system         0
Resolution_height(px)    0
Price                    0
Internal_storage(GB)     0
Resolution_width(px)     0
Rear_Camera(MP)          0
Front_Camera(MP)         0
RAM(GB)                  0
Number of SIMs           0
dtype: int64

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric columns of the skew-corrected frame on a copy, so
# df_skewed itself stays untouched.
# NOTE(review): the scaler is fitted on every row; if a train/test split happens
# downstream, fitting on the training rows only would avoid leakage - confirm.
scaler = StandardScaler()
df_skewed_scaled = df_skewed.copy()

# Select numeric columns with the same np.number filter used for the skewness
# reports, so the selection does not depend on how text columns are typed.
# Price is skipped, as before (presumably the prediction target - confirm).
feature_columns = [
    column
    for column in df_skewed_scaled.select_dtypes(include=[np.number]).columns
    if column != 'Price'
]

# Fit once over all feature columns. StandardScaler standardises each column
# independently, so the scaled values match per-column fits (up to
# floating-point rounding), while `scaler` now keeps the mean and scale of every
# feature instead of only the last column it was fitted on.
df_skewed_scaled[feature_columns] = scaler.fit_transform(df_skewed_scaled[feature_columns])

In [21]:
# Preview the standardised frame; Price is left in its log1p form.
df_skewed_scaled.head(5)

Unnamed: 0,Brand,Battery_capacity(mAh),Screen_size(inches),Processor,Operating system,Resolution_height(px),Price,Internal_storage(GB),Resolution_width(px),Rear_Camera(MP),Front_Camera(MP),RAM(GB),Number of SIMs
0,oneplus,0.860535,2.239175,0.841181,android,3.103923,10.985276,2.235219,0.90039,1.903176,0.948653,2.472643,0.353286
1,realme,0.773224,2.239175,0.841181,android,1.810273,10.23996,0.840899,0.302784,2.32362,0.948653,1.286993,0.353286
2,apple,0.741382,2.239175,-0.07785,ios,2.327733,11.579658,0.840899,0.593096,-0.071134,0.63,0.642546,0.353286
3,apple,-0.140971,0.741449,-0.07785,ios,0.717857,11.049317,0.840899,-0.249024,-0.071134,0.63,0.642546,0.353286
4,lg,0.773224,0.741449,0.841181,android,1.702469,10.819598,1.536112,0.302784,-0.071134,1.736537,1.286993,-2.53118


In [22]:
# Persist the standardised frame; the row index is not written to the file.
scaled_data_path = '../data/scaled_skewed_data.csv'
df_skewed_scaled.to_csv(scaled_data_path, index=False)