# Data Splitting

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the full labelled dataset from the local CSV; later cells shuffle and split it.
data = pd.read_csv("train.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
data.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# List the column labels so they can be matched against the attribute descriptions below.
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

Here are the attributes of our dataset:

       battery_power: Total energy a battery can store at one time, measured in mAh
       blue: Has bluetooth or not
       clock_speed: Speed at which the microprocessor executes instructions
       dual_sim: Has dual sim support or not
       fc: Front Camera mega pixels
       four_g: Has 4G or not
       int_memory: Internal Memory in Gigabytes
       m_dep: Mobile Depth in cm
       mobile_wt: Weight of mobile phone
       n_cores: Number of cores of processor
       pc: Primary Camera mega pixels
       px_height: Pixel Resolution Height
       px_width: Pixel Resolution Width
       ram: Random Access Memory in Megabytes
       sc_h: Screen Height of mobile in cm
       sc_w: Screen Width of mobile in cm
       talk_time: Longest time that a single battery charge will last when you are talking
       three_g: Has 3G or not
       touch_screen: Has touch screen or not
       wifi: Has wifi or not
       price_range: This is the target variable with value of 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost)

In [6]:
# Collapse the four-level target (0-3) into two classes: 0/1 -> "low", 2/3 -> "high".
# The dtype guard keeps this cell idempotent: once the column already holds the
# string labels, repeating the `< 2` comparison would raise a TypeError.
if pd.api.types.is_numeric_dtype(data["price_range"]):
    # Vectorised equivalent of applying `"low" if x < 2 else "high"` row by row.
    data["price_range"] = np.where(data["price_range"] < 2, "low", "high")

In [7]:
# Preview the first five rows again to confirm price_range now holds "low"/"high".
data.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,low
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,high
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,high
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,high
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,low


In [8]:
# Shuffle every row before slicing so the test/validation/train partitions are random.
# A fixed seed makes the shuffle (and the CSV files written from it) reproducible
# under Restart Kernel -> Run All; without it every run produced different splits.
SPLIT_SEED = 42
data = data.sample(frac=1.0, random_state=SPLIT_SEED)
rows, cols = data.shape
# Cumulative cut points into the shuffled frame: rows before split_index_1 form the
# test set, rows between the two indices the validation set, and the rest the training set.
split_index_1 = int(rows * 0.1)
split_index_2 = int(rows * 0.2)

In [9]:
# Carve the shuffled frame into three contiguous, non-overlapping row ranges
# (all columns are kept in each partition).
data_test: pd.DataFrame = data.iloc[:split_index_1]
data_validate: pd.DataFrame = data.iloc[split_index_1:split_index_2]
data_train: pd.DataFrame = data.iloc[split_index_2:]

In [10]:
# Persist each partition to its own CSV; index=False drops the shuffled row index
# so it is not written as an extra column.
for split_frame, csv_name in (
    (data_test, "test.csv"),
    (data_validate, "valid.csv"),
    (data_train, "train1.csv"),
):
    split_frame.to_csv(csv_name, index=False)

# Data Preprocessing

In [11]:
# Reload the three partitions from the CSV files written by the splitting section.
train_data, valid_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ("train1.csv", "valid.csv", "test.csv")
)

In [15]:
# (rows, columns) of the training partition read back from train1.csv.
train_data.shape

(1600, 21)

In [16]:
# (rows, columns) of the validation partition read back from valid.csv.
valid_data.shape

(200, 21)

In [18]:
# (rows, columns) of the test partition read back from test.csv.
test_data.shape

(200, 21)