# MACHINE LEARNING PROJECT: MOBILE PHONE PRICE PREDICTION

# Data Splitting

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw mobile phone dataset from the local "train.csv" file.
data=pd.read_csv("train.csv")

In [3]:
# Preview the first five rows of the raw data.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# List the column names of the dataset.
data.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

Here are the attributes of our dataset:

       battery_power: Total energy a battery can store in one time measured in mAh
       blue: Has bluetooth or not
       clock_speed: speed at which microprocessor executes instructions
       dual_sim: Has dual sim support or not
       fc: Front Camera mega pixels
       four_g: Has 4G or not
       int_memory: Internal Memory in Gigabytes
       m_dep: Mobile Depth in cm
       mobile_wt: Weight of mobile phone
       n_cores: Number of cores of processor
       pc: Primary Camera mega pixels
       px_height: Pixel Resolution Height
       px_width: Pixel Resolution Width
       ram: Random Access Memory in Megabytes
       sc_h: Screen Height of mobile in cm
       sc_w: Screen Width of mobile in cm
       talk_time: longest time that a single battery charge will last when you are talking on the phone
       three_g: Has 3G or not
       touch_screen: Has touch screen or not
       wifi: Has wifi or not
       price_range: This is the target variable with value of 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost)

In [5]:
# Summarize column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1238.5185,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,...,645.108,1251.5155,2124.213,12.3065,5.767,11.011,0.7615,0.503,0.507,1.5
std,439.418206,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,...,443.780811,432.199447,1084.732044,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076,1.118314
min,501.0,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,...,0.0,500.0,256.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,851.75,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,...,282.75,874.75,1207.5,9.0,2.0,6.0,1.0,0.0,0.0,0.75
50%,1226.0,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,...,564.0,1247.0,2146.5,12.0,5.0,11.0,1.0,1.0,1.0,1.5
75%,1615.25,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,...,947.25,1633.0,3064.5,16.0,9.0,16.0,1.0,1.0,1.0,2.25
max,1998.0,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,...,1960.0,1998.0,3998.0,19.0,18.0,20.0,1.0,1.0,1.0,3.0


We can see that the following attributes are dummy (binary) variables: blue, dual_sim, four_g, three_g, touch_screen, wifi.
We scale the values of all attributes to the same range [0, 1] (min-max normalization).

In [7]:
# Min-max scale every column to [0, 1].
# NOTE: the min/max statistics are computed on the full dataset (before it is split into
# train/validation/test), and the target column price_range is scaled as well.
col_min = data.min()
col_range = data.max() - col_min
# A constant column has zero range; dividing by it would give NaN (0/0).
# Dividing by 1 instead maps such a column to 0 and leaves all other columns unchanged.
data_norm = (data - col_min) / col_range.replace(0, 1)

In [8]:
# Summary statistics of the min-max scaled frame.
data_norm.describe()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,0.492664,0.495,0.4089,0.5095,0.226816,0.5215,0.484621,0.446389,0.502075,0.502929,...,0.329137,0.501679,0.499255,0.521893,0.320389,0.500611,0.7615,0.503,0.507,0.5
std,0.293533,0.5001,0.326402,0.500035,0.228497,0.499662,0.292673,0.320462,0.294997,0.326834,...,0.226419,0.288518,0.28988,0.300946,0.242022,0.303553,0.426273,0.500116,0.500076,0.372771
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.234302,0.0,0.08,0.0,0.052632,0.0,0.225806,0.111111,0.241667,0.285714,...,0.14426,0.250167,0.254276,0.285714,0.111111,0.222222,1.0,0.0,0.0,0.25
50%,0.484302,0.0,0.4,1.0,0.157895,1.0,0.483871,0.444444,0.508333,0.428571,...,0.287755,0.498665,0.505211,0.5,0.277778,0.5,1.0,1.0,1.0,0.5
75%,0.744322,1.0,0.68,1.0,0.368421,1.0,0.741935,0.777778,0.75,0.857143,...,0.483291,0.756342,0.750534,0.785714,0.5,0.777778,1.0,1.0,1.0,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Collapse the scaled target into two classes:
# values below 0.5 become 0, every other value becomes 1.
is_low = data_norm['price_range'].lt(0.5)
data_norm['price_range'] = (~is_low).astype('int64')

We use 0 to represent low prices (original price_range 0 or 1) and 1 to represent high prices (original price_range 2 or 3).

In [10]:
# Preview data_norm; price_range now holds the binary label.
data_norm.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0.227789,0.0,0.68,0.0,0.052632,0.0,0.080645,0.555556,0.9,0.142857,...,0.010204,0.170895,0.612774,0.285714,0.388889,0.944444,0.0,0.0,1.0,0
1,0.347361,1.0,0.0,1.0,0.0,1.0,0.822581,0.666667,0.466667,0.285714,...,0.461735,0.993324,0.634687,0.857143,0.166667,0.277778,1.0,1.0,0.0,1
2,0.041416,1.0,0.0,1.0,0.105263,1.0,0.629032,0.888889,0.541667,0.571429,...,0.644388,0.811749,0.627205,0.428571,0.111111,0.388889,1.0,1.0,0.0,1
3,0.076152,1.0,0.8,0.0,0.0,0.0,0.129032,0.777778,0.425,0.714286,...,0.620408,0.858478,0.671566,0.785714,0.444444,0.5,1.0,0.0,0.0,1
4,0.881764,1.0,0.28,0.0,0.684211,1.0,0.677419,0.555556,0.508333,0.142857,...,0.616327,0.4753,0.308658,0.214286,0.111111,0.722222,1.0,1.0,0.0,0


In [11]:
# Shuffle the rows with a fixed seed so the split (and every result derived from it)
# is reproducible under Restart & Run All.
SEED = 42
data = data_norm.sample(frac=1.0, random_state=SEED)
rows, cols = data.shape
# Row offsets at 10% and 20% of the shuffled rows; they delimit the splits taken below.
split_index_1 = int(rows * 0.1)
split_index_2 = int(rows * 0.2)

In [12]:
# Carve the shuffled frame into three consecutive slices:
# rows [0, split_index_1) -> test, [split_index_1, split_index_2) -> validation, the rest -> training.
data_test: pd.DataFrame = data.iloc[:split_index_1]
data_validate: pd.DataFrame = data.iloc[split_index_1:split_index_2]
data_train: pd.DataFrame = data.iloc[split_index_2:]

In [13]:
# Persist each split to its own CSV file, without the (shuffled) row index.
for frame, path in (
    (data_test, "test.csv"),
    (data_validate, "valid.csv"),
    (data_train, "train1.csv"),
):
    frame.to_csv(path, index=False)

# The Implementation of the Machine Learning Model

## Naive Bayes

### Data preprocessing

In [14]:
# Reload the three splits written to disk by the data-splitting step.
train_data, valid_data, test_data = (
    pd.read_csv(path) for path in ("train1.csv", "valid.csv", "test.csv")
)

In [15]:
# Preview the first five rows of the training split.
train_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,0.902472,0.0,0.36,0.0,0.210526,1.0,0.129032,0.666667,0.966667,0.0,...,0.295918,0.44259,0.992517,0.571429,0.055556,0.5,1.0,0.0,1.0,1
1,0.04008,1.0,0.2,0.0,0.0,0.0,0.145161,0.0,0.691667,1.0,...,0.25102,0.451268,0.845804,0.785714,0.111111,0.166667,1.0,1.0,1.0,1
2,0.865063,1.0,0.72,0.0,0.210526,0.0,0.677419,0.888889,0.866667,0.142857,...,0.522449,0.576101,0.887493,0.857143,0.388889,0.111111,1.0,1.0,0.0,1
3,0.625919,1.0,0.52,0.0,0.157895,0.0,0.225806,0.555556,0.741667,1.0,...,0.438265,0.244993,0.531801,0.642857,0.611111,0.722222,0.0,0.0,0.0,1
4,0.098196,1.0,0.0,1.0,0.473684,1.0,0.83871,0.333333,0.241667,0.571429,...,0.588265,0.879172,0.766435,0.0,0.0,0.611111,1.0,0.0,0.0,1


To implement naive Bayes, we need to discretize the continuous attributes into intervals and bin the large-valued attributes into ranges.
The dummy variables are already binary, so we only need to handle the continuous and large-valued attributes.

We discretize the continuous attributes into two intervals: [0, 0.5) and [0.5, 1].

In [16]:
def discretize(frame, threshold=0.5):
    """Map every cell of ``frame`` to 0 if it is below ``threshold``, else to 1.

    Vectorized replacement for ``DataFrame.applymap(lambda x: 0 if x < 0.5 else 1)``;
    ``applymap`` is deprecated since pandas 2.1. Cells that are not below the
    threshold (including NaN, for which the comparison is False) become 1,
    exactly as with the original lambda.

    Parameters
    ----------
    frame : pd.DataFrame
        Numeric frame to discretize.
    threshold : float, default 0.5
        Cut point between the two bins.

    Returns
    -------
    pd.DataFrame
        New frame of the same shape holding int64 values 0 or 1.
    """
    return (~frame.lt(threshold)).astype('int64')


train_data = discretize(train_data)
valid_data = discretize(valid_data)
test_data = discretize(test_data)

### Set up model 

In [17]:
# The label is the last column of the training frame.
yTrain = train_data.iloc[:, -1]
classCounts = yTrain.value_counts()
# Class priors with Laplace smoothing: (count + 1) / (number of samples + number of classes).
yTrainCounts = (classCounts + 1) / (yTrain.size + classCounts.size)

In [18]:
# Display the Laplace-smoothed class priors.
yTrainCounts

0    0.501873
1    0.498127
Name: price_range, dtype: float64

In [19]:
# Each class entry starts with its smoothed prior; the conditional tables are filled in below.
retModel = {
    nameClass: {'PClass': prior, 'PFeature': {}}
    for nameClass, prior in yTrainCounts.items()
}

# Every column except the last one (the label) is a feature.
propNamesAll = train_data.columns[:-1]
# All distinct values each feature takes anywhere in the training set.
allPropByFeature = {
    nameFeature: list(train_data[nameFeature].value_counts().index)
    for nameFeature in propNamesAll
}

labelColumn = train_data.columns[-1]
for nameClass, group in train_data.groupby(labelColumn):
    for nameFeature in propNamesAll:
        propDatas = group[nameFeature]
        knownValues = allPropByFeature[nameFeature]
        # Laplace smoothing: (count + 1) / (class size + number of distinct feature values).
        denominator = propDatas.size + len(knownValues)
        # Frequency summary within this class, converted to smoothed probabilities.
        eachClassPFeature = {
            value: (count + 1) / denominator
            for value, count in propDatas.value_counts().items()
        }
        # A value that never occurs in this class counts as zero occurrences.
        for value in knownValues:
            eachClassPFeature.setdefault(value, 1 / denominator)
        retModel[nameClass]['PFeature'][nameFeature] = eachClassPFeature

In [20]:
# Display the fitted model: per-class prior ('PClass') and per-feature conditional probabilities ('PFeature').
retModel

{0: {'PClass': 0.50187265917603,
  'PFeature': {'battery_power': {0: 0.5801242236024845,
    1: 0.41987577639751555},
   'blue': {0: 0.5006211180124224, 1: 0.4993788819875776},
   'clock_speed': {0: 0.591304347826087, 1: 0.40869565217391307},
   'dual_sim': {1: 0.515527950310559, 0: 0.484472049689441},
   'fc': {0: 0.8546583850931677, 1: 0.1453416149068323},
   'four_g': {1: 0.5304347826086957, 0: 0.46956521739130436},
   'int_memory': {0: 0.5229813664596273, 1: 0.47701863354037266},
   'm_dep': {0: 0.5366459627329192, 1: 0.46335403726708074},
   'mobile_wt': {1: 0.5055900621118012, 0: 0.49440993788819876},
   'n_cores': {0: 0.5279503105590062, 1: 0.4720496894409938},
   'pc': {1: 0.5105590062111801, 0: 0.4894409937888199},
   'px_height': {0: 0.7987577639751553, 1: 0.20124223602484473},
   'px_width': {0: 0.5440993788819876, 1: 0.45590062111801244},
   'ram': {0: 0.9204968944099379, 1: 0.07950310559006211},
   'sc_h': {1: 0.5838509316770186, 0: 0.4161490683229814},
   'sc_w': {0: 0.73

In [21]:
def predictBySeries(data,model):
    """Return the class with the highest log-posterior score for one sample.

    Parameters
    ----------
    data : object with ``.items()`` (e.g. ``pd.Series`` or ``dict``)
        Maps feature name -> discretized value for a single row. Entries whose
        name is not a feature of the model (such as the label column) are ignored.
    model : dict
        ``{class: {'PClass': prior, 'PFeature': {feature: {value: probability}}}}``.

    Returns
    -------
    The key of the best-scoring class (the first one wins on ties), or ``None``
    if ``model`` is empty.
    """
    curMaxRate = None
    curClassSelect = None
    for nameClass, infoModel in model.items():
        # Sum log-probabilities instead of multiplying many small numbers,
        # which would underflow towards zero.
        rate = np.log(infoModel['PClass'])
        PFeature = infoModel['PFeature']

        for nameFeature, val in data.items():
            propsRate = PFeature.get(nameFeature)
            if not propsRate:
                # Not a model feature (e.g. the label column): contributes nothing.
                continue
            prob = propsRate.get(val)
            if not prob:
                # Value has no (or zero) probability in the model. np.log(0) is -inf
                # and would wipe out all other evidence for this class, so the
                # feature is skipped instead.
                continue
            rate += np.log(prob)

        if curMaxRate is None or rate > curMaxRate:
            curMaxRate = rate
            curClassSelect = nameClass

    return curClassSelect

In [22]:
def predict(data,model):
    """Predict class labels with the naive Bayes ``model``.

    A single ``pd.Series`` is treated as one sample and yields one label;
    anything else is treated as a frame and is predicted row by row via
    ``data.apply(..., axis=1)``.
    """
    if not isinstance(data, pd.Series):
        return data.apply(lambda row: predictBySeries(row, model), axis=1)
    return predictBySeries(data, model)

In [23]:
# Predict a class label for every row of the test split.
predict(test_data,retModel)

0      1
1      1
2      1
3      0
4      0
      ..
195    1
196    0
197    0
198    0
199    1
Length: 200, dtype: int64

In [24]:
# Compare validation-set predictions with the true labels (last column) and report accuracy.
# Column keys: '预测值' = predicted value, '正取值' = true value; '正确率' = accuracy.
validPredicted = predict(valid_data, retModel)
validActual = valid_data.iloc[:, -1]
valid = pd.DataFrame({'预测值': validPredicted, '正取值': validActual})
print(valid)
validHits = (valid['预测值'] == valid['正取值']).sum()
print('正确率:%f%%' % (validHits * 100.0 / len(valid)))

     预测值  正取值
0      1    1
1      1    1
2      0    0
3      1    1
4      1    1
..   ...  ...
195    1    1
196    0    0
197    0    1
198    0    0
199    1    1

[200 rows x 2 columns]
正确率:85.000000%


In [25]:
# Compare test-set predictions with the true labels (last column) and report accuracy.
# Column keys: '预测值' = predicted value, '正取值' = true value; '正确率' = accuracy.
testPredicted = predict(test_data, retModel)
testActual = test_data.iloc[:, -1]
test = pd.DataFrame({'预测值': testPredicted, '正取值': testActual})
print(test)
testHits = (test['预测值'] == test['正取值']).sum()
print('正确率:%f%%' % (testHits * 100.0 / len(test)))

     预测值  正取值
0      1    1
1      1    1
2      1    0
3      0    0
4      0    0
..   ...  ...
195    1    1
196    0    1
197    0    0
198    0    0
199    1    1

[200 rows x 2 columns]
正确率:90.000000%
