In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings
import random

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

Task: To predict whether the user likes the mobile phone or not. <br>
Assumption: If the average rating of mobile >= threshold, then the user likes it, otherwise not.

<b>Missing values:</b><br>
'Also Known As'(459),'Applications'(421),'Audio Features'(437),'Bezel-less display'(266),'Browser'(449),'Build Material'(338),'Co-Processor'(451),'Display Colour'(457),'Mobile High-Definition Link(MHL)'(472),'Music'(447)
'Email','Fingerprint Sensor Position'(174),'Games'(446),'HDMI'(454),'Heart Rate Monitor'(467),'IRIS Scanner'(467),
'Optical Image Stabilisation'(219),'Other Facilities'(444),'Phone Book'(444),'Physical Aperture'(87),'Quick Charging'(122),'Ring Tone'(444),'Ruggedness'(430),SAR Value(315),'SIM 3'(472),'SMS'(470)', 'Screen Protection'(229),'Screen to Body Ratio (claimed by the brand)'(428),'Sensor'(242),'Software Based Aperture'(473),
'Special Features'(459),'Standby time'(334),'Stylus'(473),'TalkTime'(259), 'USB Type-C'(374),'Video Player'(456),
'Video Recording Features'(458),'Waterproof'(398),'Wireless Charging','USB OTG Support'(159), 'Video ,'Recording'(113),'Java'(471),'Browser'(448)

<b>Very low variance:</b><br>
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'


<b>Multivalued:</b><br>
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

<b>Not important:</b><br>
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

<b>Doubtful:</b><br>
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing),
'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'
    
<b>To check:</b><br>
'Display Type','Expandable Memory','FM Radio'

<b>High Correlation with other features</b><br>
'SIM Slot(s)' high correlation with SIM1
'Weight' has high high correlation with capacity , screen-to-body ratio
'Height' - screen size is also there
    
<b>Given a mobile, we can't directly get these features</b><br>
'Rating Count', 'Review Count'

<b>Keeping:</b><br>
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',


In [2]:
# read data from file
train = pd.read_csv("../input/padhai-perceptron-like-unlike-classification/train.csv") 
test = pd.read_csv("../input/padhai-perceptron-like-unlike-classification/test.csv")

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98


In [3]:
miss_check=pd.DataFrame(train.isnull().sum(),columns=["Count"])
type(miss_check)
miss_check.describe()

Unnamed: 0,Count
count,99.0
mean,135.10101
std,142.505316
min,0.0
25%,14.0
50%,56.0
75%,319.5
max,354.0


In [4]:
miss_check["% Missing"]=(miss_check["Count"]/355)*100
print(miss_check)

                   Count  % Missing
PhoneId                0   0.000000
Also Known As        338  95.211268
Applications         312  87.887324
Architecture          82  23.098592
Aspect Ratio         151  42.535211
...                  ...        ...
Wi-Fi                 21   5.915493
Wi-Fi Features        26   7.323944
Width                 15   4.225352
Wireless Charging    330  92.957746
Rating                 0   0.000000

[99 rows x 2 columns]


In [5]:
def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Browser'
                         'Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features

In [6]:
train = data_clean(train)
test = data_clean(test)

removing all those data points in which more than 15 features are missing 

In [7]:
train = train[(train.isnull().sum(axis=1) <= 15)]
# You shouldn't remove data points from test set
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [8]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 47
Number of data points in test: 119
Number of features in test: 46


# Filling Missing values

In [9]:
def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass


In [10]:
def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

In [11]:
train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 51
Number of data points in test: 119
Number of features in test: 50


Not very important feature

In [12]:
def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data

In [13]:
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17


In [14]:
# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

(460, 17)
Index(['PhoneId', 'Weight', 'Screen to Body Ratio (calculated)', 'Height',
       'SIM 2', 'SIM Slot(s)', 'Num_cores', 'Brand', 'Capacity', 'os_name',
       'RAM', 'Pixel Density', 'Resolution', 'Sim1', 'Internal Memory',
       'Processor_frequency', 'Screen Size'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Weight', 'Screen to Body Ratio (calculated)', 'Height',
       'Capacity', 'RAM', 'Pixel Density', 'Resolution', 'Internal Memory',
       'Processor_frequency', 'Screen Size', 'SIM 2_2G', 'SIM 2_3G',
       'SIM 2_4G', 'SIM 2_Other', 'SIM Slot(s)_Dual SIM, GSM+CDMA',
       'SIM Slot(s)_Dual SIM, GSM+GSM',
       'SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE',
       'SIM Slot(s)_Single SIM, GSM', 'Num_cores_312', 'Num_cores_Deca',
       'Num_cores_Dual', 'Num_cores_Hexa', 'Num_cores_Octa', 'Num_cores_Other',
       'Num_cores_Quad', 'Num_cores_Tru-Octa', 'Brand_10.or', 'Brand_Apple',
       'Brand_Asus', 'Brand_Billion', 'Brand_Blackberry', 'Brand_Comio',
      

In [15]:
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [16]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 341
Number of features in train: 88
Number of data points in test: 119
Number of features in test: 87


In [17]:
train_new.head()

Unnamed: 0,PhoneId,Weight,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,...,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,Sim1_2G,Sim1_3G,Sim1_4G,Rating
0,0,182,80.68,157.9,4000,4,403,20,64,1.8,...,0,0,0,0,0,0,0,0,1,4.5
1,1,168,80.85,156.2,4230,3,271,8,32,1.8,...,0,0,0,0,0,0,0,0,1,4.5
2,2,168,83.68,157.0,3500,3,409,25,32,2.1,...,0,0,0,0,0,0,0,0,1,4.4
3,4,169,74.78,159.8,3300,4,411,24,64,2.2,...,0,0,0,0,0,0,0,0,1,4.3
4,5,175,84.23,160.4,3750,4,396,16,64,2.2,...,0,0,0,0,0,0,0,0,1,4.4


In [18]:
test_new.head()

Unnamed: 0,PhoneId,Weight,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,...,os_name_Android,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,Sim1_2G,Sim1_3G,Sim1_4G
0,3,168,80.85,156.2,4230,2,271,5,16,1.8,...,1,0,0,0,0,0,0,0,0,1
1,11,205,81.6,156.0,5000,4,402,12,64,1.8,...,1,0,0,0,0,0,0,0,0,1
2,13,169,83.84,156.7,3500,6,409,25,64,2.0,...,1,0,0,0,0,0,0,0,0,1
3,16,169,83.84,156.7,3500,4,409,16,64,2.0,...,1,0,0,0,0,0,0,0,0,1
4,19,181,77.43,158.6,4000,4,403,20,64,1.8,...,1,0,0,0,0,0,0,0,0,1


# Feature Engineering

In [19]:
train_new['class']=train_new['Rating'].map(lambda x: 1 if x>=4 else 0)
train_new['class'].value_counts()

1    238
0    103
Name: class, dtype: int64

In [20]:
train_new

Unnamed: 0,PhoneId,Weight,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,...,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,Sim1_2G,Sim1_3G,Sim1_4G,Rating,class
0,0,182,80.68,157.9,4000,4,403,20,64,1.8,...,0,0,0,0,0,0,0,1,4.5,1
1,1,168,80.85,156.2,4230,3,271,8,32,1.8,...,0,0,0,0,0,0,0,1,4.5,1
2,2,168,83.68,157.0,3500,3,409,25,32,2.1,...,0,0,0,0,0,0,0,1,4.4,1
3,4,169,74.78,159.8,3300,4,411,24,64,2.2,...,0,0,0,0,0,0,0,1,4.3,1
4,5,175,84.23,160.4,3750,4,396,16,64,2.2,...,0,0,0,0,0,0,0,1,4.4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,465,141,67.20,142.7,2420,1,220,5,16,1.2,...,0,0,0,0,0,0,0,1,3.8,0
337,466,165,69.95,156.5,3000,4,401,13,64,1.8,...,0,0,0,0,0,0,0,1,3.9,0
338,468,160,68.52,149.7,3000,3,282,8,16,1.4,...,0,0,0,0,0,0,0,1,4.1,1
339,470,156,63.93,132.0,1800,1,218,2,8,1.3,...,0,0,0,0,0,0,0,1,3.6,0


In [21]:
train_new_1=train_new.drop('Rating',axis=1)

In [22]:
train_new_2=train_new_1.set_index('PhoneId')

In [23]:
train_new_2.head()

Unnamed: 0_level_0,Weight,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,Screen Size,...,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,Sim1_2G,Sim1_3G,Sim1_4G,class
PhoneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,182,80.68,157.9,4000,4,403,20,64,1.8,6.26,...,0,0,0,0,0,0,0,0,1,1
1,168,80.85,156.2,4230,3,271,8,32,1.8,6.2,...,0,0,0,0,0,0,0,0,1,1
2,168,83.68,157.0,3500,3,409,25,32,2.1,6.3,...,0,0,0,0,0,0,0,0,1,1
4,169,74.78,159.8,3300,4,411,24,64,2.2,6.0,...,0,0,0,0,0,0,0,0,1,1
5,175,84.23,160.4,3750,4,396,16,64,2.2,6.5,...,0,0,0,0,0,0,0,0,1,1


In [24]:
correlation_dataframe=train_new_2.corr(method='pearson')

In [25]:
corr_df=pd.DataFrame(correlation_dataframe['class'])
corr_df

Unnamed: 0,class
Weight,0.073795
Screen to Body Ratio (calculated),0.229852
Height,0.212545
Capacity,0.145891
RAM,-0.176451
...,...
os_name_iOS,0.159801
Sim1_2G,0.050194
Sim1_3G,-0.129977
Sim1_4G,0.057015


In [26]:
corr_df_fin=corr_df.drop('class')
corr_df_fin

Unnamed: 0,class
Weight,0.073795
Screen to Body Ratio (calculated),0.229852
Height,0.212545
Capacity,0.145891
RAM,-0.176451
...,...
os_name_Tizen,-0.082439
os_name_iOS,0.159801
Sim1_2G,0.050194
Sim1_3G,-0.129977


In [27]:
corr_df_fin['class']=corr_df_fin['class'].map(lambda x: abs(x))
corr_df_fin.describe()

Unnamed: 0,class
count,83.0
mean,0.103003
std,0.067493
min,0.005152
25%,0.050362
50%,0.082439
75%,0.139882
max,0.326939


In [28]:
corr_df_fin_v2=corr_df_fin[corr_df_fin['class']>=0.1]
corr_df_fin_v2

Unnamed: 0,class
Screen to Body Ratio (calculated),0.229852
Height,0.212545
Capacity,0.145891
RAM,0.176451
Pixel Density,0.127747
Resolution,0.220252
Internal Memory,0.227565
Processor_frequency,0.326939
Screen Size,0.264962
SIM 2_2G,0.191743


In [29]:
final_var=list(corr_df_fin_v2.index.values)
final_var

['Screen to Body Ratio (calculated)',
 'Height',
 'Capacity',
 'RAM',
 'Pixel Density',
 'Resolution',
 'Internal Memory',
 'Processor_frequency',
 'Screen Size',
 'SIM 2_2G',
 'SIM 2_4G',
 'SIM Slot(s)_Dual SIM, GSM+GSM',
 'SIM Slot(s)_Dual SIM, GSM+GSM, Dual VoLTE',
 'Num_cores_Deca',
 'Num_cores_Octa',
 'Num_cores_Quad',
 'Brand_10.or',
 'Brand_Apple',
 'Brand_Blackberry',
 'Brand_Coolpad',
 'Brand_HTC',
 'Brand_InFocus',
 'Brand_Intex',
 'Brand_Karbonn',
 'Brand_LG',
 'Brand_Micromax',
 'Brand_Mobiistar',
 'Brand_OPPO',
 'Brand_Samsung',
 'Brand_Vivo',
 'Brand_Xiaomi',
 'Brand_Yu',
 'os_name_Blackberry',
 'os_name_KAI',
 'os_name_iOS',
 'Sim1_3G']

In [30]:
x_train=train_new_2[final_var]
y_train=train_new_2['class']
x_train

Unnamed: 0_level_0,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,Screen Size,SIM 2_2G,...,Brand_Mobiistar,Brand_OPPO,Brand_Samsung,Brand_Vivo,Brand_Xiaomi,Brand_Yu,os_name_Blackberry,os_name_KAI,os_name_iOS,Sim1_3G
PhoneId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,80.68,157.9,4000,4,403,20,64,1.8,6.26,0,...,0,0,0,0,1,0,0,0,0,0
1,80.85,156.2,4230,3,271,8,32,1.8,6.20,0,...,0,0,0,0,0,0,0,0,0,0
2,83.68,157.0,3500,3,409,25,32,2.1,6.30,0,...,0,0,0,0,0,0,0,0,0,0
4,74.78,159.8,3300,4,411,24,64,2.2,6.00,0,...,0,0,1,0,0,0,0,0,0,0
5,84.23,160.4,3750,4,396,16,64,2.2,6.50,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,67.20,142.7,2420,1,220,5,16,1.2,5.00,0,...,0,0,0,0,0,0,0,0,0,0
466,69.95,156.5,3000,4,401,13,64,1.8,5.50,0,...,0,0,0,0,0,0,0,0,0,0
468,68.52,149.7,3000,3,282,8,16,1.4,5.20,0,...,0,0,0,0,0,0,0,0,0,0
470,63.93,132.0,1800,1,218,2,8,1.3,4.50,1,...,0,0,0,0,0,0,0,0,0,0


In [31]:
x_train_array=x_train.values
y_train_array=y_train.values

In [32]:
x_train_array

array([[  80.68,  157.9 , 4000.  , ...,    0.  ,    0.  ,    0.  ],
       [  80.85,  156.2 , 4230.  , ...,    0.  ,    0.  ,    0.  ],
       [  83.68,  157.  , 3500.  , ...,    0.  ,    0.  ,    0.  ],
       ...,
       [  68.52,  149.7 , 3000.  , ...,    0.  ,    0.  ,    0.  ],
       [  63.93,  132.  , 1800.  , ...,    0.  ,    0.  ,    0.  ],
       [  74.21,  151.1 , 3000.  , ...,    0.  ,    0.  ,    0.  ]])

In [33]:
test=test_new.values

In [34]:
test

array([[  3.  , 168.  ,  80.85, ...,   0.  ,   0.  ,   1.  ],
       [ 11.  , 205.  ,  81.6 , ...,   0.  ,   0.  ,   1.  ],
       [ 13.  , 169.  ,  83.84, ...,   0.  ,   0.  ,   1.  ],
       ...,
       [469.  , 171.  ,  70.72, ...,   0.  ,   0.  ,   1.  ],
       [471.  , 159.  ,  68.66, ...,   0.  ,   0.  ,   1.  ],
       [473.  , 168.  ,  68.55, ...,   0.  ,   0.  ,   1.  ]])

 # Perceptron Model

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
class Perceptron():
    def __init__(self) :
        self. w=None
        self. b=None
    def model(self, x) :
        return 1 if(np.dot(self.w, x) >=self.b) else 0
    def predict(self, X) :
        y=[]
        for x in X:
            result=self.model(x) 
            y. append(result) 
        return np. array(y) 
    def fit(self, X, Y, epochs=1, lr=1) :
        self. w=[random.random() for i in range (0,X.shape[1])]
        self. b=random.random() 
        accuracy={}
        max_accuracy=0
        for i in range(epochs) :
            for x, y in zip(X, Y) :
                y_pred=self.model(x) 
                if y==1 and y_pred==0:
                    self. w=self.w+ lr*x
                    self.b=self.b+lr*1
                elif y==0 and y_pred==1:
                    self.w=self.w-lr*x
                    self.b=self.b-lr*1
            accuracy[i]=accuracy_score(self.predict(X), Y) 
            if(accuracy[i]>max_accuracy) :
                max_accuracy=accuracy[i]
                chkptw=self.w
                chkptb=self.b
        self.w=chkptw
        self.b=chkptb
        return(max_accuracy) 
                                           

In [37]:
class Perceptron1 :
    def __init__(self):
        self.w=None
        self.b=None
    def model (self,x) :
        a= np.dot(self.w,x)
        #print(a,self.b)
        if a>=self.b :
            return 1
        else :
            return 0
    def predict(self,X):
        y=[]
        for x in X:
            y.append(self.model(x))
        return np.array(y)
    def fit(self,X,Y,epochs=1,lr=1):
        self.w=np.ones(X.shape[1])
        self.b=0
        accuracy={}
        max_accuracy=0
        for i in range(epochs):
            for x,y in zip(X,Y):
                y_pred=self.model(x)
                if y==1 and y_pred==0:
                    self.w=self.w + lr*x
                    self.b=self.b + lr*1
                elif y==0 and y_pred ==1 :
                    self.w=self.w - lr*x
                    self.b=self.b - lr*1
            accuracy[i]=accuracy_score(Y,self.predict(X))
            if accuracy[i]>max_accuracy :
                max_accuracy=accuracy[i]
                chkptw=self.w
                chkptb=self.b
        self.w=chkptw
        self.b=chkptb
        return max_accuracy
        #print(max_accuracy,accuracy.values())




In [38]:
perceptron=Perceptron1()

In [39]:
x_train_array

array([[  80.68,  157.9 , 4000.  , ...,    0.  ,    0.  ,    0.  ],
       [  80.85,  156.2 , 4230.  , ...,    0.  ,    0.  ,    0.  ],
       [  83.68,  157.  , 3500.  , ...,    0.  ,    0.  ,    0.  ],
       ...,
       [  68.52,  149.7 , 3000.  , ...,    0.  ,    0.  ,    0.  ],
       [  63.93,  132.  , 1800.  , ...,    0.  ,    0.  ,    0.  ],
       [  74.21,  151.1 , 3000.  , ...,    0.  ,    0.  ,    0.  ]])

In [40]:
def fun(k) :
    max_value1=0
    ep1=0
    max_value2=0
    ep2=0
    for i in range(1,k) :
        a=perceptron.fit(x_train_array, y_train_array, epochs=i) 
        print(i, ' ', a) 
        if(a>max_value1) :
            max_value1=a
            ep1=i
    return ep1
        
        
        
        
          
        

In [41]:
max_ep=fun(50)

1   0.6979472140762464
2   0.6979472140762464
3   0.6979472140762464
4   0.7067448680351907
5   0.7067448680351907
6   0.7126099706744868
7   0.7126099706744868
8   0.7126099706744868
9   0.7126099706744868
10   0.7126099706744868
11   0.7126099706744868
12   0.7126099706744868
13   0.7126099706744868
14   0.7126099706744868
15   0.7126099706744868
16   0.7126099706744868
17   0.7126099706744868
18   0.7126099706744868
19   0.7126099706744868
20   0.7126099706744868
21   0.7126099706744868
22   0.7126099706744868
23   0.7126099706744868
24   0.7126099706744868
25   0.7126099706744868
26   0.7126099706744868
27   0.7126099706744868
28   0.7126099706744868
29   0.7126099706744868
30   0.7126099706744868
31   0.7126099706744868
32   0.7126099706744868
33   0.7126099706744868
34   0.7126099706744868
35   0.7126099706744868
36   0.7126099706744868
37   0.7126099706744868
38   0.7126099706744868
39   0.7126099706744868
40   0.7126099706744868
41   0.7126099706744868
42   0.7126099706744868
4

In [42]:
max_ep

48

In [43]:
def fun_lr(k, ep) :
    mxx=0
    lr=1
    for I in range(1, k+1) :
        a=perceptron.fit(x_train_array,y_train_array, epochs=ep, lr=I/k) 
        print(I, '  ', a) 
        if(a>mxx) :
            mxx=a
            lr=I
    return lr/k
        
  

In [44]:
max_lr=fun_lr(100,max_ep) 

1    0.7126099706744868
2    0.7126099706744868
3    0.7126099706744868
4    0.7126099706744868
5    0.7360703812316716
6    0.7155425219941349
7    0.7302052785923754
8    0.7272727272727273
9    0.7214076246334311
10    0.7126099706744868
11    0.7126099706744868
12    0.7243401759530792
13    0.7243401759530792
14    0.7126099706744868
15    0.7126099706744868
16    0.7243401759530792
17    0.7243401759530792
18    0.7243401759530792
19    0.7126099706744868
20    0.7126099706744868
21    0.718475073313783
22    0.718475073313783
23    0.718475073313783
24    0.718475073313783
25    0.718475073313783
26    0.718475073313783
27    0.718475073313783
28    0.718475073313783
29    0.7126099706744868
30    0.7126099706744868
31    0.7155425219941349
32    0.7155425219941349
33    0.7126099706744868
34    0.7126099706744868
35    0.7126099706744868
36    0.7126099706744868
37    0.7126099706744868
38    0.7126099706744868
39    0.7126099706744868
40    0.7126099706744868
41    0.712609970

In [45]:
max_lr

0.05

In [46]:
 wt_matrix=perceptron.fit(x_train_array,y_train_array,epochs=max_ep,lr=max_lr)

# Score Test

In [47]:
test_new_1=test_new.drop('PhoneId',axis=1)
test_new_1

Unnamed: 0,Weight,Screen to Body Ratio (calculated),Height,Capacity,RAM,Pixel Density,Resolution,Internal Memory,Processor_frequency,Screen Size,...,os_name_Android,os_name_Blackberry,os_name_KAI,os_name_Nokia,os_name_Other,os_name_Tizen,os_name_iOS,Sim1_2G,Sim1_3G,Sim1_4G
0,168,80.85,156.2,4230,2,271,5,16,1.8,6.20,...,1,0,0,0,0,0,0,0,0,1
1,205,81.60,156.0,5000,4,402,12,64,1.8,6.20,...,1,0,0,0,0,0,0,0,0,1
2,169,83.84,156.7,3500,6,409,25,64,2.0,6.30,...,1,0,0,0,0,0,0,0,0,1
3,169,83.84,156.7,3500,4,409,16,64,2.0,6.30,...,1,0,0,0,0,0,0,0,0,1
4,181,77.43,158.6,4000,4,403,20,64,1.8,5.99,...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,167,73.18,152.0,4000,4,401,8,32,1.8,5.50,...,1,0,0,0,0,0,0,0,0,1
115,189,83.99,158.1,3500,6,531,8,256,2.7,6.20,...,1,0,0,0,0,0,0,0,0,1
116,171,70.72,151.8,3000,3,267,5,8,1.2,5.50,...,1,0,0,0,0,0,0,0,0,1
117,159,68.66,149.2,2700,2,282,8,32,1.3,5.20,...,1,0,0,0,0,0,0,0,0,1


In [48]:
x_test=test_new_1[final_var]
x_test=x_test.values

In [49]:
y_test=perceptron.predict(x_test)
y_test

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0])

In [50]:
submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class': y_test})
submission = submission[['PhoneId', 'Class']]
submission.head()

Unnamed: 0,PhoneId,Class
0,3,1
1,11,1
2,13,1
3,16,1
4,19,1


In [51]:
submission['Class']=submission['Class'].map(lambda x: 1 if x==True else 0)
submission.head()

Unnamed: 0,PhoneId,Class
0,3,1
1,11,1
2,13,1
3,16,1
4,19,1


In [52]:
submission.to_csv("submission.csv", index=False)