## Importing the essential libraries over here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing the dataset over here

In [3]:
# Load the training CSV (path is relative to the notebook's working directory).
data = pd.read_csv("train.csv")

In [4]:
# Preview the first five rows to sanity-check that the file parsed as expected.
data.head(5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


## Checking for duplicate observations over here

In [5]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
data.duplicated(keep="first").sum()

0

## Checking for missing values over here

In [6]:
# Per-column count of missing entries.
data.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

## Filtering all the numerical features over here

In [7]:
# A column counts as numerical when its dtype is anything other than object ('O').
numerical_features = [
    name for name, dtype in data.dtypes.items() if dtype != 'O'
]
# Print one column name per line.
for feature in numerical_features:
    print(feature)

battery_power
blue
clock_speed
dual_sim
fc
four_g
int_memory
m_dep
mobile_wt
n_cores
pc
px_height
px_width
ram
sc_h
sc_w
talk_time
three_g
touch_screen
wifi
price_range


In [8]:
# Display only the columns identified as numerical.
data.loc[:, numerical_features]

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


## Filtering all the categorical features over here

In [10]:
# A column counts as categorical when its dtype is object ('O').
cat_features = [
    name for name, dtype in data.dtypes.items() if dtype == 'O'
]
# Print one column name per line; nothing is printed when the list is empty.
for feature in cat_features:
    print(feature)

## Creating features and labels over here

In [11]:
# Select the label by name instead of by position: relying on "the last column"
# would silently put the label into X (or a feature into y) if the CSV's column
# order ever changed or a column were appended.
target_column = "price_range"

# X: every column except the target, as a 2-D NumPy array.
X = data.drop(columns=target_column).to_numpy()
# y: the target column, as a 1-D NumPy array.
y = data[target_column].to_numpy()

## Splitting the dataset into a training set and a testing set so that overfitting can be detected on held-out data

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the model on the training dataset over here

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees, entropy as the split criterion, and a fixed seed.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
# Kept as a separate final statement so the notebook still shows the fitted estimator.
classifier.fit(X_train, y_train)

## Evaluating the performance on the testing set over here

In [14]:
# Predict a class for every held-out sample.
y_pred = classifier.predict(X_test)
np.set_printoptions(precision=2)
# Two columns, one row per test sample: predicted class on the left, actual class on the right.
print(np.column_stack((y_pred, y_test)))

[[3 3]
 [0 0]
 [2 2]
 [2 2]
 [2 2]
 [0 0]
 [0 0]
 [3 3]
 [3 3]
 [1 1]
 [0 1]
 [3 3]
 [0 0]
 [1 2]
 [3 3]
 [0 0]
 [3 3]
 [2 2]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [3 3]
 [1 1]
 [1 2]
 [2 2]
 [3 3]
 [1 1]
 [3 3]
 [0 1]
 [0 1]
 [0 0]
 [1 2]
 [0 0]
 [2 1]
 [3 3]
 [0 0]
 [0 0]
 [2 3]
 [3 3]
 [2 3]
 [1 1]
 [3 3]
 [3 3]
 [1 1]
 [3 3]
 [0 0]
 [1 1]
 [3 3]
 [1 1]
 [1 1]
 [2 3]
 [0 0]
 [3 3]
 [0 0]
 [2 3]
 [2 2]
 [1 2]
 [0 0]
 [3 3]
 [3 3]
 [1 1]
 [2 3]
 [2 2]
 [1 1]
 [2 2]
 [3 3]
 [3 2]
 [1 2]
 [2 2]
 [3 3]
 [2 2]
 [1 1]
 [0 0]
 [1 1]
 [3 3]
 [2 2]
 [2 2]
 [2 1]
 [1 2]
 [3 3]
 [3 3]
 [3 3]
 [0 0]
 [0 0]
 [0 0]
 [2 2]
 [0 1]
 [2 2]
 [3 3]
 [1 1]
 [3 2]
 [2 2]
 [0 1]
 [0 0]
 [2 3]
 [3 3]
 [3 3]
 [0 0]
 [3 3]
 [2 1]
 [1 1]
 [2 2]
 [1 1]
 [3 3]
 [2 2]
 [2 2]
 [3 3]
 [2 2]
 [3 3]
 [2 3]
 [0 0]
 [0 0]
 [1 1]
 [2 3]
 [3 3]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [3 3]
 [2 2]
 [2 2]
 [1 1]
 [2 1]
 [1 1]
 [1 1]
 [0 0]
 [2 2]
 [1 1]
 [3 3]
 [3 2]
 [3 3]
 [3 3]
 [3 3]
 [3 3]
 [1 2]
 [0 0]
 [2 1]
 [1 1]
 [2 2]
 [2 1]

## Evaluating the performance of the classification model that we have built over here

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Left as the final expression so the notebook displays the accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[91  4  0  0]
 [11 65 15  1]
 [ 1 21 68  9]
 [ 0  0 15 99]]


0.8075