### Understanding how feature selection can improve model performance

In [4]:
#import necessary libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [5]:
#reading the file
df=pd.read_csv('train.csv')

In [6]:
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [7]:
#price range is the output variable, checking its distribution
df['price_range'].value_counts()

3    500
2    500
1    500
0    500
Name: price_range, dtype: int64

In [8]:
#checking to see if any values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [9]:
#creating x, y
x=df.drop(['price_range'],1)
y=df['price_range']

In [10]:
#train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Logistic Regression

In [8]:
#using a basic logistic regression
logr=LogisticRegression()

logr.fit(x_train, y_train)

pred=logr.predict(x_test)

accuracy_score(y_test, pred)

#feature selection using selectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(x,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

featureScores.sort_values(by='Score',ascending=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.6325

### Random Forest

In [11]:
#random forest classifier to check the accuracy 
rf=RandomForestClassifier()

rf.fit(x_train, y_train)

pred=rf.predict(x_test)

accuracy_score(y_test, pred)

0.885

RF returns an accuracy score of 0.885

In [13]:
#we can check the importance of a feature using the below logic 
rf.feature_importances_

#creating a dataframe for better visualization
l1=pd.DataFrame(zip(x_train.columns, rf.feature_importances_))

l1.sort_values(1, ascending=False)

Unnamed: 0,0,1
13,ram,0.47672
0,battery_power,0.075662
11,px_height,0.059284
12,px_width,0.056726
8,mobile_wt,0.038337
6,int_memory,0.037674
10,pc,0.030869
16,talk_time,0.030405
14,sc_h,0.028904
2,clock_speed,0.027774


In [14]:
#considering top 6 features with higher importance and checking if the performance improves 
df2=df[['ram','px_height','battery_power','px_width','mobile_wt','int_memory','price_range']]

In [15]:
#creating x,y from the new dataframe
x2=df2.drop(['price_range'],1)
y2=df2['price_range']

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.2, random_state=42)

In [16]:
#using RF on the newly created DataFrame, the accuracy improves to 0.917
rf.fit(x2_train, y2_train)

pred2=rf.predict(x2_test)

accuracy_score(y2_test, pred2)

0.9175

Selecting lesser features with higher importance has improved the performance to 0.917