In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import pydataset
from sklearn.feature_selection import f_regression, SelectKBest, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

## 1) Load the tips dataset.

In [2]:
tips = pydataset.data('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


### - Create a column named price_per_person. This should be the total bill divided by the party size.

In [3]:
tips['price_per_person'] = tips.total_bill / tips.size

In [4]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,0.009947
2,10.34,1.66,Male,No,Sun,Dinner,3,0.006054
3,21.01,3.50,Male,No,Sun,Dinner,3,0.012301
4,23.68,3.31,Male,No,Sun,Dinner,2,0.013864
5,24.59,3.61,Female,No,Sun,Dinner,4,0.014397
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,0.016996
241,27.18,2.00,Female,Yes,Sat,Dinner,2,0.015913
242,22.67,2.00,Male,Yes,Sat,Dinner,2,0.013273
243,17.82,1.75,Male,No,Sat,Dinner,2,0.010433


### - Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

- i think the best features are size and the total bill and mayyyybeeee the time of day ?

### - Use select k best to select the top 2 features for predicting tip amount. What are they?

In [7]:
X_train = tips[['total_bill', 'size','price_per_person']]
y_train = tips.tip


In [8]:
kbest = SelectKBest(f_regression, k=2)
kbest.fit(X_train, y_train)

SelectKBest(k=2, score_func=<function f_regression at 0x14d8abdc0>)

In [9]:
kbest_results = pd.DataFrame(dict(p=kbest.pvalues_, f=kbest.scores_), index=X_train.columns)
kbest_results

Unnamed: 0,p,f
total_bill,6.692471e-34,203.357723
size,4.300543e-16,76.175426
price_per_person,6.692471e-34,203.357723


In [10]:
X_train.columns[kbest.get_support()]

Index(['total_bill', 'price_per_person'], dtype='object')

In [11]:
X_train_transformed = pd.DataFrame(
    kbest.transform(X_train),
    index=X_train.index,
    columns=X_train.columns[kbest.get_support()]
)
X_train_transformed.head()

Unnamed: 0,total_bill,price_per_person
1,16.99,0.009947
2,10.34,0.006054
3,21.01,0.012301
4,23.68,0.013864
5,24.59,0.014397


### - Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
model = LinearRegression()
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X_train, y_train)

RFE(estimator=LinearRegression(), n_features_to_select=2)

In [14]:
pd.DataFrame({'rfe_ranking': rfe.ranking_}, index=X_train.columns)

Unnamed: 0,rfe_ranking
total_bill,1
size,1
price_per_person,2


In [15]:
X_train.columns[rfe.get_support()]

Index(['total_bill', 'size'], dtype='object')

In [16]:
X_train_transformed = pd.DataFrame(
    rfe.transform(X_train),
    index=X_train.index,
    columns=X_train.columns[rfe.support_]
)
X_train_transformed.head()

Unnamed: 0,total_bill,size
1,16.99,2.0
2,10.34,3.0
3,21.01,3.0
4,23.68,2.0
5,24.59,4.0


### - Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features your are selecting?

The features from SelectKBest are being chosen based on correlation with the target variable, while the features from RFE are selected based on the performance of a linear regression model. These two approaches are different, which is why they may produce different results.

## 2)Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [24]:
def select_kbest(X, y, k):
    kbest = SelectKBest(k=k)
    kbest.fit(X, y)
    mask = kbest.get_support()
    return X.columns[mask]

In [None]:
#Define the SelectKBest function
def select_kbest(X, y, k):
    
    f_selector = SelectKBest(f_regression, k=k)
    
    f_selector.fit(X, y)
    
    mask = f_selector.get_support()
    
    return list(X.columns[mask])

In [None]:
#Verify it works
select_kbest(train_X, train_y, 5)

## 3)Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [26]:
def rfe(X, y, k, model=LinearRegression()):
    rfe = RFE(model, n_features_to_select=k)
    rfe.fit(X, y)
    mask = rfe.get_support()
    return X.columns[mask]

## 4) Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [18]:
swiss = pydataset.data('swiss')
swiss.head()

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6


In [20]:
swiss.size

282

In [22]:
train_validate, test = train_test_split(swiss, random_state=123, test_size=.2)
train, validate = train_test_split(train_validate, random_state=123, test_size=.2)

train.size , validate.size, test.size

(174, 48, 60)

In [23]:
X_train = train.drop(columns='Fertility')
y_train = train.Fertility

In [25]:
select_kbest(X_train, y_train, 3)

  msw = sswn / float(dfwn)


Index(['Education', 'Catholic', 'Infant.Mortality'], dtype='object')

In [27]:
rfe(X_train, y_train, 3, model=LinearRegression())

Index(['Examination', 'Education', 'Infant.Mortality'], dtype='object')