In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pydataset import data
import wrangle
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from env import get_db_url, host, user, password
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import RFE

### Load the tips dataset.

In [11]:
# Load the 'tips' dataset through pydataset's data() helper.
df = data('tips')

In [12]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.50,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3
241,27.18,2.00,Female,Yes,Sat,Dinner,2
242,22.67,2.00,Male,Yes,Sat,Dinner,2
243,17.82,1.75,Male,No,Sat,Dinner,2


### Create a column named price_per_person. This should be the total bill divided by the party size.

In [15]:
# Bracket access is needed for 'size': df.size is the DataFrame's own
# element-count attribute, not the party-size column.
df['price_per_person'] = df['total_bill'].div(df['size'])

In [16]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.495000
2,10.34,1.66,Male,No,Sun,Dinner,3,3.446667
3,21.01,3.50,Male,No,Sun,Dinner,3,7.003333
4,23.68,3.31,Male,No,Sun,Dinner,2,11.840000
5,24.59,3.61,Female,No,Sun,Dinner,4,6.147500
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,9.676667
241,27.18,2.00,Female,Yes,Sat,Dinner,2,13.590000
242,22.67,2.00,Male,Yes,Sat,Dinner,2,11.335000
243,17.82,1.75,Male,No,Sat,Dinner,2,8.910000


In [17]:
# Keep price_per_person to two decimal places.
df['price_per_person'] = df['price_per_person'].round(decimals=2)

In [18]:
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person
1,16.99,1.01,Female,No,Sun,Dinner,2,8.49
2,10.34,1.66,Male,No,Sun,Dinner,3,3.45
3,21.01,3.50,Male,No,Sun,Dinner,3,7.00
4,23.68,3.31,Male,No,Sun,Dinner,2,11.84
5,24.59,3.61,Female,No,Sun,Dinner,4,6.15
...,...,...,...,...,...,...,...,...
240,29.03,5.92,Male,No,Sat,Dinner,3,9.68
241,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59
242,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34
243,17.82,1.75,Male,No,Sat,Dinner,2,8.91


In [22]:
sex_mapping = {'Female': 0, 'Male': 1}
# Look each value up with itself as the fallback, so values that are already
# encoded as 0/1 pass through unchanged. A plain .map(sex_mapping) would turn
# them into NaN if this cell were run a second time.
df['sex'] = df['sex'].map(lambda label: sex_mapping.get(label, label))

In [30]:
# Drop the remaining text columns so only numeric predictors are left.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['smoker', 'day', 'time'], errors='ignore')

In [31]:
df

Unnamed: 0,total_bill,tip,sex,size,price_per_person
1,16.99,1.01,0,2,8.49
2,10.34,1.66,1,3,3.45
3,21.01,3.50,1,3,7.00
4,23.68,3.31,1,2,11.84
5,24.59,3.61,0,4,6.15
...,...,...,...,...,...
240,29.03,5.92,1,3,9.68
241,27.18,2.00,0,2,13.59
242,22.67,2.00,1,2,11.34
243,17.82,1.75,1,2,8.91


### Before using any of the methods discussed in the lesson, which features do you think would be most important for predicting the tip amount?

size and total_bill

### Use select k best to select the top 2 features for predicting tip amount. What are they?

In [33]:
# Target is the tip; every other column is a candidate predictor.
y = df['tip']
X = df.drop(columns='tip')

k = 2
selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)

# get_support(indices=True) gives the column positions of the kept features.
top_features = X.columns[selector.get_support(indices=True)]

print("Top 2 features for predicting tip amount:", *top_features, sep="\n")

Top 2 features for predicting tip amount:
total_bill
size


### Use recursive feature elimination to select the top 2 features for tip amount. What are they?

In [38]:
X = df[['total_bill', 'sex', 'size', 'price_per_person']]
y = df['tip']

k = 2
# Named rfe_selector rather than rfe: a function called rfe is defined further
# down, and re-running this cell afterwards would otherwise overwrite that
# function with an RFE instance.
rfe_selector = RFE(LinearRegression(), n_features_to_select=k)
rfe_selector.fit(X, y)

# support_ is a boolean mask over X's columns marking the features RFE kept.
top_features = X.columns[rfe_selector.support_]

print("Top 2 features for predicting tip amount:")
for feature in top_features:
    print(feature)

Top 2 features for predicting tip amount:
total_bill
price_per_person


### Why do you think select k best and recursive feature elimination might give different answers for the top features? Does this change as you change the number of features you are selecting?

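- SelectKBest: Each feature is scored against the target on its own (here with `f_regression`), and the k highest-scoring features are kept. The scores do not depend on k, so increasing the number of features to select does not change the ranking; it only lets lower-ranked features into the selection.
- RFE: The estimator is fit on all of the features together, the feature with the weakest coefficient is eliminated, and the process repeats until k features remain. Because the coefficients are estimated jointly (and depend on each feature's scale), correlated features such as total_bill, size and price_per_person influence one another, so RFE can keep a different set than SelectKBest. Changing the number of features to select changes where the elimination stops, so the two methods can agree for some values of k and disagree for others.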

### Write a function named select_kbest that takes in the predictors (X), the target (y), and the number of features to select (k) and returns the names of the top k selected features based on the SelectKBest class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [40]:
def select_kbest(X, y, k):
    """Return the names of the top k features chosen by SelectKBest.

    Each column of X is scored against y with ``f_regression`` and the
    k best-scoring columns are reported.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; must expose ``.columns`` so names can be returned.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to select (passed straight to SelectKBest).

    Returns
    -------
    list
        Names of the selected columns, in the order they appear in X.
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    # Fitting alone determines which columns are kept; the transformed
    # matrix is not needed to report feature names.
    selector.fit(X, y)

    # get_support() is a boolean mask aligned with X.columns.
    return X.columns[selector.get_support()].tolist()

In [42]:
# Same predictor/target split as the manual SelectKBest run.
y = df['tip']
X = df.drop(columns='tip')

selected_features = select_kbest(X, y, k=2)

print("Top 2 features based on SelectKBest:", *selected_features, sep="\n")

Top 2 features based on SelectKBest:
total_bill
size


### Write a function named rfe that takes in the predictors, the target, and the number of features to select. It should return the top k features based on the RFE class. Test your function with the tips dataset. You should see the same results as when you did the process manually.

In [44]:
def rfe(X, y, k):
    """Return the names of the top k features chosen by RFE.

    Runs scikit-learn's RFE around a LinearRegression estimator and reports
    which columns of X it keeps.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; must expose ``.columns`` so names can be returned.
    y : array-like
        Target values, one per row of X.
    k : int
        Number of features to keep (passed as ``n_features_to_select``).

    Returns
    -------
    list
        Names of the selected columns, in the order they appear in X.
    """
    selector = RFE(LinearRegression(), n_features_to_select=k)
    # Fitting alone determines which columns survive; the reduced matrix
    # is not needed to report feature names.
    selector.fit(X, y)

    # get_support() is a boolean mask aligned with X.columns.
    return X.columns[selector.get_support()].tolist()

In [45]:
# Same predictor/target split as the manual RFE run.
y = df['tip']
X = df.drop(columns='tip')

selected_features = rfe(X, y, k=2)

print("Top 2 features based on RFE:", *selected_features, sep="\n")

Top 2 features based on RFE:
total_bill
price_per_person


### Load the swiss dataset and use all the other features to predict Fertility. Find the top 3 features using both select k best and recursive feature elimination (use the functions you just built to help you out).

In [46]:
# Rebinds df to the 'swiss' dataset; the tips frame is no longer reachable
# through df after this cell runs.
df = data('swiss')

In [47]:
df

Unnamed: 0,Fertility,Agriculture,Examination,Education,Catholic,Infant.Mortality
Courtelary,80.2,17.0,15,12,9.96,22.2
Delemont,83.1,45.1,6,9,84.84,22.2
Franches-Mnt,92.5,39.7,5,5,93.4,20.2
Moutier,85.8,36.5,12,7,33.77,20.3
Neuveville,76.9,43.5,17,15,5.16,20.6
Porrentruy,76.1,35.3,9,7,90.57,26.6
Broye,83.8,70.2,16,7,92.85,23.6
Glane,92.4,67.8,14,8,97.16,24.9
Gruyere,82.4,53.3,12,7,97.67,21.0
Sarine,82.9,45.2,16,13,91.38,24.4


In [52]:
# Fertility is the target; every remaining column is a candidate predictor.
y = df['Fertility']
X = df.drop(columns='Fertility')

selected_features = select_kbest(X, y, k=3)

print("Top 3 features based on SelectKBest:", *selected_features, sep="\n")

Top 3 features based on SelectKBest:
Examination
Education
Catholic


In [51]:
# Fertility is the target; every remaining column is a candidate predictor.
y = df['Fertility']
X = df.drop(columns='Fertility')

selected_features = rfe(X, y, k=3)

print("Top 3 features based on RFE:", *selected_features, sep="\n")

Top 3 features based on RFE:
Examination
Education
Infant.Mortality
