In [2]:
import seaborn as sns

In [65]:
# Load seaborn's example "tips" dataset.
data = sns.load_dataset(name='tips')

In [66]:
# Preview the first five rows to check the columns and value formats.
data.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Automate the process below
#### Handling missing values
#### Encoding categorical features
#### Handling outliers
#### Feature scaling

In [6]:
# Count the missing values in each column.
data.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [16]:
# Print a summary of the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [67]:
# Show only the columns whose dtype is 'category'.
data.dtypes.loc[lambda dtypes: dtypes == 'category']

sex       category
smoker    category
day       category
time      category
dtype: object

In [68]:
# Show only the numeric columns (float64 or int64 dtype).
data.dtypes.loc[lambda dtypes: (dtypes == 'float64') | (dtypes == 'int64')]

total_bill    float64
tip           float64
size            int64
dtype: object

## Here we treat `total_bill` as the target variable

In [69]:
## Independent features (X) and dependent target (y)
y = data['total_bill']
X = data.drop(columns='total_bill')

In [70]:
from sklearn.model_selection import train_test_split

In [77]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [78]:
from sklearn.impute import SimpleImputer ## handling the missing values numerical data
from sklearn.preprocessing import OneHotEncoder ## handling catgrforial value in to numerical value
from sklearn.preprocessing import StandardScaler ## feature scalling

## Create pipeline 

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [80]:
## Separate the numerical and categorical feature columns
numeric_col = ['tip', 'size']
category_col = ['sex', 'smoker', 'day', 'time']

## Feature engineering automation

In [81]:
# Numerical pipeline: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # handle missing values
        ('scaler', StandardScaler()),  # feature scaling
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode.
cate_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),  # handle missing values
        # handle_unknown='ignore' encodes a category that was not seen during fit
        # as all zeros instead of raising when new data is transformed.
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [82]:
## Column transformer: a wrapper that routes each column group
## to its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numeric_col),
        ('cate_pipeline', cate_pipeline, category_col),
    ]
)

In [83]:
## Fit the preprocessor on the training features and transform them.
## NOTE: this rebinds X_train to the transformed output, so re-running this cell
## without first re-running the train/test split no longer receives the original DataFrame.
X_train=preprocessor.fit_transform(X_train)

In [84]:
## Transform the test features with the already-fitted preprocessor (transform only, no fit call here).
X_test=preprocessor.transform(X_test)

In [111]:
## import machine learning models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNetCV
from sklearn.svm import SVR


In [112]:
## Automate model training: candidate regressors keyed by display name
model = {
    # Fixed seed so the forest's random sampling, and therefore its score,
    # is reproducible between runs (same seed as the train/test split).
    'random forest': RandomForestRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Ridge cv': RidgeCV(cv=5),
    'Lasso cv': LassoCV(cv=5),
    'ElasticNet cv': ElasticNetCV(cv=5),
    'svm': SVR(),
}

In [90]:
from sklearn.metrics import r2_score

In [109]:
## Function to evaluate models
def evauatemodel(X_train, X_test, y_train, y_test, models):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed to each model's ``fit``.
    X_test, y_test
        Test features and targets; predictions for ``X_test`` are compared
        against ``y_test``.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.

    Returns
    -------
    dict
        Maps each model name to the ``r2_score`` of its test predictions,
        in the iteration order of ``models``.

    Notes
    -----
    The estimators held in ``models`` are fitted in place.
    """
    report = {}
    # Iterate name/estimator pairs directly instead of rebuilding the key and
    # value lists on every pass.
    for name, estimator in models.items():
        estimator.fit(X_train, y_train)
        y_predict = estimator.predict(X_test)
        report[name] = r2_score(y_test, y_predict)
    return report

In [113]:
# Score every candidate regressor on the held-out test set.
evauatemodel(X_train, X_test, y_train, y_test, models=model)

{'random forest': 0.4138152381403115,
 'LinearRegression': 0.5855595093805562,
 'Ridge': 0.5863835781278818,
 'Lasso': 0.554601970474276,
 'ElasticNet': 0.5201760503775457,
 'Ridge cv': 0.5882893842397763,
 'Lasso cv': 0.5868154720567639,
 'ElasticNet cv': 0.5905366755312005,
 'svm': 0.42046690984229396}

In [114]:
from sklearn.model_selection import RandomizedSearchCV

In [117]:
# Search space for the ElasticNetCV hyperparameters.
parameter = {
    # l1_ratio mixes the L1 and L2 penalties and must lie in [0, 1]
    # (1 is a pure L1 / Lasso penalty); values above 1 are invalid.
    "l1_ratio": [0.1, 0.5, 0.9, 1],
    "eps": [0.001, 0.01, 0.10, 0.0001],
    "cv": [3, 4, 5],
    "max_iter": [1000, 1100, 1200],
}
    

In [116]:
# Base ElasticNetCV estimator with default settings, used as the estimator for the randomized search.
elasticcv=ElasticNetCV()

In [118]:
# Randomized search over `parameter`, ranking candidates by negative mean squared error.
# random_state fixes which parameter combinations are sampled, so the search
# (and its best_params_) is reproducible between runs.
rcv = RandomizedSearchCV(
    estimator=elasticcv,
    param_distributions=parameter,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbose=3,
)

In [None]:
# Run the randomized hyperparameter search on the training data.
rcv.fit(X_train,y_train)

In [120]:
# Show the parameter combination the search selected as best.
rcv.best_params_

{'max_iter': 1100, 'l1_ratio': 0.5, 'eps': 0.0001, 'cv': 4}

In [121]:
# Build the final model from the parameters the search selected, rather than
# hand-copying them, so this cell always matches the search result.
elastic = ElasticNetCV(**rcv.best_params_)

In [122]:
# Fit the final ElasticNetCV model on the training data.
elastic.fit(X_train,y_train)

ElasticNetCV(cv=4, eps=0.0001, max_iter=1100)

In [123]:
# Predict the target for the test features with the fitted model.
y_pred=elastic.predict(X_test)

In [124]:
# R^2 of the final model's predictions against the true test targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.5908021892072525