In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [13]:
X = df.drop('tip',axis=1)
y = df['tip']

for cols in X.columns:
    if X[cols].dtype=='object' or X[cols].dtype.name=='category':
        X[cols] = LabelEncoder().fit_transform(X[cols])
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,0,0,2,0,2
1,10.34,1,0,2,0,3
2,21.01,1,0,2,0,3
3,23.68,1,0,2,0,2
4,24.59,0,0,2,0,4


In [14]:
X.dtypes

total_bill    float64
sex             int64
smoker          int64
day             int64
time            int64
size            int64
dtype: object

In [15]:
X_test,X_train,y_test,y_train = train_test_split(X,y,random_state=42,test_size=0.2)


In [17]:
models={
    'LinearRegression':LinearRegression(),
    'RandomForestRegressor':RandomForestRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'KNeighborsRegressor':KNeighborsRegressor(),
    'SVR':SVR(),
    'GradientBoostingRegressor':GradientBoostingRegressor(),
    'XGBRegressor':XGBRegressor()
}

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

sorted_models = sorted(model_scores,key=lambda x:x[1],reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}")


Mean Absolute error for LinearRegression is  0.83
Mean Absolute error for SVR is  0.83
Mean Absolute error for KNeighborsRegressor is  0.83
Mean Absolute error for RandomForestRegressor is  0.85
Mean Absolute error for GradientBoostingRegressor is  0.92
Mean Absolute error for XGBRegressor is  0.94
Mean Absolute error for DecisionTreeRegressor is  1.11


In [18]:
model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = model.predict(X_test)
    metric = r2_score(y_test, y_pred)
    model_scores.append((name, metric))
    
    # # print the performing metric
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'R2: ', r2_score(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print('\n')
# selecting the best model from all above models with evaluation metrics sorting method
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=True)
for model in sorted_models:
    print('R_squared Score', f"{model[0]} is {model[1]: .2f}") 

R_squared Score LinearRegression is  0.36
R_squared Score SVR is  0.32
R_squared Score KNeighborsRegressor is  0.31
R_squared Score RandomForestRegressor is  0.28
R_squared Score GradientBoostingRegressor is  0.21
R_squared Score XGBRegressor is  0.18
R_squared Score DecisionTreeRegressor is -0.03


## **Hyper-Parameter Tuning**

In [35]:
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [36]:

for cols in df.columns:
    if df[cols].dtype=='object' or df[cols].dtype.name=='category':
        if df[cols].nunique()==2:
            df[cols] = LabelEncoder().fit_transform(df[cols])
        
df = pd.get_dummies(df, drop_first=True,dtype='int')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,0,2,0,0,1
1,10.34,1.66,1,0,0,3,0,0,1
2,21.01,3.5,1,0,0,3,0,0,1
3,23.68,3.31,1,0,0,2,0,0,1
4,24.59,3.61,0,0,0,4,0,0,1


In [37]:
df

Unnamed: 0,total_bill,tip,sex,smoker,time,size,day_Fri,day_Sat,day_Sun
0,16.99,1.01,0,0,0,2,0,0,1
1,10.34,1.66,1,0,0,3,0,0,1
2,21.01,3.50,1,0,0,3,0,0,1
3,23.68,3.31,1,0,0,2,0,0,1
4,24.59,3.61,0,0,0,4,0,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,0,3,0,1,0
240,27.18,2.00,0,1,0,2,0,1,0
241,22.67,2.00,1,1,0,2,0,1,0
242,17.82,1.75,1,0,0,2,0,1,0


In [38]:
X = df.drop('tip',axis=1)
y = df['tip']

In [39]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.2,random_state=42)

In [41]:
from math import sqrt

In [43]:
models={
    'LinearRegression':(LinearRegression(),{}),
    'RandomForestRegressor': (RandomForestRegressor(), {'n_estimators': [10, 100]}),
    'DecisionTreeRegressor':(DecisionTreeRegressor(),{'max_depth':[None,5,10]}),
    'KNeighborsRegressor':(KNeighborsRegressor(),{'n_neighbors':np.arange(3,100,2)}),
    'SVR':(SVR(),{'kernel':['rbf','sigmoid','poly']}),
    'GradientBoostingRegressor': (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
    'XGBRegressor': (XGBRegressor(), {'n_estimators': [10, 100]})
}

for name,(model,params) in models.items():
    pipeline = GridSearchCV(model,params,cv=10)
    pipeline.fit(X_train,y_train)
    y_pred = pipeline.predict(X_test)

    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'RMSE: ', sqrt(mean_squared_error(y_test, y_pred)))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  0.9950773879667745
LinearRegression RMSE:  0.9975356574913874
LinearRegression R2:  0.4472602084905919
LinearRegression MAE:  0.7515437172807755


RandomForestRegressor MSE:  1.1441801275510206
RandomForestRegressor RMSE:  1.069663558111157
RandomForestRegressor R2:  0.3644374871747402
RandomForestRegressor MAE:  0.8027275510204083


DecisionTreeRegressor MSE:  1.6369408767006803
DecisionTreeRegressor RMSE:  1.2794299030039435
DecisionTreeRegressor R2:  0.09072161638651066
DecisionTreeRegressor MAE:  0.8925697278911565




Traceback (most recent call last):
  File "c:\Users\Adnan\Desktop\AIML Course\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 942, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "c:\Users\Adnan\Desktop\AIML Course\.venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 492, in __call__
    return estimator.score(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "c:\Users\Adnan\Desktop\AIML Course\.venv\Lib\site-packages\sklearn\base.py", line 636, in score
    y_pred = self.predict(X)
  File "c:\Users\Adnan\Desktop\AIML Course\.venv\Lib\site-packages\sklearn\neighbors\_regression.py", line 243, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
  File "c:\Users\Adnan\Desktop\AIML Course\.venv\Lib\site-packages\sklearn\neighbors\_base.py", line 854, in kneighbors
    raise ValueError(
    ...<3 lines>...
    )
ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 45, n_samples_fit

KNeighborsRegressor MSE:  1.2204888809242702
KNeighborsRegressor RMSE:  1.1047573855486417
KNeighborsRegressor R2:  0.3220499453212806
KNeighborsRegressor MAE:  0.7626205936920224


SVR MSE:  1.201410326445363
SVR RMSE:  1.0960886489902917
SVR R2:  0.3326475896376886
SVR MAE:  0.7243213016135414


GradientBoostingRegressor MSE:  1.1802487050352917
GradientBoostingRegressor RMSE:  1.0863925188601455
GradientBoostingRegressor R2:  0.34440232383992375
GradientBoostingRegressor MAE:  0.782977499617087


XGBRegressor MSE:  1.3011571958974995
XGBRegressor RMSE:  1.1406827761904268
XGBRegressor R2:  0.2772407795831012
XGBRegressor MAE:  0.7980550469670977




In [44]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# dont show warnings
import warnings
warnings.filterwarnings('ignore')

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Perform k-fold cross-validation and calculate the mean accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print("Classifier:", name)
    print("Mean Accuracy:", accuracy)
    print()

Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree
Mean Accuracy: 0.9533333333333335

Classifier: Random Forest
Mean Accuracy: 0.9600000000000002

Classifier: SVM
Mean Accuracy: 0.9666666666666668

Classifier: KNN
Mean Accuracy: 0.9733333333333334

