# Naive-Bayes Algorithm

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris

In [6]:
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# model intialize
gnb = GaussianNB()

gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

print(f'Akkurasy: \n {accuracy_score(y_test, y_pred)}')
print(f'confusion_matrix: \n {confusion_matrix(y_test, y_pred)}')
print(f'classification_report: \n {classification_report(y_test, y_pred)}')

Akkurasy: 
 0.9777777777777777
confusion_matrix: 
 [[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      0.92      0.96        13
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



# Use Multinomial NB

In [12]:
# model intialize
mnb = MultinomialNB()

mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

print(f'Akkurasy: \n {accuracy_score(y_test, y_pred)}')
print(f'confusion_matrix: \n {confusion_matrix(y_test, y_pred)}')
print(f'classification_report: \n {classification_report(y_test, y_pred)}')

Akkurasy: 
 0.9555555555555556
confusion_matrix: 
 [[19  0  0]
 [ 0 12  1]
 [ 0  1 12]]
classification_report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.92      0.92      0.92        13
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.95      0.95      0.95        45
weighted avg       0.96      0.96      0.96        45



# Use Bernouli NB

In [14]:
bnb = BernoulliNB()

bnb.fit(X_train, y_train)
y_pred = bnb.predict(X_test)

print(f'Akkurasy: \n {accuracy_score(y_test, y_pred)}')
print(f'confusion_matrix: \n {confusion_matrix(y_test, y_pred)}')
print(f'classification_report: \n {classification_report(y_test, y_pred)}')



Akkurasy: 
 0.28888888888888886
confusion_matrix: 
 [[ 0 19  0]
 [ 0 13  0]
 [ 0 13  0]]
classification_report: 
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.29      1.00      0.45        13
           2       0.00      0.00      0.00        13

    accuracy                           0.29        45
   macro avg       0.10      0.33      0.15        45
weighted avg       0.08      0.29      0.13        45



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Selekting the best model with Best Hyperparameters

In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.datasets import load_iris
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [52]:
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [53]:
df.time.unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [54]:
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [55]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Tasks

In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [57]:
X = df.drop('tip', axis=1)
y = df.tip

from sklearn.preprocessing import LabelEncoder
# Assuming df is your DataFrame
le = LabelEncoder()
for k in df.columns:
    if df[k].dtype == 'category':
        df[k] = le.fit_transform(df[k])

In [58]:
X = df.drop('tip', axis=1)
y = df.tip

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    int32  
 3   smoker      244 non-null    int32  
 4   day         244 non-null    int32  
 5   time        244 non-null    int32  
 6   size        244 non-null    int64  
dtypes: float64(2), int32(4), int64(1)
memory usage: 9.7 KB


In [60]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [61]:
df['smoker'].unique()

array([0, 1])

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [63]:
# kreate a dik of moidels to evaluate performanse
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForesRegressor' : RandomForestRegressor(),
    'KNeighborsRegressor':   KNeighborsRegressor(),
    'GradientBoosgtingRegressor': GradientBoostingRegressor(),
    'XGBRegressor': XGBRegressor()
}

#train and predikt each model with evaluation metriks as well using a for loop.
model_scores = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))
    
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f'{model[0]} is {model[1]: 0.2f}')

Mean Absolute error for SVR is  0.63
Mean Absolute error for LinearRegression is  0.72
Mean Absolute error for RandomForesRegressor is  0.76
Mean Absolute error for KNeighborsRegressor is  0.77
Mean Absolute error for XGBRegressor is  0.77
Mean Absolute error for GradientBoosgtingRegressor is  0.78
Mean Absolute error for DecisionTreeRegressor is  0.98
