In [1]:
import pandas as pd
import numpy as np

In [3]:
num_items = 41
data = {
    'City Population': np.random.randint(10000, 1000000, num_items),
    'Continent': np.random.choice(['Asia', 'Europe', 'North America', 'South America'], num_items),
    'Venue Capacity': np.random.randint(500, 20000, num_items),
    'Day Of Week': np.random.choice(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'], num_items),
    'Multiple Concerts': np.random.randint(0, 2, num_items),
    'Sold Out': np.random.randint(0, 2, num_items)
}

df = pd.DataFrame(data)

In [35]:
df.head(3)

Unnamed: 0,City Population,Continent,Venue Capacity,Day Of Week,Multiple Concerts,Sold Out
0,511389,North America,10642,Wednesday,0,1
1,599638,North America,7779,Thursday,1,1
2,367245,Asia,19386,Saturday,0,1


In [9]:
df2 = pd.get_dummies(df[['Continent', 'Day Of Week']])

In [36]:
df2.head(3)

Unnamed: 0,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day Of Week_Friday,Day Of Week_Monday,Day Of Week_Saturday,Day Of Week_Sunday,Day Of Week_Thursday,Day Of Week_Tuesday,Day Of Week_Wednesday
0,False,False,True,False,False,False,False,False,False,False,True
1,False,False,True,False,False,False,False,False,True,False,False
2,True,False,False,False,False,False,True,False,False,False,False


In [11]:
df3 = pd.concat([df,df2], axis=1)

In [37]:
df3.head(3)

Unnamed: 0,City Population,Continent,Venue Capacity,Day Of Week,Multiple Concerts,Sold Out,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day Of Week_Friday,Day Of Week_Monday,Day Of Week_Saturday,Day Of Week_Sunday,Day Of Week_Thursday,Day Of Week_Tuesday,Day Of Week_Wednesday
0,511389,North America,10642,Wednesday,0,1,False,False,True,False,False,False,False,False,False,False,True
1,599638,North America,7779,Thursday,1,1,False,False,True,False,False,False,False,False,True,False,False
2,367245,Asia,19386,Saturday,0,1,True,False,False,False,False,False,True,False,False,False,False


In [13]:
df4 = df3.drop(['Continent', 'Day Of Week'], axis=1)

In [38]:
df4.head(3)

Unnamed: 0,City Population,Venue Capacity,Multiple Concerts,Sold Out,Continent_Asia,Continent_Europe,Continent_North America,Continent_South America,Day Of Week_Friday,Day Of Week_Monday,Day Of Week_Saturday,Day Of Week_Sunday,Day Of Week_Thursday,Day Of Week_Tuesday,Day Of Week_Wednesday
0,511389,10642,0,1,False,False,True,False,False,False,False,False,False,False,True
1,599638,7779,1,1,False,False,True,False,False,False,False,False,True,False,False
2,367245,19386,0,1,True,False,False,False,False,False,True,False,False,False,False


In [15]:
X = df4.drop(['Sold Out'], axis=1)

In [16]:
y = df4['Sold Out']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [19]:
from sklearn.naive_bayes import GaussianNB

In [20]:
gnb = GaussianNB()

In [21]:
gnb.fit(X_train, y_train)

In [22]:
y_pred = gnb.predict(X_test)

In [24]:
from sklearn.metrics import classification_report

In [25]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.33      0.25      0.29         4
           1       0.50      0.60      0.55         5

    accuracy                           0.44         9
   macro avg       0.42      0.42      0.42         9
weighted avg       0.43      0.44      0.43         9



In [26]:
gnb.score(X_train, y_train)

0.65625

In [27]:
gnb.score(X_test, y_test)

0.4444444444444444

# add in parameter

In [28]:
param_grid = {
    'var_smoothing': [0.00000001, 0.00000001, 0.00000001]
}

In [29]:
from sklearn.model_selection import GridSearchCV

In [31]:
grid_search = GridSearchCV(gnb, param_grid,cv=5, scoring='accuracy', n_jobs=-1)

In [32]:
grid_search.fit(X_train, y_train)

In [33]:
grid_search.best_params_

{'var_smoothing': 1e-08}

In [34]:
grid_search.best_score_

np.float64(0.5333333333333333)