In [1]:
import pandas as pd

In [2]:
# Load the beer table and discard the 'Unnamed: 0' column
# (presumably a row index saved into the CSV -- confirm against the file).
df = pd.read_csv('./data/beers.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Number of rows in the loaded table.
len(df)

2410

In [4]:
# Preview the first rows to check the columns and spot missing values.
df.head()

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [5]:
# Summary statistics of the numeric columns; the `count` row reveals how many
# values are non-null per column.
df.describe()

Unnamed: 0,abv,ibu,id,brewery_id,ounces
count,2348.0,1405.0,2410.0,2410.0,2410.0
mean,0.059773,42.713167,1431.113278,231.749793,13.592241
std,0.013542,25.954066,752.459975,157.685604,2.352204
min,0.001,4.0,1.0,0.0,8.4
25%,0.05,21.0,808.25,93.0,12.0
50%,0.056,35.0,1453.5,205.0,12.0
75%,0.067,64.0,2075.75,366.0,16.0
max,0.128,138.0,2692.0,557.0,32.0


In [6]:
# Count the beers that have no IBU value. A vectorized null check replaces the
# row-by-row iterrows() loop; int() keeps the result a plain Python integer,
# as the original counter was.
ibu_nan_count = int(df['ibu'].isnull().sum())
ibu_nan_count

1005

In [7]:
# Fraction of all rows whose `ibu` value is missing.
ibu_nan_count/len(df)

0.4170124481327801

In [8]:
# Number of distinct beer names; dropna=False counts a missing name (if any)
# as its own value, the same way unique() would.
df['name'].nunique(dropna=False)

2305

In [9]:
# Keep only the rows that are complete in every column.
df_dropna = df.dropna(axis='index', how='any')

In [10]:
# Preview the complete rows; the original row labels are retained.
df_dropna.head()

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
14,0.061,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0
21,0.099,92.0,1036,Lower De Boom,American Barleywine,368,8.4
22,0.079,45.0,1024,Fireside Chat,Winter Warmer,368,12.0
24,0.044,42.0,876,Bitter American,American Pale Ale (APA),368,12.0
25,0.049,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0


In [11]:
# Summary statistics after dropping incomplete rows, for comparison with the
# statistics of the full table.
df_dropna.describe()

Unnamed: 0,abv,ibu,id,brewery_id,ounces
count,1403.0,1403.0,1403.0,1403.0,1403.0
mean,0.059919,42.739843,1413.88881,223.375624,13.510264
std,0.013585,25.962692,757.572191,150.38751,2.254112
min,0.027,4.0,1.0,0.0,8.4
25%,0.05,21.0,771.0,95.5,12.0
50%,0.057,35.0,1435.0,198.0,12.0
75%,0.068,64.0,2068.5,350.0,16.0
max,0.125,138.0,2692.0,546.0,32.0


In [12]:
def get_unique_dict(series):
    """Map each distinct value of ``series`` to an integer code.

    Codes are assigned 0, 1, 2, ... in the order the values are returned by
    ``series.unique()``.

    Parameters
    ----------
    series : pandas.Series
        Values to encode.

    Returns
    -------
    dict
        ``{value: code}`` for every distinct value in ``series``.
    """
    return {value: code for code, value in enumerate(series.unique())}

In [13]:
def get_df(series, unique_dict, header):
    """Encode ``series`` through ``unique_dict`` into a one-column DataFrame.

    Parameters
    ----------
    series : iterable
        Values to encode; every value must be a key of ``unique_dict``.
    unique_dict : dict
        Mapping from value to integer code.
    header : str
        Name of the single column in the result.

    Returns
    -------
    pandas.DataFrame
        One column named ``header`` holding the codes in iteration order.
        The result gets a fresh default index; the index of ``series`` is
        not carried over.

    Raises
    ------
    KeyError
        If a value of ``series`` is missing from ``unique_dict``.
    """
    codes = [unique_dict[value] for value in series]
    return pd.DataFrame({header: codes})

In [14]:
# Integer-encode the beer names, then re-attach the row labels of df_dropna so
# the encoded column lines up with the other columns when they are combined.
name_dict = get_unique_dict(df_dropna['name'])
df_name = get_df(df_dropna['name'], name_dict, 'name').set_index(df_dropna.index)

In [15]:
# Integer-encode the beer styles, then re-attach the row labels of df_dropna so
# the encoded column lines up with the other columns when they are combined.
style_dict = get_unique_dict(df_dropna['style'])
df_style = get_df(df_dropna['style'], style_dict, 'style').set_index(df_dropna.index)

In [16]:
# Assemble the modelling table column-wise: the numeric columns come from
# df_dropna, the encoded name/style columns from df_name and df_style.
# All pieces share df_dropna's row labels.
df_learn = pd.concat(
    [
        df_dropna[['abv', 'ibu']],
        df_name[['name']],
        df_style[['style']],
        df_dropna[['ounces', 'brewery_id']],
    ],
    axis=1,
    join='outer',
    sort=True,
)

In [17]:
# Preview the assembled table; `name` and `style` now hold integer codes.
df_learn.head()

Unnamed: 0,abv,ibu,name,style,ounces,brewery_id
14,0.061,60.0,0,0,12.0,177
21,0.099,92.0,1,1,8.4,368
22,0.079,45.0,2,2,12.0,368
24,0.044,42.0,3,0,12.0,368
25,0.049,17.0,4,3,12.0,368


In [18]:
# Preview the encoded names with their re-attached row labels.
df_name.head()

Unnamed: 0,name
14,0
21,1
22,2
24,3
25,4


In [19]:
# Row count of the encoded name column.
len(df_name)

1403

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Features are selected explicitly, in the order later used for single
# predictions: abv, ibu, encoded style. The target is the encoded beer name.
X = df_learn[['abv', 'ibu', 'style']].values
y = df_learn['name'].values

In [22]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

In [23]:
from sklearn.svm import SVC

In [24]:
# RBF-kernel support vector classifier. max_iter puts a hard cap on solver
# iterations, so the fit may stop before it has converged.
svm_rbf = SVC(
    kernel='rbf',
    C=1.0,
    gamma=0.3,
    max_iter=1000,
)

In [25]:
# Train on the training split; fit() returns the estimator, whose repr is the
# cell output.
svm_rbf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.3, kernel='rbf',
  max_iter=1000, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
# Accuracy of the fitted classifier on the held-out split.
svm_rbf.score(X_test, y_test)

0.06413301662707839

In [27]:
from sklearn.metrics import accuracy_score
# Recompute the held-out accuracy from explicit predictions as a cross-check
# of the classifier's own score() result.
predicted = svm_rbf.predict(X_test)
expected = y_test
# The metric's signature is (y_true, y_pred): ground truth goes first. The
# value is unchanged for accuracy, but the order matches the documented
# contract and stays correct if another metric is substituted here.
accuracy_score(expected, predicted)

0.06413301662707839

In [28]:
# Query the classifier with one hand-picked sample. The values are given in
# the feature order used for training: abv, ibu, encoded style.
abv, ibu, style = 0.030, 17.0, 15
pred = svm_rbf.predict([(abv, ibu, style)])

In [29]:
# predict() returns one label per input sample; take the single label, which
# is an integer code from name_dict.
pred_beer = pred[0]
pred_beer

95

In [30]:
# Translate the predicted integer label back into a beer name. name_dict maps
# name -> code, so scan its items for the code; None means no name carries it.
name = next((k for k, v in name_dict.items() if v == pred_beer), None)

if name is None:
    # Previously the fallback was '' combined with a substring test, and since
    # '' is contained in every string that printed every row of the table.
    print('no beer name found for label', pred_beer)
else:
    # Codes were assigned per exact name, so match exactly rather than by
    # substring (which would also print beers belonging to other labels).
    for row in df_dropna[df_dropna['name'] == name].itertuples():
        print(row)
        print('style_id', style_dict[row.style])
Pandas(Index=166, abv=0.055, ibu=17.0, id=106, name='Ellie’s Brown Ale', style='American Brown Ale', brewery_id=37, ounces=12.0)
style_id 15


In [31]:
# correct = 0
# for x in df_learn.iterrows():
#     test = x[1].drop(['name', 'ounces', 'brewery_id']).values
#     pred = svm_rbf.predict([test])
#     if (int(x[1]['name'])) == pred[0]:
#         correct += 1
# print(correct / len(df_learn))