In [None]:
# Importing all the necessary libraries for this project.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
import re
from nltk import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
import string
import math

In [None]:
### Importing the scikit-learn tools used for feature extraction, model selection and modelling

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
# learning_curve is imported from sklearn.model_selection like the other
# model-selection helpers below; the old sklearn.learning_curve module no
# longer exists in current scikit-learn releases.
from sklearn.model_selection import learning_curve
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [None]:
# VIEW ALL COLUMNS & ALL TEXT IN EACH CELL

# Never truncate long text cells. Recent pandas spells "no limit" as None and
# rejects negative widths, while older releases only accept the legacy -1, so
# try the modern value first and fall back to the old one.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', 20)

# Set ipython's max row display (exact option name instead of a partial match)
pd.set_option('display.max_rows', 200)


In [None]:
# Load the two input CSV files; the first column of each file is used as the index.
final_review, whisky_pd = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('Final_Review.csv', 'Whisky_EDA.csv')
)

In [None]:
# Remove fully duplicated rows from the whisky table.
whisky_pd = whisky_pd.drop_duplicates()

In [None]:
#H/T Dale Wahl/Joe Klien for helping create confusion matrix.

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score

def eval_sklearn_model(y_true, predictions, average, model=None, X=None):
    """Print a confusion-matrix based evaluation of a classifier's predictions.

    For a binary problem (2x2 confusion matrix) the four cells are printed
    together with sensitivity and specificity. For any other number of classes
    those quantities are not defined by a single 2x2 corner of the matrix, so
    the full confusion matrix is printed instead. Accuracy and precision are
    always printed; ROC-AUC is printed only when ``model`` is given.

    Parameters
    ----------
    y_true : array-like
        True labels.
    predictions : array-like
        Labels predicted for the same samples.
    average : str
        Forwarded to ``precision_score`` as ``average`` (e.g. 'binary', 'micro').
    model : estimator, optional
        When given, ``model.predict_proba(X)`` is called and the probabilities
        in column 1 are scored with ``roc_auc_score``.
    X : array-like, optional
        Features matching ``y_true``; only used together with ``model``.

    Returns
    -------
    None
        All results are printed.
    """
    cnf_matrix = confusion_matrix(y_true, predictions)

    if cnf_matrix.shape == (2, 2):
        print('True Negative: ', cnf_matrix[0, 0], '| False Positive: ', cnf_matrix[0, 1])
        print('False Negative: ', cnf_matrix[1, 0], '| True Positive: ', cnf_matrix[1, 1], '\n')

        # Guard the denominators so a class that is absent from y_true yields
        # NaN instead of a division-by-zero warning.
        actual_positives = cnf_matrix[1, 0] + cnf_matrix[1, 1]
        actual_negatives = cnf_matrix[0, 1] + cnf_matrix[0, 0]
        sensitivity = cnf_matrix[1, 1] / actual_positives if actual_positives else float('nan')
        specificity = cnf_matrix[0, 0] / actual_negatives if actual_negatives else float('nan')

        print('Sensitivity (TP/ TP + FN): ', sensitivity)
        print('Specificity (TN/ TN + FP): ', specificity, '\n')
    else:
        # More (or fewer) than two classes: the TN/FP/FN/TP labels would be
        # misleading, so show the whole matrix.
        print('Confusion matrix (rows = true class, columns = predicted class):')
        print(cnf_matrix, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions, average = average))
    if model is not None:
        print('Roc-Auc: ', roc_auc_score(y_true, [x[1] for x in model.predict_proba(X)]))
    print('\n')

In [None]:
## DROP THE COLUMNS THAT WILL NOT BE USED FOR MODEL BUILDING

whisky_pd = whisky_pd.drop(['Volume_ml', 'Price_Bottle$', 'WhiskyURL', 'Bottler'], axis=1)

In [None]:
## Lower-case every free-text column

columns = ['WhiskyType', 'Description', 'Distillery', 'Whisky', 'Country', 'Region', 'TasteInfo']
# Existing columns are overwritten in place by assign(), so column order is kept.
whisky_pd = whisky_pd.assign(**{name: whisky_pd[name].str.lower() for name in columns})

In [None]:
## ONE-HOT ENCODE WHISKYTYPE (prefix 'w'); 'rest of world whisky' is the dropped reference level

whiskytype = pd.get_dummies(whisky_pd['WhiskyType'], prefix='w').drop('w_rest of world whisky', axis=1)


In [None]:
## ONE-HOT ENCODE DISTILLERY (prefix 'd'); 'johnnie walker' is the dropped reference level

distillerytype = pd.get_dummies(whisky_pd['Distillery'], prefix='d').drop('d_johnnie walker', axis=1)

In [None]:
## ONE-HOT ENCODE COUNTRY (prefix 'c'); 'japan' is the dropped reference level

countrytype = pd.get_dummies(whisky_pd['Country'], prefix='c').drop('c_japan', axis=1)

In [None]:
## ONE-HOT ENCODE REGION (prefix 'r'); 'kentucky' is the dropped reference level

regiontype = pd.get_dummies(whisky_pd['Region'], prefix='r').drop('r_kentucky', axis=1)

In [None]:
## ENCODE COLOURING AS A BINARY FLAG ('Yes' -> 1, 'No' -> 0)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['Colouring'] = whisky_pd['Colouring'].replace(('Yes', 'No'), (1, 0))

In [None]:
## ENCODE STATUS AS A BINARY FLAG ('Active' -> 1, 'Silent' -> 0)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['Status'] = whisky_pd['Status'].replace(('Active', 'Silent'), (1, 0))

In [None]:
## ENCODE AGETYPE AS A BINARY FLAG ('YAS' -> 1, 'NAS' -> 0)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['AgeType'] = whisky_pd['AgeType'].replace(('YAS', 'NAS'), (1, 0))

In [None]:
## ENCODE BOTTLINGTYPE AS A BINARY FLAG ('Distillery' -> 1, 'Independent' -> 0)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['BottlingType'] = whisky_pd['BottlingType'].replace(('Distillery', 'Independent'), (1, 0))

In [None]:
## ENCODE VINTAGEINFO AS A BINARY FLAG ('No Vintage' -> 0, 'Vintage' -> 1)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['VintageInfo'] = whisky_pd['VintageInfo'].replace(('No Vintage', 'Vintage'), (0, 1))

In [None]:
## ENCODE LIMITEDEDITIONS AS A BINARY FLAG ('No Info' -> 0, 'Limited' -> 1)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['LimitedEditions'] = whisky_pd['LimitedEditions'].replace(('No Info', 'Limited'), (0, 1))

In [None]:
## ENCODE CASKINFO AS A BINARY FLAG ('No Info' -> 0, 'SpecialCask' -> 1)

# Assign the result back to the frame: calling replace(inplace=True) on a
# column selection is chained in-place mutation and does not reliably write
# through to whisky_pd on newer pandas versions.
whisky_pd['CaskInfo'] = whisky_pd['CaskInfo'].replace(('No Info', 'SpecialCask'), (0, 1))

In [None]:
## GET A LIST OF FLAVORS FROM THE CHARACTER COLUMN

flavorlist = list(whisky_pd['Character'])

# Split every Character entry into lower-cased words with parentheses removed.
# Missing entries (NaN) and any other non-string value are skipped instead of
# crashing on .split().
completeflavors = []
for entry in flavorlist:
    if not isinstance(entry, str):
        continue
    for word in entry.split():
        completeflavors.append(word.lower().replace('(', '').replace(')', ''))

## REMOVE DUPLICATES (sorted so the order is the same on every run)
completeflavors = sorted(set(completeflavors))

## EXTRA FLAVOR WORDS ADDED BY HAND

otherflavors = ['peat', 'sherry', 'wine', 'rum', 'citrus', 'bourbon', 'cocoa', 'complex', 'creamy', 'oloroso', 'corn', 'fire', 'herbal', 'sea', 'maple', 'medicinal', 'moonshine', 'nectar', 'organic', 'roasted', 'chocolatey', 'peaty', 'smoky', 'nut', 'brine', 'briny', 'maritime']
completeflavors.extend(otherflavors)

## Add Character column to Description (rows without a Character get an empty suffix)

whisky_pd['Description'] = whisky_pd['Description'] + " " + whisky_pd['Character'].fillna('')



In [None]:
## REMOVE THE COLUMNS THAT ARE NOT USED AS MODEL INPUTS

removed_columns = ['#Ratings', 'Bottling Date', 'Cask Number', 'Cask Type', 'Chill Filtered', 'Series',
                   'Location', 'Owner', 'No of Bottles', 'Year closed', 'TasteInfo', 'Character']
whisky_pd = whisky_pd.drop(removed_columns, axis=1)

In [None]:
# Append the one-hot encoded frames column-wise, then renumber the rows 0..n-1.
frames = [whisky_pd, whiskytype, regiontype, countrytype, distillerytype]
whisky_pd = pd.concat(frames, axis=1)
whisky_pd = whisky_pd.reset_index(drop=True)

In [None]:
# The raw categorical columns are replaced by their dummy columns, so drop them.
whisky_pd = whisky_pd.drop(['WhiskyType', 'Country', 'Region', 'Distillery'], axis=1)

In [None]:
def stemmer(description, wordstemmer=None):
    """Strip punctuation from a description and stem every word.

    Parameters
    ----------
    description : str
        Free-text description. Any non-string value (e.g. NaN for a missing
        description) yields an empty string.
    wordstemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. Defaults to a new ``SnowballStemmer('english')``;
        pass an instance to reuse one stemmer across many calls.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    if not isinstance(description, str):
        return ''
    if wordstemmer is None:
        wordstemmer = SnowballStemmer('english')
    # Delete punctuation before stemming; otherwise a word with attached
    # punctuation is handed to the stemmer as-is.
    translator = str.maketrans('', '', string.punctuation)   ### REMOVE PUNCTUATION
    words = description.translate(translator).split()
    return ' '.join(wordstemmer.stem(word) for word in words)

# Run stemmer() on every description and keep the result in a new column.
whisky_pd['NewDescription'] = whisky_pd['Description'].apply(stemmer)

CHANGING THE STOP WORDS LIST HERE TO INCORPORATE A LOT OF WORDS THAT DO NOT ADD ANY INFORMATION

In [None]:
## Build the stop-word list before 'Description' is vectorised

from nltk.corpus import stopwords
import itertools

# Start from NLTK's English stop words.
new_stop = stopwords.words('english')

## Every word appearing in a column name is added as well, because the
## columns already carry that information.

columnnames = list(itertools.chain.from_iterable(
    name.lower().split() for name in whisky_pd.columns
))
new_stop.extend(columnnames)

## Number words.
## NOTE(review): units, teens and tens are defined here but never added to
## new_stop in this cell - confirm whether number words were meant to be filtered.

units = ['zero', 'one','two','three','four','five','six','seven','eight','nine']
teens = ['eleven','twelve','thirteen','fourteen','fifteen','sixteen',
         'seventeen','eighteen','nineteen']
tens = ['ten','twenty','thirty','forty','fifty','sixty','seventy',
        'eighty','ninety']

# Earlier experiments that also added numeric tokens (plain numbers 0-100,
# years 1900-2017, and variants with 's', 'year' or 'yo' suffixes) are disabled.

## Generic whisky vocabulary that appears everywhere.

otherwords = ['year', 'old', 'bottle', 'bottle', 'whisky', 'whiskey', "'s", 'bottled', 'distillery', 'character', 'characters', 'please', 'note', 'years']

new_stop = new_stop + otherwords

In [None]:
# Remove the dummy-column markers from the stop words, then de-duplicate and
# sort. str.replace removes the marker wherever it occurs in a token, not
# only at the start.
for dummy_marker in ("d_", "r_", "c_", "w_"):
    new_stop = [token.replace(dummy_marker, '') for token in new_stop]
new_stop = sorted(set(new_stop))

In [None]:
## START WITH COUNTVECTORIZER

# NewDescription holds stemmed text, so the stop words must also be available
# in stemmed form; a stop word whose stem differs from its surface form would
# otherwise never match a token and would slip through as a feature. The
# original spellings are kept as well, so nothing that was filtered before is
# let back in.
stop_stemmer = SnowballStemmer('english')
stemmed_stop = sorted(set(new_stop) | {stop_stemmer.stem(word) for word in new_stop})

countvec = CountVectorizer(stop_words=stemmed_stop, max_features=2000, ngram_range=(1,1))
wordfeatures = countvec.fit_transform(whisky_pd["NewDescription"])

# Newer scikit-learn releases expose get_feature_names_out() and no longer have
# get_feature_names(); support both.
try:
    feature_names = countvec.get_feature_names_out()
except AttributeError:
    feature_names = countvec.get_feature_names()

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
descriptions = pd.DataFrame(wordfeatures.toarray(), columns=feature_names)

In [None]:
## GET LIST OF COUNTVECTORIZED COLUMNS

# Column labels of the word-count frame.
cols = descriptions.columns

In [None]:
## LIST OF MOST FREQUENTLY USED WORDS

# Total count per word, most frequent first. sort_values() is called without
# the positional axis argument, which newer pandas no longer accepts.
# Despite its name, top_50 holds every word, not only the first 50.
top_50 = descriptions.sum(axis=0).sort_values(ascending=False)
top_50 = top_50.reset_index()
top_50.columns = ['Word', "Frequency"]

In [None]:
## ARRANGE THEM IN ALPHABETICAL ORDER FOR EASY VIEWING

# reindex(columns=...) replaces reindex_axis(..., axis=1), which no longer
# exists in current pandas.
descriptions = descriptions.reindex(columns=sorted(descriptions.columns))

In [None]:
## MARK THE WORD COLUMNS THAT ARE FLAVORS BY RENAMING THEM TO f_<word>

# NOTE(review): cols comes from the vectorised NewDescription (stemmed text)
# while completeflavors holds unstemmed words, so only flavors whose stem
# equals the word itself can match here - confirm this is intended.
oldflavors = [word for word in cols if word in completeflavors]
newflavors = ['f_' + word for word in oldflavors]

## APPLY THE RENAMING

columndict = dict(zip(oldflavors, newflavors))
descriptions = descriptions.rename(columns=columndict)

### STARTING WITH BINARY CLASS MODEL FIRST & THEN MOVING TO THE MULTICLASS MODEL LATER

In [None]:
### BINARY MODEL INPUTS. TODO (from the original author): fix Whisky and the missing ages later.

# Target: the binary price class.
ybin = whisky_pd['BinClass']

# Features: everything except free text, columns with missing values, the
# price itself and the two targets, plus the word-count columns.
bin_excluded = ['Description', 'Whisky', 'WhiskyAge', 'WhiskyVintage', 'Price_L', 'Class', 'NewDescription']
Xbin = pd.concat([whisky_pd.drop(bin_excluded, axis=1), descriptions], axis=1)
Xbin = Xbin.drop('BinClass', axis=1)

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
Xbin_train, Xbin_test, ybin_train, ybin_test = train_test_split(
    Xbin, ybin, test_size=0.3, random_state=42
)

# Print the train/test shapes: features first, then targets.
for train_part, test_part in ((Xbin_train, Xbin_test), (ybin_train, ybin_test)):
    print(train_part.shape, test_part.shape)

In [None]:
%%time
# liblinear is requested explicitly because the grid below tries both the 'l1'
# and 'l2' penalties; the default solver of recent scikit-learn releases
# ('lbfgs') does not support 'l1'. The seed matches the multiclass model.
logit = LogisticRegression(solver='liblinear', random_state=42)

param_grid = {'penalty' : ['l1', 'l2'],
                'C' : [10 ** i for i in range(-2, 5)]}

grid = GridSearchCV(logit, param_grid, cv=3)

grid.fit(Xbin_train, ybin_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on the whole training set, so no second fit is needed.
best_logit = grid.best_estimator_
model = best_logit

logit_predictions = best_logit.predict(Xbin_test)

score = best_logit.score(Xbin_test, ybin_test)

# The newline belongs to print(); as an extra format() argument it was ignored.
print("{} Score: {:0.3}".format('Logitistic Classifier', score), '\n')

print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

eval_sklearn_model(ybin_test, logit_predictions,'binary')

In [None]:
# One row per feature with its logistic-regression coefficient, largest first.
modelpdbin = pd.DataFrame(model.coef_.T, index=Xbin_train.columns, columns=['Coefficients'])
modelpdbin = modelpdbin.sort_values('Coefficients', ascending=False).reset_index()

# Keep only features whose name has no underscore.
has_underscore = modelpdbin['index'].str.contains('_')
modelpdbin = modelpdbin[~has_underscore]
modelpdbin

In [None]:
%%time

rfbin = RandomForestClassifier(random_state=42, n_jobs=-1)

# Fit the default-parameter forest as a baseline. GridSearchCV below works on
# clones, so rfbin itself stays fitted with the default hyper-parameters.
rfbin.fit(Xbin_train, ybin_train)


param_grid = {'n_estimators' : [100, 200, 500, 1000],
             'max_features' : [10, 50, 100, 500, 1000],
             'min_samples_leaf' : [1,5,10,50,100,200,500]}

grid = GridSearchCV(rfbin, param_grid, cv=3)

grid.fit(Xbin_train, ybin_train)


best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

# With the default refit=True the best estimator is already refitted on the
# whole training set; fitting it again would only repeat that (expensive) work.
best_rf = grid.best_estimator_

rf_predictions = best_rf.predict(Xbin_test)

score = best_rf.score(Xbin_test, ybin_test)

# The newline belongs to print(); as an extra format() argument it was ignored.
print("{} Score: {:0.3}".format('Random Forest Classifier', score), '\n')

print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)


In [None]:
### BINARY CLASS FEATURE IMPORTANCES FROM THE RANDOM FOREST

# NOTE(review): these importances come from rfbin, not from the
# grid-searched best estimator - confirm that this is the intended model.
importancesbin = pd.DataFrame(
    {'Feature': Xbin_train.columns, 'Importance': rfbin.feature_importances_},
    columns=['Feature', 'Importance'],
)
importancesbin = importancesbin.sort_values('Importance', ascending=False)

### MOVING TO THE MULTICLASS MODEL HERE

In [None]:
### MULTICLASS MODEL INPUTS. TODO (from the original author): fix Whisky and the missing ages later.

# Target: the price class, coded 0-1-2-3.
y = whisky_pd['Class']

# Features: everything except free text, columns with missing values, the
# price itself and the two targets, plus the word-count columns.
multi_excluded = ['Description', 'Whisky', 'WhiskyAge', 'WhiskyVintage', 'Price_L', 'BinClass', 'NewDescription']
X = pd.concat([whisky_pd.drop(multi_excluded, axis=1), descriptions], axis=1)
X = X.drop('Class', axis=1)

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Print the train/test shapes: features first, then targets.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

In [None]:
%%time
# liblinear is requested explicitly because the grid below tries both the 'l1'
# and 'l2' penalties; the default solver of recent scikit-learn releases
# ('lbfgs') does not support 'l1'.
# NOTE(review): check how the installed scikit-learn version treats liblinear
# on a problem with more than two classes.
logitmulti = LogisticRegression(solver='liblinear', random_state=42)

param_grid = {'penalty' : ['l1', 'l2'],
                'C' : [10 ** i for i in range(-2, 5)],
             'class_weight' : [None, 'balanced']}

grid = GridSearchCV(logitmulti, param_grid, cv=3)

grid.fit(X_train, y_train)


best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

# With the default refit=True, GridSearchCV has already refitted the best
# parameter combination on the whole training set, so no second fit is needed.
best_logit = grid.best_estimator_
multimodel = best_logit

logit_predictions_multi = best_logit.predict(X_test)

score = best_logit.score(X_test, y_test)

# The newline belongs to print(); as an extra format() argument it was ignored.
print("{} Score: {:0.3}".format('Logitistic Classifier', score), '\n')

print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

eval_sklearn_model(y_test, logit_predictions_multi, 'micro')

In [None]:
# Display the classes predicted for the test set.
logit_predictions_multi

In [None]:
# Display the true classes of the test set.
y_test

In [None]:
# Display the single row at position 141 of whisky_pd (as a one-row frame).
whisky_pd.iloc[[141]]

In [None]:
# One row per feature and one coefficient column per class, ordered by the
# class-0 coefficient (largest first).
class_labels = ['Class 0', 'Class 1', 'Class 2', 'Class 3']
modelpd = pd.DataFrame(multimodel.coef_.T, index=X_train.columns, columns=class_labels)
modelpd = modelpd.sort_values('Class 0', ascending=False).reset_index()
modelpd



In [None]:
%%time

rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Fit the default-parameter forest as a baseline. GridSearchCV below works on
# clones, so rf itself stays fitted with the default hyper-parameters.
rf.fit(X_train, y_train)


param_grid = {'n_estimators' : [100, 200, 500, 1000],
             'max_features' : [10, 50, 100, 500, 1000],
             'min_samples_leaf' : [1,5,10,50,100,200,500]}

grid = GridSearchCV(rf, param_grid, cv=3)

grid.fit(X_train, y_train)


best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}

# With the default refit=True the best estimator is already refitted on the
# whole training set; fitting it again would only repeat that (expensive) work.
best_rf = grid.best_estimator_

rf_predictions = best_rf.predict(X_test)

score = best_rf.score(X_test, y_test)

# The newline belongs to print(); as an extra format() argument it was ignored.
print("{} Score: {:0.3}".format('Random Forest Classifier', score), '\n')

print(grid.best_estimator_, '\n')
print('Best Hyperparameters we tested for', '\n', best_results)

In [None]:
## MULTICLASS FEATURE IMPORTANCES FROM THE RANDOM FOREST

# NOTE(review): these importances come from rf, not from the grid-searched
# best estimator - confirm that this is the intended model.
importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf.feature_importances_},
    columns=['Feature', 'Importance'],
)
importances = importances.sort_values('Importance', ascending=False)

## VISUALIZATIONS

In [None]:
import matplotlib.style as style

# The poster style is called 'seaborn-poster' in older matplotlib releases and
# 'seaborn-v0_8-poster' in newer ones; apply whichever this installation lists.
for poster_style in ('seaborn-poster', 'seaborn-v0_8-poster'):
    if poster_style in style.available:
        style.use(poster_style)
        break
style.use('fivethirtyeight')


In [None]:
#PLOTTING THE OVERALL DISTRIBUTION

# The labels contain a literal backslash before '$' so the dollar sign is not
# read as a math delimiter. The backslash is written with a raw string or as
# '\\' because a bare '\$' is an invalid Python escape sequence; the resulting
# text is unchanged.
plt.figure(figsize=(8,6))
plt.rcParams["font.family"] = "serif"
dist_plot = plt.hist(whisky_pd['Price_L'], bins = 25, linewidth=0.5, edgecolor = 'white')
plt.xticks(fontsize = 15) #FONT OF TICKS
plt.yticks(fontsize = 15)
plt.xlabel(r'Price/L in \$', fontsize = 15, weight = 'bold') ### XLABEL
plt.ylabel('# of Bottles', fontsize = 15, weight = 'bold')  ### YLABEL
plt.text(x=0, y=2800, s = 'The distribution of whisky prices on TWE', fontsize = 20, weight = 'bold') ## TOP HEADER
plt.text(x=0, y=2550, s = 'Whisky prices range from under \\$50 for everyday bottles to\n more than \\$50000 for rare collectible bottles from closed stills', fontsize = 15)
plt.axhline(y = 2, color = 'black', linewidth = 3, alpha = .7)
# Footer: data-source credit with a rule drawn above it.
plt.text(x=-8000, y=-500, s = '     Data Source : The WhiskyExchange | Anirudh Kashyap', fontsize = 12, color = 'black')
plt.text(x = -8000, y = -350,
    s = '_______________________________________________________________________________________',
    color = 'black', alpha = .7)

plt.show()


In [None]:
# Number of bottles per price class: labels and counts as parallel lists.
class_counts = whisky_pd['Class'].value_counts()
discusclass = list(class_counts.keys())
discusvalues = list(class_counts.values)

In [None]:
#PLOTTING THE CLASSIFICATION DISTRIBUTION

plt.figure(figsize=(8,6))
boxplot = plt.bar(discusclass, discusvalues, width = 0.4, align = 'center')
# Raw string: a bare '\$' is an invalid Python escape sequence; the label text is unchanged.
plt.xlabel(r'Price/L in \$', fontsize = 15, weight = 'bold')
plt.ylabel('# of Bottles', fontsize = 15, weight = 'bold')
plt.xticks(discusclass)
plt.axhline(y = 5, color = 'black', linewidth = 3, alpha = .7)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.text(x=-0.5, y=1300, s = 'Categorizing Whisky prices using DiSCUS\nclassification', fontsize = 20, weight = 'bold')
# Price range captions, one per bar.
plt.text(x=-0.3, y=500, s = 'under $50', fontsize = 15)
plt.text(x=0.5, y=700, s = 'more than $50\nless than $100', fontsize = 15)
plt.text(x=1.50, y=1200, s = 'more than $100\nless than $1000', fontsize = 15)
plt.text(x=2.50, y=700, s = 'more than $1000', fontsize = 15)
# Footer: data-source credit with a rule drawn above it.
plt.text(x=-0.75, y=-250, s = '     Data Source : The WhiskyExchange | Anirudh Kashyap', fontsize = 12, color = 'black')
plt.text(x = -0.75, y = -175,
    s = '__________________________________________________________________________________',
    color = 'black', alpha = .7)
plt.show()

In [None]:
# Number of bottles per binary price class: labels and counts as parallel lists.
binclass_counts = whisky_pd['BinClass'].value_counts()
binclass = list(binclass_counts.keys())
binvalues = list(binclass_counts.values)

In [None]:
#PLOTTING THE BINARY CLASS DISTRIBUTION

plt.figure(figsize=(7,6))
boxplot = plt.bar(binclass, binvalues, width = 0.4, align = 'center')
# Raw string: a bare '\$' is an invalid Python escape sequence; the label text is unchanged.
plt.xlabel(r'Price/L in \$', fontsize = 15, weight = 'bold')
plt.ylabel('# of Bottles', fontsize = 15, weight = 'bold')
plt.xticks(binclass, size='small')
plt.axhline(y = 5, color = 'black', linewidth = 3, alpha = .7)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.text(x=-0.5, y=2000, s = 'Categorizing Whisky prices using Anirudh\nclassification', fontsize = 20, weight = 'bold')
# Price range captions, one per bar.
plt.text(x=-0.15, y=1800, s = 'under $500', fontsize = 15)
plt.text(x=0.85, y=1100, s = 'more than $500', fontsize = 15)
# Footer: data-source credit with a rule drawn above it.
plt.text(x=-0.55, y=-350, s = '     Data Source : The WhiskyExchange | Anirudh Kashyap', fontsize = 12, color = 'black')
plt.text(x = -0.55, y = -250,
    s = '______________________________________________________________________________',
    color = 'black', alpha = .7)
plt.show()

In [None]:
# Exponentiate the per-class coefficients and put the feature name
# in a leading 'Factor' column.
odds_columns = ['Class 0', 'Class 1', 'Class 2', 'Class 3']
modelodds = modelpd[odds_columns].apply(lambda coefs: np.e ** coefs)
modelodds.insert(0, 'Factor', modelpd['index'])

In [None]:
## CHANGE THE COLUMN NAME BELOW TO RANK THE FACTORS FOR A DIFFERENT CLASS

# Rank by the class-3 values, then keep only factors whose name has no underscore.
modelodds = (
    modelodds
    .sort_values('Class 3', ascending=False)
    .reset_index(drop=True)
    .loc[lambda odds: ~odds['Factor'].str.contains('_')]
)

# Names of the 15 highest-ranked factors.
bars = tuple(modelodds['Factor'].head(15))

In [None]:
#PLOTTING THE CLASSIFICATION ODDS DISTRIBUTION

plt.figure(figsize=(12,6))

# Top 15 factors and their class-3 values, in the current order of modelodds.
top_factors = modelodds.head(15)
bars = tuple(top_factors['Factor'])
y_pos = np.arange(len(bars))
odds_number = list(top_factors['Class 3'])

barplot = plt.bar(y_pos, odds_number, color = 'darkgreen', alpha = 0.85)

# Axis labels and ticks; factor names are written vertically under the bars.
plt.xlabel('Factors', fontsize = 15, weight = 'bold')
plt.ylabel('Odd times as likely', fontsize = 15, weight = 'bold')
plt.xticks(y_pos, bars, rotation=90)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)

# Title and reading instructions.
plt.text(x=-0.5, y=3.25, s = 'Analyzing the factors that place whisky in class 3', fontsize = 20, weight = 'bold')
plt.text(x=-0.5, y=2.95, s = 'To read the chart: Whisky is (y-value) times as likely to be in class 3 (>$1000)\nif it contains (x-value)', fontsize = 15)

# Footer: data-source credit with a rule drawn above it.
plt.text(x=-2, y=-2.0, s = 'Data Source : The WhiskyExchange | Anirudh Kashyap', fontsize = 12, color = 'black')
plt.text(x = -2, y = -1.75,
         s = '___________________________________________________________________________________________________________',
         color = 'black', alpha = .7)
plt.show()