In [49]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from nltk.tokenize import sent_tokenize
from sklearn.naive_bayes import MultinomialNB
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from tqdm import tqdm
import pandas as pd
import numpy as np
import ast
import nltk
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krsou\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\krsou\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [50]:
beer = pd.read_csv(r"beer_for_FE.csv")

In [51]:
beer.sample()

Unnamed: 0,beer/ABV,beer/brewerId,beer/name,beer/style,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,user/birthdayRaw,user/gender
20627,4.8,850,Wild Wacky Wit,Witbier,4.0,4.0,4.5,3.5,4.0,Pale yellow with appropriate cloudiness and a ...,"{'min': 14, 'hour': 1, 'mday': 10, 'sec': 56, ...",1980.0,


### One-hot encoding for `"beer/name"` and `"beer/style"`

In [52]:
# One-hot encoding for "beer/name"
beer_name_encoded = pd.get_dummies(beer["beer/name"], prefix="beer_name")
beer_style_encoded = pd.get_dummies(beer["beer/style"], prefix="beer_style")

# Concatenate the one-hot encoded columns with the original DataFrame
beer = pd.concat(
    [beer, beer_name_encoded.iloc[:, :beer_name_encoded.shape[1] - 1]], axis=1)
beer = pd.concat(
    [beer, beer_style_encoded.iloc[:, :beer_style_encoded.shape[1] - 1]], axis=1)

# Drop the original "beer/name" and "beer/style" column
beer.drop("beer/name", axis=1, inplace=True)
beer.drop("beer/style", axis=1, inplace=True)

beer.shape

(37490, 1792)

### Handling `"review/timeStruct"`

In [53]:
beer["review/timeStruct"] = beer["review/timeStruct"].apply(
    lambda x: np.nan if pd.isna(x) else ast.literal_eval(x))

### Normalize the dictionary into separate columns using keys as column names

In [54]:
normalized_cols = pd.json_normalize(beer["review/timeStruct"])
normalized_cols

Unnamed: 0,min,hour,mday,sec,year,wday,mon,isdst,yday
0,38,3,16,10,2008,1,12,0,351
1,38,23,8,58,2008,4,8,0,221
2,7,18,26,2,2004,4,11,0,331
3,7,1,20,5,2011,0,6,0,171
4,51,6,12,48,2011,5,3,0,71
...,...,...,...,...,...,...,...,...,...
37485,56,23,10,1,2008,3,4,0,101
37486,45,5,10,14,2010,6,1,0,10
37487,3,1,25,36,2003,5,10,0,298
37488,52,19,29,33,2011,5,1,0,29


##### Assigning index as "id" because original index is not accurate

In [55]:
beer["id"] = np.array(i for i in range(beer.shape[0]))
normalized_cols["id"] = np.array(i for i in range(normalized_cols.shape[0]))

In [56]:
beer = pd.merge(beer, normalized_cols)
beer.head(2)

Unnamed: 0,beer/ABV,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,review/timeStruct,user/birthdayRaw,...,id,min,hour,mday,sec,year,wday,mon,isdst,yday
0,5.0,14338,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,"{'min': 38, 'hour': 3, 'mday': 16, 'sec': 10, ...",0.0,...,0,38,3,16,10,2008,1,12,0,351
1,11.0,395,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,"{'min': 38, 'hour': 23, 'mday': 8, 'sec': 58, ...",0.0,...,1,38,23,8,58,2008,4,8,0,221


####  Droping "review/timeStruct" column. because there is no more need of it.

In [57]:
beer.drop(columns=["review/timeStruct"], inplace=True)
beer.shape

(37490, 1801)

In [58]:
beer

Unnamed: 0,beer/ABV,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,review/text,user/birthdayRaw,user/gender,...,id,min,hour,mday,sec,year,wday,mon,isdst,yday
0,5.00,14338,4.0,4.0,4.0,4.0,4.0,Pours a clouded gold with a thin white head. N...,0.0,,...,0,38,3,16,10,2008,1,12,0,351
1,11.00,395,4.0,3.5,3.5,3.5,3.0,12oz bottle into 8oz snifter.\t\tDeep ruby red...,0.0,,...,1,38,23,8,58,2008,4,8,0,221
2,4.70,365,3.5,4.0,3.5,3.5,3.5,First enjoyed at the brewpub about 2 years ago...,0.0,Male,...,2,7,18,26,2,2004,4,11,0,331
3,4.40,1,3.0,3.0,2.5,3.0,3.0,First thing I noticed after pouring from green...,1976.0,Male,...,3,7,1,20,5,2011,0,6,0,171
4,4.40,1417,4.0,3.0,3.0,3.5,2.5,A: pours an amber with a one finger head but o...,0.0,,...,4,51,6,12,48,2011,5,3,0,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37485,5.50,3268,4.0,3.5,3.5,3.5,3.5,12 oz brown longneck with no freshness dating....,0.0,,...,37485,56,23,10,1,2008,3,4,0,101
37486,8.50,1199,4.5,4.0,3.5,4.5,4.5,A - A bright red with a maroon-amber hue; mini...,0.0,,...,37486,45,5,10,14,2010,6,1,0,10
37487,4.75,394,4.0,3.5,4.0,4.5,4.0,Sampled on tap at Redbones.\t\tThis marzen sty...,0.0,,...,37487,3,1,25,36,2003,5,10,0,298
37488,11.20,1199,4.0,4.0,4.0,5.0,5.0,Pours a black body with a brown head that very...,0.0,,...,37488,52,19,29,33,2011,5,1,0,29


### Working on `"review/text"`

In [59]:
beer["review/text"][55]

"With Nemesis 2010 arriving next week, it's finally time to try this one, now a year old. Can't wait to try it! Oh yeah, gotta thank paco1029384756 and imperialking for sending me a couple of these bad boys!\t\tIt pours almost no head, just a light fizzy covering with big bubbles, with a super hazy murky yeast filled reddish orange almost golden copper.\t\tThe nose is awesome, with tons of malts, looooaded with malts, wheat malts especially, some dark fruits, but nothing I can specifically pick out, perhaps a touch of peach or apricot, some juicy aromas too, with just a hint of bitterness. Also plenty of bourbon, some light oak and vanilla, but bourbon just right out on the nose.\t\tTaste is very similar to the aroma, starting again with tons of malts, especially wheat, it gets very sweet too, perhaps a little too sweet. A little bit of a boozy taste, warmth, and touch of tingly feel. Also plenty bourbon, which I think is less that the nose was, but still there, bringing a little more 

##### Lowering The Documents

In [60]:
beer["review/text"] = beer["review/text"].apply(lambda x: x.lower())

##### Removing The Stop Words

In [61]:
sw = stopwords.words("english")


def remove_stopwords(sen):
    return " ".join([word for word in sen.split() if word not in sw])


beer["review/text"] = beer["review/text"].apply(remove_stopwords)

In [62]:
beer["review/text"][55]

"nemesis 2010 arriving next week, finally time try one, year old. can't wait try it! oh yeah, gotta thank paco1029384756 imperialking sending couple bad boys! pours almost head, light fizzy covering big bubbles, super hazy murky yeast filled reddish orange almost golden copper. nose awesome, tons malts, looooaded malts, wheat malts especially, dark fruits, nothing specifically pick out, perhaps touch peach apricot, juicy aromas too, hint bitterness. also plenty bourbon, light oak vanilla, bourbon right nose. taste similar aroma, starting tons malts, especially wheat, gets sweet too, perhaps little sweet. little bit boozy taste, warmth, touch tingly feel. also plenty bourbon, think less nose was, still there, bringing little oak vanilla. fruit, light fruit, little apple apricot perhaps, think i'm picking hat really, fruit really hard label specifics. couple iffy flavors too. first strange bitterness, almost minerality bitterness would come roasted malts, seem hops, maybe touch... metall

##### Making Corpus so that can be use further for modeling the Word2Vec

In [63]:
corpus = list()
for doc in beer['review/text']:
    for sent in sent_tokenize(doc):
        corpus.append(simple_preprocess(sent))
len(corpus)

397672

##### Modeling Word2Vec

In [64]:
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=6)
len(model.wv.index_to_key)

40278

In [65]:
len(model.wv[corpus[0]])

6

##### Vectorization of Documents

In [67]:
def document_vector(document):
    return np.mean(model.wv[simple_preprocess(document)], axis=0)


x = []

for document in tqdm(beer["review/text"]):

    x.append(document_vector(document))

100%|██████████| 37490/37490 [00:08<00:00, 4638.00it/s]


In [68]:
reviews = pd.DataFrame(
    x, columns=[f"review/text-{i}" for i in range(len(x[0]))])
reviews.head()

Unnamed: 0,review/text-0,review/text-1,review/text-2,review/text-3,review/text-4,review/text-5,review/text-6,review/text-7,review/text-8,review/text-9,...,review/text-90,review/text-91,review/text-92,review/text-93,review/text-94,review/text-95,review/text-96,review/text-97,review/text-98,review/text-99
0,0.089004,0.422065,-0.201858,-0.017968,-0.30597,0.212429,0.012411,-0.065674,-0.153521,-0.262662,...,0.017024,-0.307924,0.090578,0.20499,-0.150781,0.226938,0.076331,0.407467,0.342286,-0.122595
1,0.23243,0.291413,-0.06513,-0.402699,0.203945,-0.067767,-0.076542,-0.124616,-0.218011,-0.377633,...,0.241999,-0.365195,-0.02078,0.26244,-0.154483,0.111203,-0.199695,0.382408,0.129511,-0.492966
2,0.253319,0.378973,0.092469,0.258483,0.24313,-0.345678,0.038351,-0.027167,0.0925,-0.204187,...,0.520932,-0.411671,0.094994,0.444975,-0.19835,-0.114731,0.030218,0.08386,-0.044264,-0.605932
3,0.132004,0.492431,0.116301,0.245045,-0.083347,-0.355751,0.46338,-0.472135,-0.22492,-0.231353,...,0.273106,-0.231966,0.406228,0.5043,0.137679,0.104936,-0.394003,-0.138998,0.110955,-0.000434
4,-6.2e-05,0.398575,0.079889,-0.098852,-0.2231,-0.313552,-0.527659,-0.442219,-0.177667,-0.193283,...,0.267248,-0.388855,-0.091385,0.251287,-0.313317,0.316631,-0.283718,0.458991,-0.100753,-0.106418


In [69]:
reviews.shape, beer.shape

((37490, 100), (37490, 1801))

##### Merging Vectorized Documents into Original DataFrame

In [70]:
beer = pd.concat([beer, reviews], axis=1).drop(columns=["review/text"])

In [71]:
beer

Unnamed: 0,beer/ABV,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,user/birthdayRaw,user/gender,"beer_name_""The Wind Cried Mari..."" Scottish Heather Ale",...,review/text-90,review/text-91,review/text-92,review/text-93,review/text-94,review/text-95,review/text-96,review/text-97,review/text-98,review/text-99
0,5.00,14338,4.0,4.0,4.0,4.0,4.0,0.0,,False,...,0.017024,-0.307924,0.090578,0.204990,-0.150781,0.226938,0.076331,0.407467,0.342286,-0.122595
1,11.00,395,4.0,3.5,3.5,3.5,3.0,0.0,,False,...,0.241999,-0.365195,-0.020780,0.262440,-0.154483,0.111203,-0.199695,0.382408,0.129511,-0.492966
2,4.70,365,3.5,4.0,3.5,3.5,3.5,0.0,Male,False,...,0.520932,-0.411671,0.094994,0.444975,-0.198350,-0.114731,0.030218,0.083860,-0.044264,-0.605932
3,4.40,1,3.0,3.0,2.5,3.0,3.0,1976.0,Male,False,...,0.273106,-0.231966,0.406228,0.504300,0.137679,0.104936,-0.394003,-0.138998,0.110955,-0.000434
4,4.40,1417,4.0,3.0,3.0,3.5,2.5,0.0,,False,...,0.267248,-0.388855,-0.091385,0.251287,-0.313317,0.316631,-0.283718,0.458991,-0.100753,-0.106418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37485,5.50,3268,4.0,3.5,3.5,3.5,3.5,0.0,,False,...,0.235011,-0.281995,0.135551,0.249779,-0.096675,0.026082,-0.359951,0.128446,0.043574,-0.366790
37486,8.50,1199,4.5,4.0,3.5,4.5,4.5,0.0,,False,...,0.360767,-0.262791,-0.039468,0.346618,-0.214537,-0.060027,-0.607675,0.229711,0.268611,-0.025718
37487,4.75,394,4.0,3.5,4.0,4.5,4.0,0.0,,False,...,0.003659,-0.481469,-0.314323,-0.029599,-0.305596,-0.036522,-0.101157,0.379458,0.580949,-0.288529
37488,11.20,1199,4.0,4.0,4.0,5.0,5.0,0.0,,False,...,0.390467,-0.303697,0.324539,0.461305,-0.269546,0.125866,-0.487757,0.072669,0.168629,0.206380


### Filling Missing Values of `"user/gender"` Using Model-based imputation - `Multinomial Naive Bayes`

In [72]:
train = beer[~beer["user/gender"].isna()]
test = beer[beer["user/gender"].isna()]

##### Label Encoding of `"user/gender"` column as Target column

In [73]:
encoder = LabelEncoder()
encoded_data = encoder.fit_transform(train["user/gender"])
encoder.classes_

array(['Female', 'Male'], dtype=object)

In [74]:
xTrainG = train.drop(columns=["id", "user/gender"])
yTrainG = encoded_data

In [75]:
xTestG = test.drop(columns=["id", "user/gender"])

In [76]:
xTrainG.shape, yTrainG.shape, xTestG.shape

((15308, 1898), (15308,), (22182, 1898))

##### Scaling The Values before Training

In [77]:
scaler = MinMaxScaler()
xTrainGScaled = scaler.fit_transform(xTrainG)
xTestGScaled = scaler.transform(xTestG)

##### Training the Data

In [78]:
mnb = MultinomialNB()
mnb.fit(xTrainGScaled, yTrainG)

##### Filling with Predictions

In [79]:
yPredG = mnb.predict(xTestGScaled)
yPredG

array([1, 1, 1, ..., 1, 1, 1])

In [80]:
pd.DataFrame(encoder.inverse_transform(yPredG)).unstack().unique()

array(['Male'], dtype=object)

Here, All Predictions of Gender are MALE which may not be Accurate, so Fill the Missing Values with `Random` word.

In [81]:
beer["user/gender"] = beer["user/gender"].astype("category")
beer["user/gender"] = beer["user/gender"].cat.add_categories("Random")
beer["user/gender"].fillna(value="Random", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  beer["user/gender"].fillna(value="Random", inplace=True)


In [82]:
beer

Unnamed: 0,beer/ABV,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,user/birthdayRaw,user/gender,"beer_name_""The Wind Cried Mari..."" Scottish Heather Ale",...,review/text-90,review/text-91,review/text-92,review/text-93,review/text-94,review/text-95,review/text-96,review/text-97,review/text-98,review/text-99
0,5.00,14338,4.0,4.0,4.0,4.0,4.0,0.0,Random,False,...,0.017024,-0.307924,0.090578,0.204990,-0.150781,0.226938,0.076331,0.407467,0.342286,-0.122595
1,11.00,395,4.0,3.5,3.5,3.5,3.0,0.0,Random,False,...,0.241999,-0.365195,-0.020780,0.262440,-0.154483,0.111203,-0.199695,0.382408,0.129511,-0.492966
2,4.70,365,3.5,4.0,3.5,3.5,3.5,0.0,Male,False,...,0.520932,-0.411671,0.094994,0.444975,-0.198350,-0.114731,0.030218,0.083860,-0.044264,-0.605932
3,4.40,1,3.0,3.0,2.5,3.0,3.0,1976.0,Male,False,...,0.273106,-0.231966,0.406228,0.504300,0.137679,0.104936,-0.394003,-0.138998,0.110955,-0.000434
4,4.40,1417,4.0,3.0,3.0,3.5,2.5,0.0,Random,False,...,0.267248,-0.388855,-0.091385,0.251287,-0.313317,0.316631,-0.283718,0.458991,-0.100753,-0.106418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37485,5.50,3268,4.0,3.5,3.5,3.5,3.5,0.0,Random,False,...,0.235011,-0.281995,0.135551,0.249779,-0.096675,0.026082,-0.359951,0.128446,0.043574,-0.366790
37486,8.50,1199,4.5,4.0,3.5,4.5,4.5,0.0,Random,False,...,0.360767,-0.262791,-0.039468,0.346618,-0.214537,-0.060027,-0.607675,0.229711,0.268611,-0.025718
37487,4.75,394,4.0,3.5,4.0,4.5,4.0,0.0,Random,False,...,0.003659,-0.481469,-0.314323,-0.029599,-0.305596,-0.036522,-0.101157,0.379458,0.580949,-0.288529
37488,11.20,1199,4.0,4.0,4.0,5.0,5.0,0.0,Random,False,...,0.390467,-0.303697,0.324539,0.461305,-0.269546,0.125866,-0.487757,0.072669,0.168629,0.206380


##### One-Hot Encoding `"user/gender"` column

In [83]:
encoder = OneHotEncoder(sparse_output=False)
userGenderEncoded = encoder.fit_transform(
    np.array(beer["user/gender"]).reshape(-1, 1))
userGenderEncoded

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

##### Merging Encoded Data into Original DataFrame

In [84]:
beer = pd.concat([beer.drop(columns=["id", "user/gender"]), pd.DataFrame(userGenderEncoded,
                 columns=["user/female", "user/male", "user/random"]).drop(columns=["user/random"])], axis=1)

In [85]:
beer

Unnamed: 0,beer/ABV,beer/brewerId,review/appearance,review/aroma,review/overall,review/palate,review/taste,user/birthdayRaw,"beer_name_""The Wind Cried Mari..."" Scottish Heather Ale",beer_name_'Pooya Porter,...,review/text-92,review/text-93,review/text-94,review/text-95,review/text-96,review/text-97,review/text-98,review/text-99,user/female,user/male
0,5.00,14338,4.0,4.0,4.0,4.0,4.0,0.0,False,False,...,0.090578,0.204990,-0.150781,0.226938,0.076331,0.407467,0.342286,-0.122595,0.0,0.0
1,11.00,395,4.0,3.5,3.5,3.5,3.0,0.0,False,False,...,-0.020780,0.262440,-0.154483,0.111203,-0.199695,0.382408,0.129511,-0.492966,0.0,0.0
2,4.70,365,3.5,4.0,3.5,3.5,3.5,0.0,False,False,...,0.094994,0.444975,-0.198350,-0.114731,0.030218,0.083860,-0.044264,-0.605932,0.0,1.0
3,4.40,1,3.0,3.0,2.5,3.0,3.0,1976.0,False,False,...,0.406228,0.504300,0.137679,0.104936,-0.394003,-0.138998,0.110955,-0.000434,0.0,1.0
4,4.40,1417,4.0,3.0,3.0,3.5,2.5,0.0,False,False,...,-0.091385,0.251287,-0.313317,0.316631,-0.283718,0.458991,-0.100753,-0.106418,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37485,5.50,3268,4.0,3.5,3.5,3.5,3.5,0.0,False,False,...,0.135551,0.249779,-0.096675,0.026082,-0.359951,0.128446,0.043574,-0.366790,0.0,0.0
37486,8.50,1199,4.5,4.0,3.5,4.5,4.5,0.0,False,False,...,-0.039468,0.346618,-0.214537,-0.060027,-0.607675,0.229711,0.268611,-0.025718,0.0,0.0
37487,4.75,394,4.0,3.5,4.0,4.5,4.0,0.0,False,False,...,-0.314323,-0.029599,-0.305596,-0.036522,-0.101157,0.379458,0.580949,-0.288529,0.0,0.0
37488,11.20,1199,4.0,4.0,4.0,5.0,5.0,0.0,False,False,...,0.324539,0.461305,-0.269546,0.125866,-0.487757,0.072669,0.168629,0.206380,0.0,0.0


In [86]:
beer.columns

Index(['beer/ABV', 'beer/brewerId', 'review/appearance', 'review/aroma',
       'review/overall', 'review/palate', 'review/taste', 'user/birthdayRaw',
       'beer_name_"The Wind Cried Mari..." Scottish Heather Ale',
       'beer_name_'Pooya Porter',
       ...
       'review/text-92', 'review/text-93', 'review/text-94', 'review/text-95',
       'review/text-96', 'review/text-97', 'review/text-98', 'review/text-99',
       'user/female', 'user/male'],
      dtype='object', length=1900)

In [87]:
beer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37490 entries, 0 to 37489
Columns: 1900 entries, beer/ABV to user/male
dtypes: bool(1781), float32(100), float64(9), int64(10)
memory usage: 83.4 MB


## Exporting Data For EDA_2

In [88]:
beer.to_csv(path_or_buf="beer_EDA_2.csv", index=False)

# Now, GoTo "`EDA_2.ipynb`"