In [89]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wine-reviews/winemag-data_first150k.csv
/kaggle/input/wine-reviews/winemag-data-130k-v2.json
/kaggle/input/wine-reviews/winemag-data-130k-v2.csv


### Importing libraries

In [90]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re

#### Loading the dataset

In [91]:
# Read the 130k-review CSV from the Kaggle input directory, then preview its first three rows.
data = pd.read_csv("../input/wine-reviews/winemag-data-130k-v2.csv")
data.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [92]:
# Copy the description column into a new Series and cast every entry to str,
# so the text helpers below can call .split() on each row.
reviews = pd.Series(data.description.tolist()).astype(str)
reviews

0         Aromas include tropical fruit, broom, brimston...
1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Length: 129971, dtype: object

### Text pre-processing

In [93]:
def text_clean(reviews, keep_list):
    """Lower-case each review and blank out punctuation, token by token.

    Every review is split on whitespace. A token that appears in
    ``keep_list`` is copied through unchanged; any other token has each
    character outside ``[a-zA-Z0-9]`` replaced by a single space and is then
    lower-cased. The processed tokens are re-joined with single spaces, so a
    token that carried punctuation leaves extra spaces behind
    (``"fruit,"`` becomes ``"fruit "``).

    Parameters
    ----------
    reviews : iterable of str
        Raw review texts.
    keep_list : collection of str
        Tokens to leave exactly as written (for example ``"etc."``).

    Returns
    -------
    pandas.Series
        One cleaned string per input review, in input order, on a default
        0..n-1 index.
    """
    non_alnum = re.compile('[^a-zA-Z0-9]')

    def _clean_row(row):
        # Tokens in keep_list are protected; everything else is normalised.
        return ' '.join(
            word if word in keep_list else non_alnum.sub(' ', word).lower()
            for word in row.split()
        )

    # Collect into a plain list and build the Series once: growing a Series
    # row by row re-copies all previous rows on every step, and
    # Series.append is not available in pandas >= 2.0.
    return pd.Series([_clean_row(row) for row in reviews], dtype=object)

#### Removing stop words

In [94]:
def stopwords_removal(reviews):
    """Split each review on whitespace and drop English stop words.

    The stop list is NLTK's English list with the question words
    (who, what, when, why, how, which, where, whom) taken out, so those
    words are kept in the output.

    Returns a list holding one list of surviving tokens per review.
    """
    blocked = set(stopwords.words('english'))
    # Question words should survive filtering, so remove them from the stop list.
    for question_word in ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']:
        blocked.remove(question_word)

    filtered = []
    for text in reviews:
        kept = []
        for token in text.split():
            if token not in blocked:
                kept.append(token)
        filtered.append(kept)
    return filtered

#### Stemming

In [95]:
def stem(reviews, stem_type = None):
    """Stem every token of every tokenised review.

    ``stem_type == 'snowball'`` selects the English Snowball stemmer; any
    other value, including the default ``None``, selects the Porter stemmer.

    ``reviews`` is an iterable of token sequences; the result is a new list
    of lists with each token replaced by its stem.
    """
    if stem_type == 'snowball':
        chosen = SnowballStemmer(language = 'english')
    else:
        chosen = PorterStemmer()

    # Both stemmers expose the same .stem(word) call, so one pass serves both cases.
    to_stem = chosen.stem
    return [list(map(to_stem, tokens)) for tokens in reviews]

In [96]:
def preprocess(reviews, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    """Run the text pipeline over ``reviews`` and return cleaned strings.

    Stages, in order:

    1. ``cleaning`` - ``text_clean(reviews, keep_list)``.
    2. Tokenising - ``stopwords_removal`` when ``remove_stopwords`` is set,
       otherwise a plain whitespace split.
    3. ``lemmatization`` - ``lemmatize`` over the token lists.
    4. ``stemming`` - ``stem(tokens, stem_type)``; runs after lemmatization
       when both are enabled.

    Each flag is evaluated by truthiness. The helper functions are looked up
    when ``preprocess`` is called, so ``lemmatize`` only has to be defined
    before the first call that enables lemmatization.

    Parameters
    ----------
    reviews : iterable of str
        Review texts.
    keep_list : collection of str
        Tokens ``text_clean`` must leave untouched.
    stem_type : str or None
        Passed straight to ``stem``.

    Returns
    -------
    list of str
        One space-joined string of processed tokens per review.
    """
    if cleaning:
        reviews = text_clean(reviews, keep_list)

    if remove_stopwords:
        tokenised = stopwords_removal(reviews)
    else:
        tokenised = [text.split() for text in reviews]

    if lemmatization:
        tokenised = lemmatize(tokenised)

    if stemming:
        tokenised = stem(tokenised, stem_type)

    return [' '.join(tokens) for tokens in tokenised]

In [97]:
# Passed to preprocess as keep_list: text_clean copies these tokens through
# without lower-casing them or replacing their punctuation.
common_dot_words = ['etc.']

In [98]:
# Stemming run: clean, drop stop words, then Snowball-stem every token (no lemmatization).
reviews_with_stemming = preprocess(reviews, keep_list = common_dot_words, stemming = True, stem_type = "snowball", lemmatization = False, remove_stopwords = True)

  This is separate from the ipykernel package so we can avoid doing imports until


#### Lemmatization

In [99]:
def lemmatize(reviews):
    """Lemmatize every token of every tokenised review, treating each token as a verb.

    ``reviews`` is an iterable of token sequences; the result is a new list
    of lists with each token replaced by its WordNet lemma for ``pos='v'``.
    """
    wordnet = WordNetLemmatizer()
    lemmatized = []
    for tokens in reviews:
        lemmatized.append([wordnet.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized

In [100]:
# Lemmatization run: clean, drop stop words, then lemmatize every token (no stemming).
reviews_with_lemmatization = preprocess(reviews, keep_list = common_dot_words, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True)

  This is separate from the ipykernel package so we can avoid doing imports until


### Results

In [101]:
# Show the first review as it comes out of the lemmatization run.
print("String after lemmatiization: ", reviews_with_lemmatization[0])


String after lemmatiization:  aromas include tropical fruit broom brimstone dry herb palate overly expressive offer unripened apple citrus dry sage alongside brisk acidity


In [102]:
# Show the same first review as it comes out of the Snowball stemming run, for comparison.
print("After stemming: ", reviews_with_stemming[0])


After stemming:  aroma includ tropic fruit broom brimston dri herb palat over express offer unripen appl citrus dri sage alongsid brisk acid
