In [79]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [80]:
import re
import nltk

In [81]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [82]:
# Load the wine-review CSV once. The cells below read this frame as `data`
# (data["description"], data["title"]), so bind that name here as well;
# otherwise a fresh "Restart & Run All" stops with a NameError on `data`.
raw = pd.read_csv("winemag-data-130k-v2.csv")
data = raw

In [9]:
reviews = data["description"]

In [10]:
reviews.head()

0    Aromas include tropical fruit, broom, brimston...
1    This is ripe and fruity, a wine that is smooth...
2    Tart and snappy, the flavors of lime flesh and...
3    Pineapple rind, lemon pith and orange blossom ...
4    Much like the regular bottling from 2012, this...
Name: description, dtype: object

### Categorical

In [None]:
# Encoding categorical data
# NOTE(review): `X` and `y` are not assigned in any cell shown in this notebook,
# and this cell has no execution count - it looks like template code that was
# never run here. Confirm what `X` / `y` should be before executing it.
# Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Overwrite column 0 of X with the integer labels produced by the encoder
# (assumes X is a 2-D array - TODO confirm).
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# NOTE(review): the `categorical_features` argument only exists in older
# scikit-learn releases; presumably newer versions need ColumnTransformer
# instead - verify against the installed version.
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

## Text pre-processing

### Remove punctuation

In [52]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [54]:
r = reviews[1]

In [55]:
nopunc = [char for char in r if char not in string.punctuation]

In [56]:
nopunc = ''.join(nopunc)

In [57]:
nopunc

'This is ripe and fruity a wine that is smooth while still structured Firm tannins are filled out with juicy red berry fruits and freshened with acidity Its  already drinkable although it will certainly be better from 2016'

In [40]:
# method 2: swap every non-letter for a space, then lower-case and split on whitespace
letters_only = re.sub('[^a-zA-Z]', ' ', reviews[0])
r = letters_only.lower().split()

### Extract year from title

In [74]:
t = data["title"][0]

In [75]:
t

'Nicosia 2013 Vulk√† Bianco  (Etna)'

In [78]:
# Keep the whitespace-separated tokens of the title that consist of exactly four digits.
four_digits = re.compile('^[0-9]{4}$')
[token for token in t.split() if four_digits.search(token)]

['2013']

### Stemming

In [41]:
ps = PorterStemmer()

### Remove stopwords

In [None]:
stopwords.words('english')

In [None]:
# NOTE(review): same statement as the cell under "### Stemming"; `ps` is not
# used by any cell shown in this notebook.
ps = PorterStemmer()

In [58]:
def text_process(mess, stop_words=None):
    """
    Strip punctuation and stopwords from a piece of text.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in string.punctuation)
    2. Remove all stopwords (compared case-insensitively)
    3. Returns a list of the cleaned text

    Parameters
    ----------
    mess : str
        The text to clean.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to NLTK's English stopword list
        (``stopwords.words('english')``).

    Returns
    -------
    list of str
        The remaining whitespace-separated words, in their original order
        and with their original casing.
    """
    # Build the lookup once per call. Previously the stopword list was
    # re-fetched and scanned linearly for every single token, which dominates
    # the run time when this is applied to a large corpus.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = frozenset(stop_words)

    # Drop punctuation characters and glue the rest back into one string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stop_set]

In [59]:
r = reviews[1]

In [65]:
# NOTE(review): head(0) selects zero rows, so text_process is never called and
# r0 is an empty Series (see the recorded output below). Presumably head(1) or
# head() was intended - confirm.
r0 = data["description"].head(0).apply(text_process)

In [66]:
r0

Series([], Name: description, dtype: object)

In [68]:
text_process(r)

['ripe',
 'fruity',
 'wine',
 'smooth',
 'still',
 'structured',
 'Firm',
 'tannins',
 'filled',
 'juicy',
 'red',
 'berry',
 'fruits',
 'freshened',
 'acidity',
 'already',
 'drinkable',
 'although',
 'certainly',
 'better',
 '2016']

### Vectorization

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
# bag of words
# NOTE(review): `corpus` is only assigned in a cell further down the notebook
# (the one containing `corpus.append(r)`), so this cell fails on a
# top-to-bottom run of a fresh kernel.
# The vocabulary is capped at 1500 terms and the counts are converted to a dense array.
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [None]:
# Might take awhile...
# NOTE(review): `messages` is not defined in any cell shown here; this and the
# following `messages`-based cells look carried over from an SMS example (the
# markdown further down mentions an "SMS corpus"). Presumably they should read
# the wine review column instead - confirm.
# text_process is passed as the analyzer, i.e. it is what turns each document into tokens.
bow_transformer = CountVectorizer(analyzer=text_process).fit(messages['message'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

In [None]:
message4 = messages['message'][3]
print(message4)

In [None]:
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

In [None]:
print(bow_transformer.get_feature_names()[4073])
print(bow_transformer.get_feature_names()[9570])

Now we can use **.transform** on our fitted Bag-of-Words (bow) transformer to transform the entire DataFrame of messages. Let's go ahead and check that the bag-of-words counts for the entire SMS corpus form a large, sparse matrix:

In [None]:
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

In [None]:
# Non-zero entries as a percentage of all cells in the bag-of-words matrix.
n_rows, n_cols = messages_bow.shape
sparsity = 100.0 * messages_bow.nnz / (n_rows * n_cols)
print('sparsity: {}'.format(round(sparsity)))

In [None]:
# another way
# ngram_range = (2,2): the vocabulary is built from word pairs (bigrams) only.
# NOTE(review): `raw100` is not defined in any cell shown here - presumably a
# 100-row sample of `raw`; confirm before running.
# NOTE(review): get_feature_names() is replaced by get_feature_names_out() in
# newer scikit-learn releases - check the installed version.
vectorizer = CountVectorizer(ngram_range = (2,2))
X1 = vectorizer.fit_transform(raw100["title"])
features = (vectorizer.get_feature_names()) 

In [None]:
print("\n\nFeatures : \n", features) 

In [None]:
print("\n\nX1 : \n", X1.toarray()) 

### TF-IDF

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

In [None]:
# Print the fitted IDF weight of two specific tokens, located through their
# column index in the bag-of-words vocabulary.
# NOTE(review): each lookup raises KeyError when the token is not in the
# vocabulary; 'u' and 'university' presumably come from the SMS example rather
# than the wine reviews - confirm.
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

To transform the entire bag-of-words corpus into TF-IDF corpus at once:

In [None]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

In [48]:
# One-document corpus holding just the review currently bound to `r`.
corpus = [r]

In [50]:
X

array([[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
      dtype=int64)

In [None]:
# bigram
#https://www.geeksforgeeks.org/tf-idf-for-bigrams-trigrams/

## Creating a Data Pipeline