# About This Notebook

### Import Libraries

In [107]:
import pandas as pd
import numpy as np

# Importing Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Importing NLP Libraries
import nltk
import contractions
import spacy
import scattertext as st
# from scattertext import CorpusFromPandas, produce_scattertext_explorer

### Read in Data

In [37]:
wine = pd.read_csv('../data/wine.csv', )

In [38]:
beer = pd.read_csv('../data/beer.csv')

In [39]:
wine.head()

Unnamed: 0,title,selftext,subreddit
0,Resources for a newbie home winemaker,Resources for a newbie home winemaker I want t...,winemaking
1,A question about kit wine,A question about kit wine I’ve made quite a fe...,winemaking
2,Riesling - My First Wine,"Riesling - My First Wine Hi Everyone,\n\nI am ...",winemaking
3,Persimmon Wine Straining?,Persimmon Wine Straining? Hello! This is my fi...,winemaking
4,Wire used for Trellis,Wire used for Trellis Hey all - I'm planting s...,winemaking


In [40]:
beer.head()

Unnamed: 0,title,selftext,subreddit
0,Sitrep Monday,"Sitrep Monday You've had a week, what's your s...",Homebrewing
1,"Daily Q &amp; A! - December 28, 2020","Daily Q &amp; A! - December 28, 2020 Welcome t...",Homebrewing
2,Boosting ABV with table sugar,Boosting ABV with table sugar I’ve read about ...,Homebrewing
3,Is this pellicile or yeast?,Is this pellicile or yeast? For some backgroun...,Homebrewing
4,Using blowoff tube instead of airlock for carboy?,Using blowoff tube instead of airlock for carb...,Homebrewing


Dropping title column from both DFs. The text from the column has already been merged to selftext in the previous notebook. 

In [41]:
wine.drop(columns = 'title', inplace = True)

In [42]:
beer.drop(columns = 'title', inplace = True)

In [43]:
wine.columns

Index(['selftext', 'subreddit'], dtype='object')

In [44]:
beer.columns

Index(['selftext', 'subreddit'], dtype='object')

# Concatenating 
wine and beer dataframes into one df on the row index 

In [45]:
posts_df = pd.concat([wine, beer], ignore_index = True)

In [46]:
posts_df.head()

Unnamed: 0,selftext,subreddit
0,Resources for a newbie home winemaker I want t...,winemaking
1,A question about kit wine I’ve made quite a fe...,winemaking
2,"Riesling - My First Wine Hi Everyone,\n\nI am ...",winemaking
3,Persimmon Wine Straining? Hello! This is my fi...,winemaking
4,Wire used for Trellis Hey all - I'm planting s...,winemaking


In [47]:
posts_df.tail()

Unnamed: 0,selftext,subreddit
4297,"Any input on this recipe 6 lb - Pale Malt, Mar...",Homebrewing
4298,"Bottling Tepache Hello brewers, \n\na couple d...",Homebrewing
4299,Wiring a spa panel for ebiab gfi I've been all...,Homebrewing
4300,What can I ferment at 60-65°F? The basement in...,Homebrewing
4301,Does a decoction mash with wheat leave a bread...,Homebrewing


### Expanding Contractions
* The code for this section has been adapted from this article in Towards Data Science. 
* https://towardsdatascience.com/preprocessing-text-data-using-python-576206753c28


Creating a new column 'no_contraction'.
* by applying a lambda function to 'selftext' column that takes in a word from a post in a series that has been split.
* if the word is a contraction it converts it to two words using the **fix function** from the **Contrations Library** imported above. 

In [48]:
posts_df['no_contraction'] = posts_df['selftext'].apply(lambda x: 
[contractions.fix(word) for word in x.split()])

Converting the lists in 'no_contraction' series back to string objects. 
* creating a list comprehension that joins mappings of str conversions to a list for every list in 'no_contraction' series. 

In [50]:
posts_df['no_contraction_str'] = [' '.join(map(str, l)) for l in posts_df['no_contraction']]

In [51]:
posts_df.head()

Unnamed: 0,selftext,subreddit,no_contraction,no_contraction_str
0,Resources for a newbie home winemaker I want t...,winemaking,"[Resources, for, a, newbie, home, winemaker, I...",Resources for a newbie home winemaker I want t...
1,A question about kit wine I’ve made quite a fe...,winemaking,"[A, question, about, kit, wine, I have, made, ...",A question about kit wine I have made quite a ...
2,"Riesling - My First Wine Hi Everyone,\n\nI am ...",winemaking,"[Riesling, -, My, First, Wine, Hi, Everyone,, ...","Riesling - My First Wine Hi Everyone, I am goi..."
3,Persimmon Wine Straining? Hello! This is my fi...,winemaking,"[Persimmon, Wine, Straining?, Hello!, This, is...",Persimmon Wine Straining? Hello! This is my fi...
4,Wire used for Trellis Hey all - I'm planting s...,winemaking,"[Wire, used, for, Trellis, Hey, all, -, I am, ...",Wire used for Trellis Hey all - I am planting ...


Make 'selftext' column the 'no_contraction_str' column

In [55]:
posts_df['selftext'] = posts_df['no_contraction_str']

### Create new 'posts_nc' DataFrame 
* that does not contain contractions. 

In [57]:
posts_nc = posts_df.drop(columns = ['no_contraction', 'no_contraction_str'])

In [59]:
posts_nc.head()

Unnamed: 0,selftext,subreddit
0,Resources for a newbie home winemaker I want t...,winemaking
1,A question about kit wine I have made quite a ...,winemaking
2,"Riesling - My First Wine Hi Everyone, I am goi...",winemaking
3,Persimmon Wine Straining? Hello! This is my fi...,winemaking
4,Wire used for Trellis Hey all - I am planting ...,winemaking


# Using spaCY for Preprocessing and EDA
* https://spacy.io/usage/spacy-101

Loading Spacy Core English Language Library
* 'sm' indicates we are using the small version of this library. 

In [61]:
nlp = spacy.load('en_core_web_sm')

Parse posts using nlp library

Creating a list of parsed words using spacy to determine the amount of parsed objects in the entire corpus of posts. 

In [72]:
parsed_list = []
for post in posts_nc['selftext']:
    for word in nlp(post):
        parsed_list.append(word)

In [80]:
# The count of parsed objects in the entire corpus of posts. 
len(parsed_list)

535980

Creating a DF out of the parsed data. 

In [95]:
corpus = pd.DataFrame(parsed_list)

In [96]:
corpus.shape

(535980, 1)

In [97]:
corpus.head()

Unnamed: 0,0
0,Resources
1,for
2,a
3,newbie
4,home


In [98]:
corpus.rename(columns = {0: 'parsed'}, inplace = True)

In [99]:
corpus.columns

Index(['parsed'], dtype='object')

In [94]:
lemma_list = []
for token in parsed['parsed']:
    parsed['lemma'] = lemma_list.append(token.lemma_)


KeyboardInterrupt: 

In [None]:
parsed.head()

In [74]:
posts_nc['parsed'] = posts_nc['selftext'].apply(nlp)

In [100]:
# loading spacy stop_words
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(f'Number of Stop Words: {len(stopwords)}')
print(f'First Ten Stop Words: {list(stopwords)[:10]}')

Number of Stop Words: 326
First Ten Stop Words: ['yours', 'me', 'hers', 'off', 'seemed', 'whole', 'they', 'give', 'i', 'only']


In [108]:
#creating a corpus to use for our scattertext plot
corpus1 = st.CorpusFromPandas(posts_nc, category_col = 'subreddit', text_col = 'selftext', nlp = nlp).build()

AttributeError: 'CorpusDF' object has no attribute 'token_'

In [113]:
list(corpus1.get_scaled_f_scores_vs_background().index[:40])

['fermenter',
 'fermenting',
 'fermentation',
 'carboy',
 'keg',
 'ferment',
 'airlock',
 'abv',
 'hydrometer',
 'homebrewing',
 'brew',
 'winemaking',
 'yeast',
 'bottling',
 'kegs',
 'campden',
 'neipa',
 'brewed',
 'hops',
 'malt',
 'brewing',
 'batches',
 'wort',
 'mash',
 'carboys',
 'biab',
 'carbonation',
 'kveik',
 'sparge',
 'kegging',
 'keezer',
 'fermented',
 'kegerator',
 'citra',
 'cider',
 'racking',
 'ipa',
 'racked',
 'pilsner',
 'ibu']

In [None]:
corpus1[]