In [3]:
# import libraries 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings 
warnings.filterwarnings('ignore')
import nltk
import string

# Load Data

In [4]:
# Read mfp_data.csv into a DataFrame and preview its first ten rows.
raw_data = pd.read_csv('mfp_data.csv')
raw_data.head(10)

Unnamed: 0,user,date,post
0,avskk,"October 24, 2018 2:32PM",Autumn has fully arrived and I'm cooking about...
1,janejellyroll,"October 24, 2018 2:37PM",Breakfast:\nChinese sticky rice with tempeh sa...
2,firlena227,"October 24, 2018 4:07PM",Ooh I love butternut squash risotto (apart fro...
3,amy19355,"November 2, 2018 3:30PM",Can of low sodium chicken\nwith wild rice soup...
4,Sunshine_And_Sand,"November 2, 2018 3:38PM",Breakfast - zoats with melted American cheese ...
5,avskk,"November 2, 2018 4:42PM","Today's going to be a good, if carby, day, I t..."
6,MelanieCN77,"November 3, 2018 2:00PM",Breakfast: home made plain yoghurt with bluebe...
7,nicsflyingcircus,"November 3, 2018 4:31PM",Breakfast: the Protein Plate at our local dine...
8,MelanieCN77,"November 3, 2018 5:10PM",OK well my nice plan went out the window. I'm ...
9,rainingribbons,"November 3, 2018 8:01PM",Breakfast: 2 homemade cinnamon rolls\nMorning ...


In [5]:
# Show the dtype pandas assigned to each column.
raw_data.dtypes

user    object
date    object
post    object
dtype: object

In [6]:
# List the column labels of the loaded frame.
raw_data.columns

Index(['user', 'date', 'post'], dtype='object')

In [7]:
# (rows, columns) of the loaded frame.
raw_data.shape

(21, 3)

In [8]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    21 non-null     object
 1   date    21 non-null     object
 2   post    21 non-null     object
dtypes: object(3)
memory usage: 632.0+ bytes


# Text Processing (NLP)

**Convert to LowerCase**

In [9]:
# Normalise every post to lower case (overwrites the `post` column in place).
raw_data['post'] = list(map(lambda text: text.lower(), raw_data['post']))

print(raw_data['post'])

0     autumn has fully arrived and i'm cooking about...
1     breakfast:\nchinese sticky rice with tempeh sa...
2     ooh i love butternut squash risotto (apart fro...
3     can of low sodium chicken\nwith wild rice soup...
4     breakfast - zoats with melted american cheese ...
5     today's going to be a good, if carby, day, i t...
6     breakfast: home made plain yoghurt with bluebe...
7     breakfast: the protein plate at our local dine...
8     ok well my nice plan went out the window. i'm ...
9     breakfast: 2 homemade cinnamon rolls\nmorning ...
10    it's my birthday, so i ate all of the things a...
11    seltzermint555\nwrote:\n»\nit's my birthday, s...
12    breakfast:\nchickpea soup with rice and kale, ...
13    breakfast:\ncranberry muffin with a little yog...
14    avskk\nwrote:\n»\nseltzermint555\nwrote:\n»\ni...
15    my big bowl for lunch today combined:\n1/2 cup...
16    no breakfast, my usual brunch: 110 calorie eng...
17    breakfast: fried egg open faced sandwich o

**Tokenization**

In [10]:
# Tokenize every post twice: once into words, once into sentences.

from nltk.tokenize import sent_tokenize, word_tokenize

# word tokenization: one list of word/punctuation tokens per post
token_post = list(map(word_tokenize, raw_data['post']))
print(token_post)

print('-------------------------------------------')

# sentence tokenization: one list of sentences per post
sent_token = list(map(sent_tokenize, raw_data['post']))
print(sent_token)

[['autumn', 'has', 'fully', 'arrived', 'and', 'i', "'m", 'cooking', 'about', 'it', '.', 'breakfast', ':', 'toaster', 'waffles', 'with', 'yogurt', 'butter', 'and', 'syrup', ',', 'fried', 'ham', ',', 'coffee', '.', 'lunch', ':', 'leftover', 'chili', 'with', 'cheese', ',', 'sour', 'cream', ',', 'and', 'a', 'few', 'corn', 'chips', '.', 'i', "'m", 'having', 'a', 'fairly', 'small', 'portion', 'of', 'this', '(', 'it', "'s", '...', 'not', 'at', 'all', '``', 'light', "''", ')', ',', 'so', 'if', 'i', "'m", 'still', 'hungry', 'i', "'ll", 'have', 'some', 'broccoli', ',', 'baby', 'carrots', ',', 'and', 'cherry', 'tomatoes', '.', 'dinner', ':', 'leftover', 'vegetable-loaded', 'chicken-tortellini', 'soup', '.', 'snacks', ':', 'clementines', ',', 'grapes', ',', 'or', 'pomegranate', '.'], ['breakfast', ':', 'chinese', 'sticky', 'rice', 'with', 'tempeh', 'sausage', ',', 'cucumber', 'salad', 'lunch', ':', 'going', 'out', 'to', 'lunch', 'with', 'co-workers', '.', '.', '.', 'probably', 'a', 'black', 'bean'

**Remove Punctuation**

In [11]:
# remove punctuation

import re

# `reg` matches, and `sub` below deletes, any of:
#   - an '@' followed by lowercase letters/digits (user handles),
#   - any single character that is not 0-9, a-z, space or tab
#     (punctuation and other symbols),
#   - a `scheme://...` URL.
# It is written as a raw string because \w, \/ and \S are regex escapes,
# not Python string escapes; a plain string literal triggers an
# "invalid escape sequence" warning on recent Python versions.
reg = re.compile(r'(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

# Clean each token; tokens that consisted only of stripped characters
# become '' and are dropped.
no_punc = [
    [cleaned
     for cleaned in (reg.sub('', token) for token in post_tokens)
     if cleaned]
    for post_tokens in token_post
]

print(no_punc)

[['autumn', 'has', 'fully', 'arrived', 'and', 'i', 'm', 'cooking', 'about', 'it', 'breakfast', 'toaster', 'waffles', 'with', 'yogurt', 'butter', 'and', 'syrup', 'fried', 'ham', 'coffee', 'lunch', 'leftover', 'chili', 'with', 'cheese', 'sour', 'cream', 'and', 'a', 'few', 'corn', 'chips', 'i', 'm', 'having', 'a', 'fairly', 'small', 'portion', 'of', 'this', 'it', 's', 'not', 'at', 'all', 'light', 'so', 'if', 'i', 'm', 'still', 'hungry', 'i', 'll', 'have', 'some', 'broccoli', 'baby', 'carrots', 'and', 'cherry', 'tomatoes', 'dinner', 'leftover', 'vegetableloaded', 'chickentortellini', 'soup', 'snacks', 'clementines', 'grapes', 'or', 'pomegranate'], ['breakfast', 'chinese', 'sticky', 'rice', 'with', 'tempeh', 'sausage', 'cucumber', 'salad', 'lunch', 'going', 'out', 'to', 'lunch', 'with', 'coworkers', 'probably', 'a', 'black', 'bean', 'burger', 'with', 'guacamole', 'and', 'some', 'fruit', 'salad', 'dinner', 'lettuce', 'basil', 'mint', 'and', 'radishes', 'with', 'thaistyle', 'dressing', 'sweet

**Remove Stopwords**

In [12]:
# Removing the stopwords
from nltk.corpus import stopwords

# Build the English stop-word collection once, as a set.
# Calling stopwords.words('english') inside the loop re-creates the list
# for every token and then scans it linearly.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not stop words, preserving their order.
no_stop = [
    [word for word in post if word not in english_stopwords]
    for post in no_punc
]

print(no_stop)

[['autumn', 'fully', 'arrived', 'cooking', 'breakfast', 'toaster', 'waffles', 'yogurt', 'butter', 'syrup', 'fried', 'ham', 'coffee', 'lunch', 'leftover', 'chili', 'cheese', 'sour', 'cream', 'corn', 'chips', 'fairly', 'small', 'portion', 'light', 'still', 'hungry', 'broccoli', 'baby', 'carrots', 'cherry', 'tomatoes', 'dinner', 'leftover', 'vegetableloaded', 'chickentortellini', 'soup', 'snacks', 'clementines', 'grapes', 'pomegranate'], ['breakfast', 'chinese', 'sticky', 'rice', 'tempeh', 'sausage', 'cucumber', 'salad', 'lunch', 'going', 'lunch', 'coworkers', 'probably', 'black', 'bean', 'burger', 'guacamole', 'fruit', 'salad', 'dinner', 'lettuce', 'basil', 'mint', 'radishes', 'thaistyle', 'dressing', 'sweet', 'potato', 'stew', 'love', 'autumn', 'cooking', 'got', 'butternut', 'squash', 'csa', 'box', 'thinking', 'want', 'weekend', 'thinking', 'maybe', 'roasted', 'butternut', 'squash', 'risotto'], ['ooh', 'love', 'butternut', 'squash', 'risotto', 'apart', 'chopping', 'butternut', 'squash',

**Stemming and Lemmatization**

In [13]:
# Stemming and Lemmatizing

from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# The stemmer is instantiated so the name stays available, but it is NOT
# applied: only the lemmatized form of each token is kept, so computing a
# stem per token would be wasted work.
# NOTE(review): if stemmed tokens are actually wanted, feed
# pstem.stem(word) into the result below -- that will change the output.
pstem = PorterStemmer()
wlem = WordNetLemmatizer()

# Lemmatize every remaining token, one list per post.
preproc_text = [
    [wlem.lemmatize(word) for word in text]
    for text in no_stop
]

print(preproc_text)

[['autumn', 'fully', 'arrived', 'cooking', 'breakfast', 'toaster', 'waffle', 'yogurt', 'butter', 'syrup', 'fried', 'ham', 'coffee', 'lunch', 'leftover', 'chili', 'cheese', 'sour', 'cream', 'corn', 'chip', 'fairly', 'small', 'portion', 'light', 'still', 'hungry', 'broccoli', 'baby', 'carrot', 'cherry', 'tomato', 'dinner', 'leftover', 'vegetableloaded', 'chickentortellini', 'soup', 'snack', 'clementine', 'grape', 'pomegranate'], ['breakfast', 'chinese', 'sticky', 'rice', 'tempeh', 'sausage', 'cucumber', 'salad', 'lunch', 'going', 'lunch', 'coworkers', 'probably', 'black', 'bean', 'burger', 'guacamole', 'fruit', 'salad', 'dinner', 'lettuce', 'basil', 'mint', 'radish', 'thaistyle', 'dressing', 'sweet', 'potato', 'stew', 'love', 'autumn', 'cooking', 'got', 'butternut', 'squash', 'csa', 'box', 'thinking', 'want', 'weekend', 'thinking', 'maybe', 'roasted', 'butternut', 'squash', 'risotto'], ['ooh', 'love', 'butternut', 'squash', 'risotto', 'apart', 'chopping', 'butternut', 'squash', 'part', '

In [14]:
# Build the final data set: a copy of raw_data whose `post` column is
# replaced by the preprocessed token lists.
new_col = pd.Series(preproc_text)
data = raw_data.assign(post=new_col)
data.head(10)

Unnamed: 0,user,date,post
0,avskk,"October 24, 2018 2:32PM","[autumn, fully, arrived, cooking, breakfast, t..."
1,janejellyroll,"October 24, 2018 2:37PM","[breakfast, chinese, sticky, rice, tempeh, sau..."
2,firlena227,"October 24, 2018 4:07PM","[ooh, love, butternut, squash, risotto, apart,..."
3,amy19355,"November 2, 2018 3:30PM","[low, sodium, chicken, wild, rice, soup, added..."
4,Sunshine_And_Sand,"November 2, 2018 3:38PM","[breakfast, zoats, melted, american, cheese, s..."
5,avskk,"November 2, 2018 4:42PM","[today, going, good, carby, day, think, breakf..."
6,MelanieCN77,"November 3, 2018 2:00PM","[breakfast, home, made, plain, yoghurt, bluebe..."
7,nicsflyingcircus,"November 3, 2018 4:31PM","[breakfast, protein, plate, local, diner, 4, s..."
8,MelanieCN77,"November 3, 2018 5:10PM","[ok, well, nice, plan, went, window, 10am, roa..."
9,rainingribbons,"November 3, 2018 8:01PM","[breakfast, 2, homemade, cinnamon, roll, morni..."


In [15]:
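# save processed data to csv (currently disabled; a later cell reads mfp_1_proc_data.csv, so uncomment and run this once if that file does not exist yet)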
#data.to_csv(r'mfp_1_proc_data.csv', index=False,header=True, encoding='utf-8')

In [23]:
# Function to streamline NLP Process

def nlp(file):
    """Preprocess the ``post`` column of ``<file>.csv`` and save ``<file>_proc.csv``.

    Each post goes through the following steps:

    1. lower-casing,
    2. word tokenization with ``word_tokenize``,
    3. removal of @handles, ``scheme://`` URLs and every character other
       than ``0-9``, ``a-z``, space and tab; tokens left empty are dropped,
    4. removal of English stop words,
    5. lemmatization with ``WordNetLemmatizer`` (no stemming is applied).

    Parameters
    ----------
    file : str
        Path of the input CSV *without* the ``.csv`` extension. The file
        must contain a ``post`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the input frame in which ``post`` holds one list of
        processed tokens per row. The first five rows are printed and the
        frame is written to ``<file>_proc.csv`` without the index.
        ``to_csv`` stores each token list as its text representation.
    """
    # Load Dataset
    raw_data = pd.read_csv(file + '.csv')

    # Convert to lowercase. Missing posts are read as NaN; treat them as
    # empty text so they yield an empty token list instead of raising.
    posts = raw_data['post'].fillna('').astype(str).str.lower()

    # Raw string: \w, \/ and \S are regex escapes, not string escapes.
    reg = re.compile(r'(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

    # Built once per call; set membership avoids reloading and scanning
    # the stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    wlem = WordNetLemmatizer()

    preproc_text = []
    for post in posts:
        final_text = []
        for token in word_tokenize(post):
            cleaned = reg.sub('', token)
            # Skip tokens that were pure punctuation, and stop words.
            if cleaned and cleaned not in stop_words:
                final_text.append(wlem.lemmatize(cleaned))
        preproc_text.append(final_text)

    # create final data set
    data = raw_data.copy()
    data['post'] = pd.Series(preproc_text, index=data.index, dtype=object)
    print(data.head(5))

    # save processed data to csv
    data.to_csv(file + '_proc.csv', index=False, header=True, encoding='utf-8')

    return data

In [24]:
# Run the preprocessing function on each data set; nlp() writes the result
# of every run to '<name>_proc.csv'.

files = ['mfp_data34', 'mfp_data38', 'mfp_data41']

for dataset in files:
    nlp(dataset)

                user                    date  \
0          nooshi713  March 12, 2019 10:05PM   
1           JennJ323   March 13, 2019 1:56PM   
2     seltzermint555   March 13, 2019 3:26PM   
3  Sunshine_And_Sand   March 13, 2019 7:39PM   
4          nooshi713   March 13, 2019 7:41PM   

                                                post  
0  [breakfast, tea, 12, tbsp, agave, 1, scrambled...  
1  [breakfast, homestyle, frozen, waffle, x2, oni...  
2  [breakfast, oatmeal, peanut, butter, strawberr...  
3  [breakfast, oatmeal, shredded, zucchini, melte...  
4  [breakfast, 2, scrambled, egg, tangerine, tea,...  
             user                   date  \
0        JennJ323   April 2, 2019 3:12PM   
1        lalee115   April 2, 2019 3:16PM   
2      my3sons527  April 2, 2019 11:17PM   
3        JennJ323   April 4, 2019 7:07PM   
4  seltzermint555   April 4, 2019 7:50PM   

                                                post  
0  [breakfast, 2, turkey, sausage, link, hard, bo...  
1  [je

# EDA (Exploratory Data Analysis)

## Term Frequency-Inverse Document Frequency (TF-IDF)

In [15]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer as tfi
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

# Restrict the vocabulary to 20 terms and let the vectorizer drop its own
# English stop-word list as well.
tf_idf_vect = tfi(stop_words = 'english',
                 max_features = 20)

x = data['post']

# The vectorizer expects one string per document, so each token list is
# joined back into a space-separated string.
tf_idf = tf_idf_vect.fit_transform([' '.join(post) for post in x])
tf_idf_norm = normalize(tf_idf)
tf_idf_array = tf_idf_norm.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(tf_idf_vect, 'get_feature_names_out'):
    feature_names = tf_idf_vect.get_feature_names_out()
else:
    feature_names = tf_idf_vect.get_feature_names()

tf_data = pd.DataFrame(tf_idf_array,
                     columns = feature_names)
tf_data.head(15)

Unnamed: 0,bean,breakfast,butter,carrot,cheese,coffee,cup,day,dinner,eat,green,lunch,potato,rice,salad,sandwich,slice,snack,soup,yogurt
0,0.0,0.208507,0.372521,0.372521,0.310581,0.328878,0.0,0.0,0.218433,0.0,0.0,0.240169,0.0,0.0,0.0,0.0,0.0,0.328878,0.372521,0.349332
1,0.323362,0.20501,0.0,0.0,0.0,0.0,0.0,0.0,0.214769,0.0,0.0,0.472281,0.305372,0.343473,0.610744,0.0,0.0,0.0,0.0,0.0
2,0.428627,0.271747,0.0,0.0,0.404781,0.0,0.0,0.485507,0.284684,0.0,0.0,0.313012,0.404781,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.395381,0.0,0.0,0.370769,0.0,0.0,0.0,0.0,0.0,0.0,0.741538,0.0,0.0,0.0,0.0,0.395381,0.0
4,0.307131,0.389438,0.0,0.0,0.580087,0.0,0.0,0.0,0.407978,0.326232,0.0,0.224287,0.0,0.0,0.0,0.0,0.0,0.307131,0.0,0.0
5,0.176792,0.112085,0.0,0.400506,0.500869,0.176792,0.0,0.200253,0.117421,0.187787,0.429286,0.129105,0.0,0.187787,0.0,0.166956,0.400506,0.0,0.0,0.0
6,0.0,0.178669,0.0,0.319212,0.266136,0.281815,0.299342,0.0,0.187175,0.0,0.0,0.2058,0.266136,0.299342,0.266136,0.0,0.0,0.563629,0.0,0.0
7,0.0,0.270614,0.0,0.0,0.403093,0.0,0.0,0.0,0.0,0.453387,0.0,0.0,0.0,0.0,0.403093,0.403093,0.483483,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.33968,0.215356,0.0,0.0,0.320783,0.0,0.0,0.384757,0.225608,0.0,0.412406,0.0,0.320783,0.0,0.0,0.0,0.384757,0.33968,0.0,0.0


In [16]:
# Summary statistics (count, mean, std, quartiles, min/max) per TF-IDF term.
tf_data.describe()

Unnamed: 0,bean,breakfast,butter,carrot,cheese,coffee,cup,day,dinner,eat,green,lunch,potato,rice,salad,sandwich,slice,snack,soup,yogurt
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,0.134441,0.180743,0.106685,0.097916,0.165691,0.104769,0.142628,0.115616,0.166466,0.105542,0.104327,0.152526,0.127577,0.124409,0.135004,0.153115,0.109338,0.155193,0.118667,0.111502
std,0.193761,0.133612,0.184546,0.161732,0.206819,0.140567,0.234354,0.204858,0.119389,0.158826,0.205938,0.128872,0.158679,0.205052,0.178166,0.240044,0.17934,0.218261,0.204566,0.166624
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.129994,0.0,0.0,0.0,0.0,0.0,0.0,0.117421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.187824,0.0,0.0,0.0,0.0,0.0,0.0,0.196765,0.0,0.0,0.196326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.300619,0.208507,0.232249,0.232249,0.320783,0.281815,0.31468,0.200253,0.214769,0.285561,0.0,0.219532,0.266136,0.285561,0.279773,0.279773,0.335568,0.328878,0.335568,0.31468
max,0.650093,0.572184,0.538514,0.400506,0.580087,0.328878,0.856684,0.686279,0.407978,0.453387,0.721096,0.472281,0.426148,0.741538,0.610744,1.0,0.483483,0.605878,0.696746,0.479318


In [4]:
# Reload the processed data from disk and preview it.
# NOTE(review): judging by the displayed output, `post` comes back as text
# such as "['autumn', 'fully', ...]" rather than as Python lists -- parse it
# (e.g. with ast.literal_eval) before treating the values as token lists.
stuff = pd.read_csv('mfp_1_proc_data.csv')
stuff.head()

Unnamed: 0,user,date,post
0,avskk,"October 24, 2018 2:32PM","['autumn', 'fully', 'arrived', 'cooking', 'bre..."
1,janejellyroll,"October 24, 2018 2:37PM","['breakfast', 'chinese', 'sticky', 'rice', 'te..."
2,firlena227,"October 24, 2018 4:07PM","['ooh', 'love', 'butternut', 'squash', 'risott..."
3,amy19355,"November 2, 2018 3:30PM","['low', 'sodium', 'chicken', 'wild', 'rice', '..."
4,Sunshine_And_Sand,"November 2, 2018 3:38PM","['breakfast', 'zoats', 'melted', 'american', '..."
