In [1]:
# imports
import numpy as np
import pandas as pd

In [2]:
df = pd.read_json("clean_tweets.json")
print('dataframe length:', len(df))
df.columns

dataframe length: 6524


Index(['username', 'description', 'date', 'location', 'following', 'followers',
       'totaltweets', 'retweet', 'retweetcount', 'text', 'hashtags'],
      dtype='object')

In [3]:
from nltk.tokenize import word_tokenize

def search_words(tweet, l):
    tokens = word_tokenize(tweet)
    res = []
    
    for w in l:
        if w in tokens:
            res.append(w)
            
    if len(res) == 0:
        return None
    return res

In [4]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer() 

brand = ['champagne', 'chardonnay', 'pinot', 'cabernet', 'noir', 'merlot', 'airen', 'sauvignon', 'tempranillo', 'syrah', 'garnacha', 'trebbiano']
color = ['red', 'white', 'rose', 'sparkling', 'redwine', 'whitewine', 'rosewine', 'sparklingwine']
country = ['italy', 'italian', 'spain', 'spanish', 'france', 'french', 'chile', 'chilean', 'australia', 'australian']

stem_brand = [stemmer.stem(w) for w in brand]
stem_color = [stemmer.stem(w) for w in color]
stem_country = [stemmer.stem(w) for w in country]

In [5]:
brand = []
color = []
country = []

for tweet in df['text']:
    brand.append(search_words(tweet, stem_brand))
    color.append(search_words(tweet, stem_color))
    country.append(search_words(tweet, stem_country))

df['brand'] = brand
df['color'] = color
df['country'] = country
    
df.head()

Unnamed: 0,username,description,date,location,following,followers,totaltweets,retweet,retweetcount,text,hashtags,brand,color,country
0,1267886436080107521,Get your store up: https://t.co/qf78jAU5eo,2022-10-03 18:06:57,United States,15,41,6223,False,0,repurposedupcycled green wine bottle base glas...,"[Repurposedupcycled, Glasses, set, Bottle, win...",,,
1,406344750,"The Best Tasting Wine Shop, a retail store wit...",2022-10-03 18:05:51,302 Pier Avenue,1939,995,2839,False,0,thanks much linda uncorked uncorkedwineshops w...,"[Uncorked, UncorkedWineShops, wineshop, shoplo...",,,
2,83460300,Love wine... Love champagne... Love life...,2022-10-03 18:05:48,"Sydney, Australia",1107,589,3456,False,0,louis jadot charmes chambertin grand cru gtgtg...,"[AD, wine, redwine, whitewine, whisky]",,[red],
3,853779552329502720,We tweet and retweet all things cool and inter...,2022-10-03 18:05:07,,362,191,61865,False,7,deals events make feel larger life httpstcoscc...,"[soulnightevents, party]",,,
4,838972202,"Serial entrepreneur, that's taken two successf...",2022-10-03 18:05:05,"Delray Beach, FL",1029,8772,16697,False,0,become wine influencer six figures per year wi...,"[wine, winelovers, winetasting, wineisfun, fun...",,,


In [8]:
df.to_json('tweets_with_features.json')

In [6]:
print('total number of rows:', len(df))
print(df['brand'].describe())
print()
print(df['color'].describe())
print()
print(df['country'].describe())

total number of rows: 6524
count              548
unique              31
top       [chardonnay]
freq               128
Name: brand, dtype: object

count       441
unique        6
top       [red]
freq        247
Name: color, dtype: object

count           281
unique           13
top       [italian]
freq             60
Name: country, dtype: object


n-grams of: 
- grape
- year
- country
- taste/winetaste

In [45]:
from nltk import ngrams
import collections

# Find all the n-grams in the given dictionary (grams)
# that contain the given word
# Returns a dictionary {(ngram): freq}
def find_words(word, grams):
    res = {}
    for ele in grams:
        if word in ele:
            res[ele] = grams[ele]
    return res

# returns a dictionary with {(ngram): freq}
def extract_ngrams(tweet):
    tokens = word_tokenize(tweet)
    ngram = ngrams(tokens, 2)
    freq = collections.Counter(ngram)
    ngram = ngrams(tokens, 3)
    freq += collections.Counter(ngram)
    
    return freq

ngram_freq = {} # dictionary with {(ngram): freq}

for tweet in df['text']: # extract ngrams for all tweets
    ngrams_found = extract_ngrams(tweet)
    
    for gram in ngrams_found:
        if gram in ngram_freq:
            ngram_freq[gram] += ngrams_found[gram]
        else:
            ngram_freq[gram] = ngrams_found[gram]
            
sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda item: item[1], reverse = True)

In [48]:
# Find all the ngrams that contains the given word
# and prints them in descending frequency order
word = 'grape'
elements = find_words(word, ngram_freq)
sorted_elements = sorted(elements.items(), key=lambda item: item[1], reverse = True)
sorted_elements

[(('wine', 'grape'), 8),
 (('grape', 'varieties'), 7),
 (('grape', 'harvest'), 6),
 (('refers', 'grape'), 5),
 (('grape', 'region'), 5),
 (('term', 'refers', 'grape'), 5),
 (('refers', 'grape', 'region'), 5),
 (('grape', 'wine'), 4),
 (('grape', 'guide'), 4),
 (('wine', 'grape', 'guide'), 4),
 (('grape', 'skin'), 3),
 (('red', 'wine', 'grape'), 3),
 (('grape', 'juice'), 3),
 (('grape', 'vines'), 2),
 (('grape', 'growers'), 2),
 (('grape', 'skins'), 2),
 (('grape', 'say'), 2),
 (('guess', 'grape'), 2),
 (('white', 'grape'), 2),
 (('grape', 'variety'), 2),
 (('grape', 'variety', 'wine'), 2),
 (('grape', 'varietals'), 2),
 (('grape', 'amp'), 2),
 (('grape', 'amp', 'wine'), 2),
 (('grape', 'growing'), 2),
 (('greece', 'grape'), 2),
 (('grape', 'pink'), 2),
 (('peloponnese', 'greece', 'grape'), 2),
 (('greece', 'grape', 'pink'), 2),
 (('grape', 'pink', 'skins'), 2),
 (('separates', 'grape'), 1),
 (('production', 'separates', 'grape'), 1),
 (('separates', 'grape', 'skin'), 1),
 (('grape', 's