In [1]:
# imports
import numpy as np
import pandas as pd

In [2]:
df = pd.read_json("clean_tweets.json")
print('dataframe length:', len(df))
df.columns

dataframe length: 6524


Index(['username', 'description', 'date', 'location', 'following', 'followers',
       'totaltweets', 'retweet', 'retweetcount', 'text', 'hashtags'],
      dtype='object')

In [4]:
from nltk.tokenize import word_tokenize

def search_words(tweet, l):
    tokens = word_tokenize(tweet)
    res = []
    
    for w in l:
        if w in tokens:
            res.append(w)
            
    if len(res) == 0:
        return None
    return res

In [5]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer() 

brand = ['champagne', 'chardonnay', 'pinot', 'cabernet', 'noir', 'merlot', 'airen', 'sauvignon', 'tempranillo', 'syrah', 'garnacha', 'trebbiano']
color = ['red', 'white', 'rose', 'sparkling', 'redwine', 'whitewine', 'rosewine', 'sparklingwine']
country = ['italy', 'italian', 'spain', 'spanish', 'france', 'french', 'chile', 'chilean', 'australia', 'australian']

stem_brand = [stemmer.stem(w) for w in brand]
stem_color = [stemmer.stem(w) for w in color]
stem_country = [stemmer.stem(w) for w in country]

In [6]:
brand = []
color = []
country = []

for tweet in df['text']:
    brand.append(search_words(tweet, stem_brand))
    color.append(search_words(tweet, stem_color))
    country.append(search_words(tweet, stem_country))

df['brand'] = brand
df['color'] = color
df['country'] = country
    
df.head()

Unnamed: 0,username,description,date,location,following,followers,totaltweets,retweet,retweetcount,text,hashtags,brand,color,country
0,1267886436080107521,Get your store up: https://t.co/qf78jAU5eo,2022-10-03 18:06:57,United States,15,41,6223,False,0,repurposedupcycled green wine bottle base glas...,"[Repurposedupcycled, Glasses, set, Bottle, win...",,,
1,406344750,"The Best Tasting Wine Shop, a retail store wit...",2022-10-03 18:05:51,302 Pier Avenue,1939,995,2839,False,0,thanks much linda uncorked uncorkedwineshops w...,"[Uncorked, UncorkedWineShops, wineshop, shoplo...",,,
2,83460300,Love wine... Love champagne... Love life...,2022-10-03 18:05:48,"Sydney, Australia",1107,589,3456,False,0,louis jadot charmes chambertin grand cru gtgtg...,"[AD, wine, redwine, whitewine, whisky]",,[red],
3,853779552329502720,We tweet and retweet all things cool and inter...,2022-10-03 18:05:07,,362,191,61865,False,7,deals events make feel larger life httpstcoscc...,"[soulnightevents, party]",,,
4,838972202,"Serial entrepreneur, that's taken two successf...",2022-10-03 18:05:05,"Delray Beach, FL",1029,8772,16697,False,0,become wine influencer six figures per year wi...,"[wine, winelovers, winetasting, wineisfun, fun...",,,


In [8]:
df.to_json('tweets_with_features.json')

In [7]:
print('total number of rows:', len(df))
print(df['brand'].describe())
print()
print(df['color'].describe())
print()
print(df['country'].describe())

total number of rows: 6524
count              548
unique              31
top       [chardonnay]
freq               128
Name: brand, dtype: object

count       441
unique        6
top       [red]
freq        247
Name: color, dtype: object

count           281
unique           13
top       [italian]
freq             60
Name: country, dtype: object


n-grams of: 
- grape
- year
- country
- taste/winetaste

In [8]:
from nltk import ngrams
import collections

# Find all the n-grams in the given dictionary (grams)
# that contain the given word
# Returns a dictionary {(ngram): freq}
def find_words(word, grams):
    res = {}
    for ele in grams:
        if word in ele:
            res[ele] = grams[ele]
    return res

# returns a dictionary with {(ngram): freq}
def extract_ngrams(tweet):
    tokens = word_tokenize(tweet)
    ngram = ngrams(tokens, 2)
    freq = collections.Counter(ngram)
    ngram = ngrams(tokens, 3)
    freq += collections.Counter(ngram)
    
    return freq

ngram_freq = {} # dictionary with {(ngram): freq}

for tweet in df['text']: # extract ngrams for all tweets
    ngrams_found = extract_ngrams(tweet)
    
    for gram in ngrams_found:
        if gram in ngram_freq:
            ngram_freq[gram] += ngrams_found[gram]
        else:
            ngram_freq[gram] = ngrams_found[gram]
            
sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda item: item[1], reverse = True)

In [88]:
# Find all the ngrams that contains the given word
# and prints them in descending frequency order
word = 'country'
elements = find_words(word, ngram_freq)
sorted_elements = sorted(elements.items(), key=lambda item: item[1], reverse = True)
sorted_elements

[(('wine', 'country'), 132),
 (('country', 'layered'), 53),
 (('country', 'scene'), 53),
 (('scene', 'wine', 'country'), 53),
 (('wine', 'country', 'layered'), 53),
 (('country', 'layered', 'art'), 53),
 (('mandala', 'wine', 'country'), 53),
 (('wine', 'country', 'scene'), 53),
 (('country', 'scene', 'wine'), 53),
 (('niagara', 'wine', 'country'), 10),
 (('country', 'high'), 9),
 (('wine', 'country', 'high'), 9),
 (('country', 'high', 'c'), 9),
 (('basque', 'country'), 2),
 (('country', 'barcelona'), 2),
 (('rioja', 'basque', 'country'), 2),
 (('basque', 'country', 'barcelona'), 2),
 (('country', 'barcelona', 'upcoming'), 2),
 (('around', 'country'), 2),
 (('celebritycruises', 'country'), 1),
 (('country', 'dear'), 1),
 (('via', 'celebritycruises', 'country'), 1),
 (('celebritycruises', 'country', 'dear'), 1),
 (('country', 'dear', 'us'), 1),
 (('country', 'find'), 1),
 (('wineart', 'wine', 'country'), 1),
 (('wine', 'country', 'find'), 1),
 (('country', 'find', 'art'), 1),
 (('country

In [71]:
import nltk
import spacy
 
# essential entity models downloads
nltk.downloader.download('maxent_ne_chunker')
nltk.downloader.download('words')
nltk.downloader.download('treebank')
nltk.downloader.download('maxent_treebank_pos_tagger')
nltk.downloader.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/aina/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /home/aina/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package treebank to /home/aina/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package maxent_treebank_pos_tagger to
[nltk_data]     /home/aina/nltk_data...
[nltk_data]   Unzipping taggers/maxent_treebank_pos_tagger.zip.
[nltk_data] Downloading package punkt to /home/aina/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/aina/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [74]:
!python -m spacy download en_core_web_sm

2022-10-07 13:55:05.215668: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-07 13:55:05.215704: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-07 13:55:07.084893: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-10-07 13:55:07.084925: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-10-07 13:55:07.084949: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (aina-X555LJ): /proc/driver/nvidia/version does not exist
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com

In [81]:
# Extract country and city from each location

location_country = []
location_city = []

for location in df['location']:
    if location != '':
        try:
            place_entity = locationtagger.find_locations(text = location)
            location_country.append(place_entity.countries)
            location_city.append(place_entity.cities)
        except:
            print(location)
            location_country.append(None)
            location_city.append(None)
    else:
        location_country.append(None)
        location_city.append(None)
                
df['location_country'] = location_country
df['location_city'] = location_city

Harare, Zimbabwe


In [83]:
# Extract city from location

location_city = []

for location in df['location']:
    if location != '':
        try:
            place_entity = locationtagger.find_locations(text = location)
            location_city.append(place_entity.cities)
        except:
            print(location)
            location_city.append(None)
    else:
        location_city.append(None)
                
df['location_city'] = location_city

Harare, Zimbabwe


In [87]:
df.to_json('tweets_with_features.json')