# Product Group Deepdive
Previously in the **Interproduct Tweet Analysis** we didn't find any substantial conversion between product groups. In this analysis we will attempt to create monthly topic models for the tweets from each product group. The goal is to study how topics differ between product groups.

In [1]:
import enum
import itertools as it
import os
import pickle
from types import SimpleNamespace

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from wordcloud import WordCloud

# Root of all relative paths: the directory the kernel was started in.
CURRENT_DIR = os.path.abspath(os.curdir)
# Build the paths from separate components so the platform's own separator is
# used (the previous 'Data\\Pickles\\' literal only worked on Windows).
# The trailing '' keeps the trailing separator the original constants had.
DATA_PICKLE_DIR = os.path.join(CURRENT_DIR, 'Data', 'Pickles', '')
ASSETS_DIR = os.path.join(CURRENT_DIR, 'Assets', '')

%matplotlib inline

## Exploring the data through wordclouds
First we will create a word cloud (of onegrams and bigrams) for each month's tweets for each product group. This will give us a good indication of what the major topics will be. We can then use these as starting points to find more topics, and to classify tweets into different topics.

In the **Interproduct Tweet Analysis** we found that almost 80% of the users behind each month's tweets tweet only once (on one of the product groups). This indicates the presence of two groups of users in our dataset (infrequent users and frequent users). Because of the huge number of users who tweeted only once, we will consider their tweets separately from the remaining tweets.

First we need to load our grouped tweets. We will do this product by product, starting with *Vape*.

### Vape

In [2]:
class UserActivityStat(enum.Enum):
    """Activity level of a tweet's author within one month's frame.

    Stored per tweet in the ``UserActivityStat`` column that ``load_data`` adds.
    """
    # Assigned by load_data when the author's UserId does not occur more than
    # once in the month's frame.
    InFrequent = 1
    # Not assigned by load_data; process_ngrams would count such a tweet only
    # in the `total` counter.
    Medium = 2
    # Assigned by load_data when the author's UserId occurs more than once in
    # the month's frame.
    Frequent = 3

def load_data(product_group):
    """Load one product group's pickled monthly data and tag each tweet's author activity.

    Reads ``ProductGroupedDFs-{product_group}.pickle`` from ``DATA_PICKLE_DIR``.
    The unpickled object is iterated; every item must expose a ``df`` DataFrame
    with a ``UserId`` column and a ``date_label`` attribute.

    A ``UserActivityStat`` column is added to each ``df`` in place:
    ``UserActivityStat.Frequent`` when the tweet's ``UserId`` occurs more than
    once in that item's frame, ``UserActivityStat.InFrequent`` otherwise
    (including rows whose ``UserId`` is missing).

    Args:
        product_group: Name used to build the pickle file name, e.g. ``'vape'``.

    Returns:
        The unpickled object, with every ``df`` carrying the new column.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    filename = os.path.join(DATA_PICKLE_DIR, f'ProductGroupedDFs-{product_group}.pickle')
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at pickle files produced by a trusted pipeline.
    with open(filename, 'rb') as file_handle:
        data = pickle.load(file_handle)
        print('File loaded.')

    activity_by_flag = {
        True: UserActivityStat.Frequent,
        False: UserActivityStat.InFrequent,
    }
    # Classify tweet as FromInfrequentUser or not.
    for datum in data:
        user_counts = datum.df.groupby('UserId').size()
        # Vectorised lookup of each tweet's per-user count. This replaces a
        # row-wise apply with one .loc lookup per tweet, and also copes with
        # empty frames and missing UserIds (NaN count -> not "> 1").
        tweets_by_same_user = datum.df['UserId'].map(user_counts)
        datum.df['UserActivityStat'] = (tweets_by_same_user > 1).map(activity_by_flag)
        print(f'Processed {datum.date_label}')
    return data

In [4]:
# Load the pickled monthly vape data; load_data also adds the per-tweet
# UserActivityStat column and prints one "Processed ..." line per month.
vape = load_data('vape')

File loaded.
Processed Apr 2017
Processed May 2017
Processed Jun 2017
Processed Jul 2017
Processed Aug 2017
Processed Sep 2017
Processed Oct 2017
Processed Nov 2017
Processed Dec 2017
Processed Jan 2018
Processed Feb 2018
Processed Mar 2018


We now need to find the counts for the onegrams and bigrams in the tweets.

In [3]:
def _count_ngram(table, ngram, activity):
    """Bump the counters of `ngram` in `table`, creating the entry on first sight."""
    counter = table.get(ngram)
    if counter is None:
        counter = table[ngram] = SimpleNamespace(frequent=0, infrequent=0, total=0)
    if activity == UserActivityStat.Frequent:
        counter.frequent += 1
    elif activity == UserActivityStat.InFrequent:
        counter.infrequent += 1
    # `total` is incremented for every occurrence, whatever the activity level.
    counter.total += 1


def process_ngrams(data):
    """Count onegrams and bigrams for every item in `data`.

    For each item, every row of ``datum.df`` is visited and the elements of its
    ``NormalizedText`` are iterated with NLTK's English stopwords removed.
    (Presumably ``NormalizedText`` is a list of word tokens; verify upstream.)
    Bigrams are formed from consecutive *kept* elements, so they can bridge a
    removed stopword, and are keyed as ``'<first>-<second>'``.

    Each n-gram maps to a ``SimpleNamespace(frequent, infrequent, total)`` whose
    counters are split by the row's ``UserActivityStat``.

    The two dictionaries are attached to the item as ``datum.onegrams`` and
    ``datum.bigrams``; the same `data` object is returned.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    for datum in data:
        unigram_counts = {}
        bigram_counts = {}
        for tweet in datum.df.itertuples():
            kept_tokens = (tok for tok in tweet.NormalizedText if tok not in english_stopwords)
            previous = None
            for token in kept_tokens:
                _count_ngram(unigram_counts, token, tweet.UserActivityStat)
                # The first kept token of a tweet has no predecessor to pair with.
                if previous is not None:
                    _count_ngram(bigram_counts, f'{previous}-{token}', tweet.UserActivityStat)
                previous = token
        datum.onegrams = unigram_counts
        datum.bigrams = bigram_counts
    return data

In [None]:
# process_ngrams attaches `onegrams` / `bigrams` to each monthly item in place
# and returns the same object it was given, so this rebinds `vape` to itself.
vape = process_ngrams(vape)

Finally we can use the ngrams to create the wordclouds.

In [4]:
def create_wordclouds(data, product_name, w=800, h=600):
    """Render and save wordcloud PNGs for every item in `data`.

    For each item, one image is written per n-gram kind (``datum.onegrams``,
    ``datum.bigrams``) and per counter (``frequent``, ``infrequent``, ``total``)
    to::

        ASSETS_DIR/WordClouds/<product_name>/<Onegrams|Bigrams>/<Frequent|InFrequent|Total>/<date_label>.png

    Args:
        data: Iterable of items exposing ``onegrams``, ``bigrams`` (dicts whose
            values have ``frequent`` / ``infrequent`` / ``total`` counters) and
            ``date_label``.
        product_name: Folder name for this product group.
        w: Image width passed to ``WordCloud``.
        h: Image height passed to ``WordCloud``.

    N-grams whose selected counter is zero are left out, and a cloud with no
    remaining n-grams is skipped with a message instead of failing inside
    ``WordCloud``.
    """
    # (attribute on datum, output folder name)
    ngram_kinds = (('onegrams', 'Onegrams'), ('bigrams', 'Bigrams'))
    # (counter attribute on each n-gram entry, output subfolder name)
    counter_kinds = (('frequent', 'Frequent'), ('infrequent', 'InFrequent'), ('total', 'Total'))

    # Create every leaf folder up front. exist_ok makes this safe to re-run and
    # repairs a partially existing tree (the previous code only created the
    # subfolders when the parent folder was missing).
    save_folders = {}
    for _, ngram_folder in ngram_kinds:
        for _, counter_folder in counter_kinds:
            folder = os.path.join(ASSETS_DIR, 'WordClouds', product_name, ngram_folder, counter_folder)
            os.makedirs(folder, exist_ok=True)
            save_folders[ngram_folder, counter_folder] = folder

    for datum in data:
        for ngram_attr, ngram_folder in ngram_kinds:
            ngram_counts = getattr(datum, ngram_attr)
            for counter_attr, counter_folder in counter_kinds:
                frequencies = {}
                for key, val in ngram_counts.items():
                    count = getattr(val, counter_attr)
                    if count > 0:
                        frequencies[key] = count
                if not frequencies:
                    # Nothing to draw, e.g. a month with no tweets of this kind.
                    print(f'Skipped empty {ngram_folder}/{counter_folder} wordcloud for {datum.date_label}.')
                    continue
                wc = WordCloud(width=w, height=h).generate_from_frequencies(frequencies)
                wc.to_file(os.path.join(save_folders[ngram_folder, counter_folder], f'{datum.date_label}.png'))
        print(f'Generated wordclouds for {datum.date_label}.')

In [None]:
# Writes the vape wordcloud PNGs under ASSETS_DIR; requires the previous cell
# to have attached `onegrams` / `bigrams` to each item of `vape`.
create_wordclouds(vape, 'vape')

### Cigarette

In [5]:
# Same three steps as for vape, run in a single cell.
cigarette = load_data('cigarette')
# process_ngrams attaches the counts to each item in place, so its return
# value (the same object) is not needed here.
process_ngrams(cigarette)
create_wordclouds(cigarette, 'cigarette')

File loaded.
Processed Apr 2017
Processed May 2017
Processed Jun 2017
Processed Jul 2017
Processed Aug 2017
Processed Sep 2017
Processed Oct 2017
Processed Nov 2017
Processed Dec 2017
Processed Jan 2018
Processed Feb 2018
Processed Mar 2018
Generated wordclouds for Apr 2017.
Generated wordclouds for May 2017.
Generated wordclouds for Jun 2017.
Generated wordclouds for Jul 2017.
Generated wordclouds for Aug 2017.
Generated wordclouds for Sep 2017.
Generated wordclouds for Oct 2017.
Generated wordclouds for Nov 2017.
Generated wordclouds for Dec 2017.
Generated wordclouds for Jan 2018.
Generated wordclouds for Feb 2018.
Generated wordclouds for Mar 2018.


### Hookah

In [12]:
# Only the load / user-activity tagging step runs for hookah; the n-gram and
# wordcloud steps below are commented out, so this cell writes no images.
hookah = load_data('hookah')
# process_ngrams(hookah)
# create_wordclouds(hookah, 'hookah')

File loaded.
Processed Apr 2017
Processed May 2017
Processed Jun 2017
Processed Jul 2017
Processed Aug 2017
Processed Sep 2017
Processed Oct 2017
Processed Nov 2017
Processed Dec 2017
Processed Jan 2018
Processed Feb 2018
Processed Mar 2018


### Swisher

In [11]:
# Only the load / user-activity tagging step runs for swisher; the n-gram and
# wordcloud steps below are commented out, so this cell writes no images.
swisher = load_data('swisher')
# process_ngrams(swisher)
# create_wordclouds(swisher, 'swisher')

File loaded.
Processed Apr 2017
Processed May 2017
Processed Jun 2017
Processed Jul 2017
Processed Aug 2017
Processed Sep 2017
Processed Oct 2017
Processed Nov 2017
Processed Dec 2017
Processed Jan 2018
Processed Feb 2018
Processed Mar 2018


### Temp

In [24]:
# Stack the UserId column of every monthly hookah frame into one Series
# (duplicates are kept: one entry per tweet).
total = pd.concat([datum.df.UserId for datum in hookah])

In [25]:
# Shape of the array of distinct UserIds, i.e. how many different users
# tweeted about hookah across all loaded months.
total.unique().shape

(154245,)