In [4]:
import pandas as pd
from collections import Counter

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import nltk

import math
from scipy.stats import zscore

import matplotlib.pyplot as plt


In [5]:
# Company identifiers grouped by sector. Each name is later substituted into
# "../data/noNoise/{}_cleaned_noNoise.pkl" to locate that company's data file,
# so the spelling here must match the file names exactly.
# NOTE(review): these look like Instagram account handles - confirm.

# Restaurants / food venues.
food_companies = ["veganjunkfoodbar", "pastaebasta_amsterdam", "mamakellyamsterdam", "watsonsfood",
"cannibaleroyale", "parkheuvel", "restaurantfred", "hugh_rotterdam", "oldscuola", "restaurantkite",
"wturbankitchen", "thestreetfoodclub", "rumclubutrecht", "lejardinutrecht", "broei.utrecht"]

# Furniture / home-related retailers.
furniture_companies = ["madedotcom", "vtwonen", "hemanederland", "loods5", "ikeanederland", "homify", "westwingnl", 
"karwei", "kwantum_nederland", "xenos_nl", "homedeco", "bol_com", "leenbakker", "wonenmetlef", "_connox_",
"interiorjunkiecom", "jysknl", "wehkamp", "fonqnl", "konforhome", "basiclabel.nl", "blokker",
"deensnl", "hastensbeds", "eijerkamp", "goossenswonenenslapen", "furn.nl", "stoermetaal", "roomednl", "misterdesignnl",
"dekbeddiscounter", "woonexpress", "zitmaxx", "pronto_wonen", "designbestseller", "barbecueshop.nl",
"flinders.design", "trendhopper", "debommelmeubelen", "otto_nl", "praxis_bouwmarkt", "gamma_nl",
"pietklerkx.nl", "swisssense", "montelwonen", "aupingnl", "hacowonenenslapen", "emma_matras", "hornbachnl",
"lampenlicht.nl", "profijtmeubel", "bianonl", "woonboulevardpoortvliet", "morreswonen", "hubo_nl", "beter_bed",
"hoogenboezem.meubelen", "villajipp_outlet", "vidaxl_nl", "mline_nl"]

# Sports retailers.
sport_companies = ["plutosport.nl", "voetbalshopnl", "all4runningstore", "voetbaldirect",
"dakasport", "hockeydirect.nl", "tennisdirect", "intersportnl", "aktiesport.nl", "sport2000nederland",
"soccerfanshop", "jdsportsnl", "decathlonnederland", "gorillasportsnl", "perrysport.nl"]

# Full list in sector order (food, furniture, sport); this is the default
# set of companies processed by the cells below.
company_names = food_companies + furniture_companies + sport_companies

# STEP 4: Creating a dataframe which shows a vector space per company follower profile

## 4.1 Function which extracts the TF of hashtags or text for one company follower profile
This function has three settings: `user_input`, `hashtag_postLevel` and `hashtag_userLevel` (passed in from `main` as `data_input`, `post_level` and `user_level`). `user_input` determines whether only the hashtags, only the written text, or a combination of both is extracted. The post level and user level determine whether absolute or relative frequencies are used at each of those levels.

In [6]:
def create_count_object(company_name, user_input, hashtag_postLevel, hashtag_userLevel):
    """Compute the term frequency (TF) of hashtags and/or words for one company.

    Reads ``../data/noNoise/<company_name>_cleaned_noNoise.pkl`` and reduces the
    posts in it to a single ``{term: weight}`` dictionary.

    Parameters
    ----------
    company_name : str
        Name substituted into the pickle path.
    user_input : {'hashtag', 'text', 'combined'}
        Which tokens to count: the ``hashtags`` column, the whitespace-split
        ``cleaned_text`` column, or both concatenated per post.
    hashtag_postLevel : str
        ``'relative'`` divides each term count by the number of tokens in its
        post; any other value keeps the raw per-post counts.
    hashtag_userLevel : {'relative', 'absolute'}
        ``'absolute'`` treats every non-empty post as one observation.
        ``'relative'`` first averages the posts of each ``username`` (over that
        user's non-empty posts), so every user counts as one observation.

    Returns
    -------
    dict
        Term -> summed weight divided by the number of observations (non-empty
        posts or users with at least one non-empty post). Empty when there are
        no observations.

    Raises
    ------
    ValueError
        If ``user_input`` or ``hashtag_userLevel`` is not a supported value.
        (Previously an unknown user level silently produced an empty result.)
    """
    valid_inputs = ('hashtag', 'text', 'combined')
    if user_input not in valid_inputs:
        raise ValueError("user_input must be one of {}, got {!r}".format(valid_inputs, user_input))
    valid_user_levels = ('relative', 'absolute')
    if hashtag_userLevel not in valid_user_levels:
        raise ValueError("hashtag_userLevel must be one of {}, got {!r}".format(valid_user_levels, hashtag_userLevel))

    df = pd.read_pickle("../data/noNoise/{}_cleaned_noNoise.pkl".format(company_name))

    # Build the token list of every post for the requested input type.
    if user_input == 'hashtag':
        posts = list(df['hashtags'])
    elif user_input == 'text':
        posts = [text.split() for text in df['cleaned_text']]
    else:
        posts = [tags + text.split() for tags, text in zip(df['hashtags'], df['cleaned_text'])]

    # Per-post term counts; posts without tokens become an empty dict.
    post_occurrences = []
    for tokens in posts:
        counts = dict(Counter(tokens))
        if counts and hashtag_postLevel == 'relative':
            n_tokens = len(tokens)
            counts = {term: count / n_tokens for term, count in counts.items()}
        post_occurrences.append(counts)

    if hashtag_userLevel == 'absolute':
        # Every non-empty post is one observation.
        occurrence_buffer = [occurrence for occurrence in post_occurrences if occurrence]
    else:
        # Single pass over the posts instead of re-filtering the frame per user.
        user_sums = {}
        user_post_counts = {}
        for username, occurrence in zip(df['username'], post_occurrences):
            # Missing usernames never matched the old equality filter, so they
            # stay excluded here as well.
            if not occurrence or pd.isna(username):
                continue
            if username not in user_sums:
                user_sums[username] = Counter()
                user_post_counts[username] = 0
            user_sums[username].update(occurrence)
            user_post_counts[username] += 1
        # Average each user's summed weights over that user's non-empty posts.
        occurrence_buffer = [
            {term: weight / user_post_counts[username] for term, weight in sums.items()}
            for username, sums in user_sums.items()
        ]

    # Sum the weights over all observations and normalise by their number.
    totals = Counter()
    for occurrence in occurrence_buffer:
        totals.update(occurrence)

    n_observations = len(occurrence_buffer)
    print("{} - {}".format(company_name, n_observations))
    # `totals` is empty whenever n_observations is 0, so no division by zero.
    return {term: weight / n_observations for term, weight in totals.items()}

## 4.2 Function which creates the Inverse Document Frequency for words
The Inverse Document Frequency (IDF) can be used to calculate a TF-IDF score for a company follower profile. It penalizes words that occur in every company follower profile.

In [7]:
def computeIDF(documents):
    """Compute the inverse document frequency of every word in ``documents``.

    Parameters
    ----------
    documents : list of dict
        One ``{word: value}`` mapping per document. A word counts as present
        in a document when its value is greater than zero.

    Returns
    -------
    dict
        ``{word: log(N / df)}`` where ``N`` is the number of documents and
        ``df`` the number of documents containing the word. Words that are
        present in no document get ``0.0``. An empty ``documents`` list yields
        an empty dict.
    """
    N = len(documents)

    # Collect the vocabulary from ALL documents, not just the first one, so a
    # word missing from documents[0] no longer raises KeyError.
    idfDict = {}
    for document in documents:
        for word, val in document.items():
            present = 1 if val > 0 else 0
            idfDict[word] = idfDict.get(word, 0) + present

    for word, doc_freq in idfDict.items():
        # A word with zero document frequency would divide by zero; its TF is
        # zero everywhere, so a neutral 0.0 weight is used instead.
        idfDict[word] = math.log(N / float(doc_freq)) if doc_freq else 0.0
    return idfDict

In [8]:
def computeTFIDF(tfBagOfWords, idfs):
    """Weight every term frequency by the IDF of the same word.

    Returns a new ``{word: tf * idf}`` dict with one entry per word in
    ``tfBagOfWords``. A word without an entry in ``idfs`` raises ``KeyError``.
    """
    return {term: frequency * idfs[term] for term, frequency in tfBagOfWords.items()}

## 4.3 Looping through all companies
Here we define the settings of the company profiles we want to create. This drastically changes the outcome of the vector space. Thereafter we transform the values into a pandas dataframe

In [9]:
def main(data_input, post_level, user_level, idf_penalty, company_names=company_names):
    """Build the company-by-term vector space as a DataFrame.

    For every company the TF values are computed with ``create_count_object``.
    When ``idf_penalty == 'yes'`` they are additionally multiplied by IDF
    weights derived from the stored absolute-count vector space for
    ``data_input``. Rows are companies, columns are terms, and terms a company
    never used are filled with 0.
    """
    use_idf = idf_penalty == 'yes'
    if use_idf:
        # IDF weights come from the previously stored absolute-count pickle.
        absolute_counts = pd.read_pickle('insta_followerprofile/vector_spaces/vector_{}_ABSOLUTECOUNT.pkl'.format(data_input))
        idfs = computeIDF([absolute_counts.loc[name].to_dict() for name in company_names])

    profiles = {}
    for name in company_names:
        term_frequencies = create_count_object(name, data_input, post_level, user_level)
        # Apply the IDF penalty only when requested; otherwise keep the plain TF.
        profiles[name] = computeTFIDF(term_frequencies, idfs) if use_idf else term_frequencies

    # One row per company; missing terms become 0.
    return pd.DataFrame(profiles).transpose().fillna(0)

In [10]:
# Settings for this run: hashtags plus text, absolute counts, no IDF penalty.
data_input, post_level, user_level = 'combined', 'absolute', 'absolute'
IDF_penalty = 'no'

#4.4 Store the vector space
# The file-name suffix marks runs that had the IDF penalty applied.
idf = '-TFIDF' if IDF_penalty == 'yes' else ''

vector_space = main(data_input, post_level, user_level, IDF_penalty)
output_name = 'vector_spaces/vector_{}_P{}_U{}{}.pkl'.format(data_input, post_level, user_level, idf)
vector_space.to_pickle(output_name)

veganjunkfoodbar - 95966
pastaebasta_amsterdam - 36970


KeyboardInterrupt: 

In [11]:
# Settings for this run: hashtags only, absolute counts, with the IDF penalty.
data_input, post_level, user_level = 'hashtag', 'absolute', 'absolute'
IDF_penalty = 'yes'

#4.4 Store the vector space
# The file-name suffix marks runs that had the IDF penalty applied.
idf = '-TFIDF' if IDF_penalty == 'yes' else ''

vector_space = main(data_input, post_level, user_level, IDF_penalty)
vector_space

veganjunkfoodbar - 57959
pastaebasta_amsterdam - 25470
mamakellyamsterdam - 68213
watsonsfood - 67234
cannibaleroyale - 56650
parkheuvel - 50683
restaurantfred - 67220
hugh_rotterdam - 45796
oldscuola - 36688
restaurantkite - 33740
wturbankitchen - 27626
thestreetfoodclub - 69674
rumclubutrecht - 64767
lejardinutrecht - 40751
broei.utrecht - 49551
madedotcom - 48215
vtwonen - 24411
hemanederland - 37501
loods5 - 29191
ikeanederland - 29162
homify - 51441
westwingnl - 29303
karwei - 37726
kwantum_nederland - 35301
xenos_nl - 40636
homedeco - 34408
bol_com - 13553
leenbakker - 37023
wonenmetlef - 39871
_connox_ - 60192
interiorjunkiecom - 66441
jysknl - 45749
wehkamp - 52705
fonqnl - 40282
konforhome - 21222
basiclabel.nl - 58430
blokker - 36516
deensnl - 70025
hastensbeds - 39837
eijerkamp - 49350
goossenswonenenslapen - 46243
furn.nl - 50879
stoermetaal - 79748
roomednl - 74293
misterdesignnl - 76808
dekbeddiscounter - 27976
woonexpress - 40169
zitmaxx - 50387
pronto_wonen - 34380
desi

Unnamed: 0,Unnamed: 1,diorchis,.,..,...,....,.....,......,.......,........,....1,𝚝𝚋𝚝,𝚟𝚊𝚔𝚊𝚗𝚝𝚒𝚎,𝚠𝚊𝚕𝚔𝚒𝚗𝚐,𝚠𝚎𝚍𝚗𝚎𝚜𝚍𝚊𝚢𝚠𝚒𝚜𝚍𝚘𝚖,𝚠𝚎𝚎𝚔𝚎𝚗𝚍,𝚠𝚎𝚛𝚎𝚕𝚍𝚖𝚎𝚒𝚜𝚓𝚎𝚜𝚍𝚊𝚐,𝚠𝚒𝚗𝚎,𝚠𝚒𝚗𝚝𝚎𝚛,𝚠𝚘𝚗𝚍𝚎𝚛𝚏𝚞𝚕,𝚠𝚘𝚛𝚔
veganjunkfoodbar,0.0,0.00000,0.000277,0.000000,0.000000,0.000000,0.000040,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
pastaebasta_amsterdam,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
mamakellyamsterdam,0.0,0.00000,0.000042,0.000000,0.000014,0.000022,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
watsonsfood,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
cannibaleroyale,0.0,0.00000,0.000000,0.000000,0.000017,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
parkheuvel,0.0,0.00000,0.000000,0.000000,0.000019,0.000000,0.000000,0.000000,0.000043,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
restaurantfred,0.0,0.00000,0.000000,0.000000,0.000029,0.000000,0.000000,0.000000,0.000033,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
hugh_rotterdam,0.0,0.00000,0.000021,0.000000,0.000043,0.000000,0.000000,0.000000,0.000048,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
oldscuola,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0
restaurantkite,0.0,0.00000,0.000028,0.000068,0.000000,0.000000,0.000068,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0


## 4.4 Finally store the vector space
Finally, we store the vector space in the vector-spaces folder. The file name is built so that the settings used to create the vector space can be derived from it.

## Extra: 4.5 Calculate absolute count for a company
This is required if we want to apply the idf penalty

In [None]:
%%time

# Absolute hashtag counts per company: {company: {hashtag: count}}.
total_collection = {}

for company in company_names:
    df = pd.read_pickle("../data/noNoise/{}_cleaned_noNoise.pkl".format(company))
    # Tally every hashtag over all posts of this company; posts without
    # hashtags simply contribute nothing.
    total_collection[company] = dict(Counter(
        hashtag for post_hashtags in df['hashtags'] for hashtag in post_hashtags
    ))

#Create the dataframe
# One row per company, one column per hashtag; unused hashtags become 0.
df_vectorspace = pd.DataFrame(total_collection).transpose().fillna(0)

In [None]:
data_input = ''

# The file name reuses post_level, user_level and idf as left behind by an
# earlier settings cell, so that cell must have been run first.
# NOTE(review): main() loads 'vector_{}_ABSOLUTECOUNT.pkl' from
# 'insta_followerprofile/vector_spaces/', which does not match the name or
# directory written here - confirm which path is intended.
absolute_count_path = '../data/vector_spaces/vector_{}_P{}_U{}{}.pkl'.format(data_input, post_level, user_level, idf)
df_vectorspace.to_pickle(absolute_count_path)
df_vectorspace