In [213]:
import numpy as np 
import pandas as pd
import string
import pickle

# Used for text preprocessing/nlp
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import regexp_tokenize

import langid # !pip install langid
import nltk
import re


#!pip install emoji --upgrade
# nltk.download('stopwords')
# nltk.download('punkt')

# Used to disable printing warnings 
import warnings
warnings.filterwarnings("ignore")

In [61]:
def read_pickle(filename):
    with open(filename+".pkl", 'rb') as read_file:
        obj = pickle.load(read_file)
        return obj

In [62]:
df = read_pickle('influencers_data')
df

Unnamed: 0_level_0,biography,captions
username,Unnamed: 1_level_1,Unnamed: 2_level_1
ladyandpups,Our cookbook - THE ART OF ESCAPISM COOKING - I...,[Thank you guys for tuning in on my Instagram ...
nelrestaurant,"Unique dining experience full of surprises, tu...",[Planning a Christmas function? 🎅.\n.\nBe it b...
cookrepublic,Award Winning PHOTOGRAPHER/ BLOGGER/ STYLIST/ ...,[It’s too hot for soup! But nourishing soups a...
annabarnettcooks,COOK | AUTHOR | EVENING STANDARD CONTRIBUTOR |...,[{Collab} As always Ted was star of the show o...
dennistheprescott,"I cook, I photograph, I write. All homemade. M...",[Monday Drip. 🍪🥛💦 (recipe in my cookbook) Have...
eatsmorefun,All 📷 are ours\n📍Currently Featuring: Singapor...,[Black Pepper Cream Crab Linguini \n________\n...
igbrunchclub,Use our hashtag #IGBrunchClub for a chance to ...,[Coffee and cake for breakfast @jacobtheangell...
stevehansenimages,Photographer | Director | Retoucher | CG Artis...,"[Yes, it is national mushroom day.\n-\n#foodph..."
cookieandkate,Celebrating whole foods! Find my vegetarian re...,"[Today, I’m rounding up thirteen of my pumpkin..."


In [77]:
captions_data = df.loc[:,['captions']]

In [78]:
bio_data = df.loc[:,['biography']]

In [79]:
hashtags_data = pd.DataFrame('', index=df.index, columns=['hashtags'])

In [210]:
emojis_data = pd.DataFrame('', index=df.index, columns=['emojis'])

In [211]:
emojis_data

Unnamed: 0_level_0,emojis
username,Unnamed: 1_level_1
ladyandpups,
nelrestaurant,
cookrepublic,
annabarnettcooks,
dennistheprescott,
eatsmorefun,
igbrunchclub,
stevehansenimages,
cookieandkate,


In [81]:
for i in range(captions_data.shape[0]):
    captions_data['captions'][i] = ''.join(captions_data['captions'][i])

#### 2.1 Extract hashtags from captions as a feature 

In [82]:
def extract_hashtags(text):
    '''
    a function for extracting hashtags text from the punctuation
    '''
    hashtags = re.findall(r"#(\w+)", text)
    return hashtags

hashtags_data['hashtags'] = captions_data['captions'].apply(extract_hashtags)
hashtags_data

Unnamed: 0_level_0,hashtags
username,Unnamed: 1_level_1
ladyandpups,"[ladyandpupscookbook, theartofescapismcooking,..."
nelrestaurant,[]
cookrepublic,"[thebakefeed, feedfeedbaking, applerecipes, ap..."
annabarnettcooks,"[HouseOfMolteni, MyVictorinoxTake, Ryvitafibre..."
dennistheprescott,"[TraegerCulinaryWeekend, TraegerCulinaryCase, ..."
eatsmorefun,[]
igbrunchclub,"[igbrunchclub, igbrunchclubrecommends, Circolo..."
stevehansenimages,"[foodphotography, motion, instafood, foodstagr..."
cookieandkate,"[cookieandkate, pumpkin, fall, vegetarian, fee..."


#### 2.1 Extract Emojis from captions as a feature 

In [240]:
def extract_emojis(text):
    '''
    a function for extracting Emojis from the captions text
    '''
    emojis = re.findall(r"#(\w+)", text)
    em = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F9FF'|'\u2600-\u26FF\u2700-\u27BF']"
    emojis = regexp_tokenize(text, em)    
    return emojis

emojis_data['emojis'] = captions_data['captions'].apply(extract_emojis)
emojis_data


0x1f373


In [215]:
def remove_hashtags(text):
    text = " ".join(filter(lambda x:x[0]!='#', text.split()))
    return text
captions_data['captions'] = captions_data['captions'].apply(remove_hashtags)
captions_data

Unnamed: 0_level_0,captions
username,Unnamed: 1_level_1
ladyandpups,Thank you guys for tuning in on my Instagram L...
nelrestaurant,Planning a Christmas function 🎅 Be it big or s...
cookrepublic,It’s too hot for soup But nourishing soups are...
annabarnettcooks,Collab As always Ted was star of the show on a...
dennistheprescott,Monday Drip 🍪🥛💦 recipe in my cookbook Have the...
eatsmorefun,Black Pepper Cream Crab Linguini Fremantle Sea...
igbrunchclub,Coffee and cake for breakfast jacobtheangellon...
stevehansenimages,Yes it is national mushroom day If it’s not de...
cookieandkate,Today I’m rounding up thirteen of my pumpkin r...


#### 2.2 Remove hashtags from the captions text

#### 2.3 Remove punctuation / Emojis

In [216]:
def remove_punctuation(text):
    '''
    a function for removing punctuation
    '''
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)#[i.translate(translator) for i in text]



In [217]:
captions_data['captions'] = captions_data['captions'].apply(remove_punctuation)
captions_data

Unnamed: 0_level_0,captions
username,Unnamed: 1_level_1
ladyandpups,Thank you guys for tuning in on my Instagram L...
nelrestaurant,Planning a Christmas function 🎅 Be it big or s...
cookrepublic,It’s too hot for soup But nourishing soups are...
annabarnettcooks,Collab As always Ted was star of the show on a...
dennistheprescott,Monday Drip 🍪🥛💦 recipe in my cookbook Have the...
eatsmorefun,Black Pepper Cream Crab Linguini Fremantle Sea...
igbrunchclub,Coffee and cake for breakfast jacobtheangellon...
stevehansenimages,Yes it is national mushroom day If it’s not de...
cookieandkate,Today I’m rounding up thirteen of my pumpkin r...


In [218]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

NameError: name 'data_clean' is not defined

In [219]:
bio_data['biography'] = bio_data['biography'].apply(remove_punctuation)
bio_data

Unnamed: 0_level_0,biography
username,Unnamed: 1_level_1
ladyandpups,Our cookbook THE ART OF ESCAPISM COOKING IS ...
nelrestaurant,Unique dining experience full of surprises tuc...
cookrepublic,Award Winning PHOTOGRAPHER BLOGGER STYLIST DES...
annabarnettcooks,COOK AUTHOR EVENING STANDARD CONTRIBUTOR FO...
dennistheprescott,I cook I photograph I write All homemade My co...
eatsmorefun,All 📷 are ours\n📍Currently Featuring Singapore 🇸🇬
igbrunchclub,Use our hashtag IGBrunchClub for a chance to b...
stevehansenimages,Photographer Director Retoucher CG Artist\n...
cookieandkate,Celebrating whole foods Find my vegetarian rec...


In [220]:
em = re.findall('[^\w\s]',captions_data['captions'][0])
# type(re.findall(r'[^\w\s,]', captions_data['captions'][0]))
# def extract_emojis(sentence):
#     return [word for word in sentence.split() if str(word.encode('unicode-escape'))[2] == '\\' ]

# extract_emojis(captions_data['captions'][0])

In [221]:
# Remove Emojis id lan
captions_data["captions"] = captions_data['captions'].str.replace('[^\w\s]','') #remove emojis
bio_data['biography'] = bio_data['biography'].str.replace('[^\w\s]','') #remove emojis
# def identify_language(row):
#     lang = langid.classify(row['captions'])
#     return lang[0]

# dataText['language'] = dataText.apply(identify_language,axis=1)
captions_data.head(20)
bio_data.head(20)

Unnamed: 0_level_0,biography
username,Unnamed: 1_level_1
ladyandpups,Our cookbook THE ART OF ESCAPISM COOKING IS ...
nelrestaurant,Unique dining experience full of surprises tuc...
cookrepublic,Award Winning PHOTOGRAPHER BLOGGER STYLIST DES...
annabarnettcooks,COOK AUTHOR EVENING STANDARD CONTRIBUTOR FO...
dennistheprescott,I cook I photograph I write All homemade My co...
eatsmorefun,All are ours\nCurrently Featuring Singapore
igbrunchclub,Use our hashtag IGBrunchClub for a chance to b...
stevehansenimages,Photographer Director Retoucher CG Artist\n...
cookieandkate,Celebrating whole foods Find my vegetarian rec...


#### 2.2 Make text all lower case / Remove stop words

In [222]:
# everything lower case and removed stopwords
from nltk.corpus import stopwords
sw = stopwords.words('english')
np.array(sw)
def stopwords(text):
    text = [word.lower()for word in text.split() if word.lower() not in sw]
    return " ".join(text)

def lower(text):
    text = [word.lower() for word in text.split()]
    return " ".join(text)
captions_data['captions'] = captions_data['captions'].apply(stopwords)
captions_data

bio_data['biography'] = bio_data['biography'].apply(stopwords)
captions_data


Unnamed: 0_level_0,captions
username,Unnamed: 1_level_1
ladyandpups,thank guys tuning instagram live qa last night...
nelrestaurant,planning christmas function big small bring ma...
cookrepublic,hot soup nourishing soups daily part nicks rec...
annabarnettcooks,collab always ted star show shoot iwe moltenid...
dennistheprescott,monday drip recipe cookbook best day yall also...
eatsmorefun,black pepper cream crab linguini fremantle sea...
igbrunchclub,coffee cake breakfast jacobtheangellondon neal...
stevehansenimages,yes national mushroom day dessert day pie stan...
cookieandkate,today im rounding thirteen pumpkin recipes muf...


#### 2.3 Tokenize text / Stemming text

In [48]:
## first tokenize, then stemmer, then vectorize into number
stemmer = SnowballStemmer('english')
def tokenize_stem(row):
    stem_result = []
    #input_str = str(row['english'])
    input_str = row
    input_str = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", input_str)
    tokens = word_tokenize(input_str)
    for word in tokens:
        stem_result.append(stemmer.stem(word))
    str1 = ' '.join(stem_result)
    return str1
captions_data['tokenStem'] = captions_data['captions'].apply(tokenize_stem)  
bio_data['biography'] = bio_data['biography'].apply(tokenize_stem)  

In [49]:
bio_data

Unnamed: 0_level_0,biography
username,Unnamed: 1_level_1
ladyandpups,cookbook art escap cook readi preorder link pr...
nelrestaurant,uniqu dine experi full surpris tuck away surri...
cookrepublic,award win photograph blogger stylist design co...
annabarnettcooks,cook author even standard contributor food tra...
dennistheprescott,cook photograph write homemad cookbook eat del...
eatsmorefun,current featur singapor
igbrunchclub,use hashtag igbrunchclub chanc featur brunch i...
stevehansenimages,photograph director retouch cg artist shoot fo...
cookieandkate,celebr whole food find vegetarian recip cookie...


In [233]:
count_vectorizer = CountVectorizer(analyzer = "word",tokenizer=None,preprocessor = None,stop_words =None,ngram_range = (1,2))

In [234]:
doc_word = count_vectorizer.fit_transform(captions_data['captions'])

In [235]:
ex_label = [e[:30]+"..." for e in captions_data['captions']]

In [236]:
vectorized = pd.DataFrame(doc_word.toarray(), index=ex_label, columns=count_vectorizer.get_feature_names())

In [237]:
vectorized

Unnamed: 0,0101,0101 224,010506,010506 3e,0162,0162 blk,0213,0213 cluny,0235,0235 funan,...,zero,zero delay,zest,zest anyone,zesty,zesty even,zettertownhouse,zettertownhouse marylebone,zucchini,zucchini yellow
thank guys tuning instagram li...,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
planning christmas function bi...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hot soup nourishing soups dail...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
collab always ted star show sh...,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
monday drip recipe cookbook be...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
black pepper cream crab lingui...,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
coffee cake breakfast jacobthe...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
yes national mushroom day dess...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
today im rounding thirteen pum...,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,1
