# Data Preparation

While the data scrapper gets the data related to the instagram users, we are using two datasets from Kaggle with small variations to recreate how the algorithm will work with real data.

In [54]:
# Import libraries
import pandas as pd
import numpy as np

In [55]:
# Upload companies dataset
df_co = pd.read_csv('df_co.csv')
df_co

Unnamed: 0,CompanyID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co
0,0,The Little Shop,Cars,#instadaily#food#followback#instatravel,Brazil,195608
1,1,The Cozy Kitchen,Sports with a ball,#instalike#fashion#holidayseason#skiing#loveit,Egypt,415795
2,2,The Roost,Art,#blessings#foodlife#endurance#fitnessmotivatio...,Turkey,116164
3,3,The Wooden Spoon,Computers,#foodislifee#familytime,Iran,524384
4,4,Sunflower Fields,Management,#health#trendy,Germany,307127
...,...,...,...,...,...,...
95,95,Copper Kettle,Undefined,#baseball#friendshipgoals#skiing,Thailand,354362
96,96,The Crusty Baguette,Food,#instamood#education#beautifuldestinations,China,74286
97,97,The Plaid Pail,Luxury,#goodmorning#cricket#transportation#media#makeup,Nigeria,108265
98,98,The Sugar House,Business,#muscle#lifestyle#instatravel#instadaily,Germany,82077


In [56]:
# Upload users (influencers) dataset
df_influ = pd.read_csv('df_inf.csv')
df_influ

Unnamed: 0,AccountID,Account,Link,Followers,Audience Country,Authentic engagement,Engagement avg,Category1,Hashtags,Cost Story,Cost Post
0,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,#football#entrepreneur#foodstylist#love#travel...,468000.0,1092000.0
1,2,kyliejenner,https://www.instagram.com/kyliejenner/,308800000.0,United States,6200000.0,10100000.0,Fashion,#blessed#fashionstyle#foodexperience#quotes#fo...,372000.0,868000.0
2,3,leomessi,https://www.instagram.com/leomessi/,306300000.0,Argentina,4800000.0,6500000.0,Sports with a ball,#marathon#foodlover#fashionblogger#motivation#...,288000.0,672000.0
3,4,kendalljenner,https://www.instagram.com/kendalljenner/,217800000.0,United States,3400000.0,5400000.0,Modeling,#food#fitlife#fitnessmodel#vacation#familytime...,204000.0,476000.0
4,5,selenagomez,https://www.instagram.com/selenagomez/,295800000.0,United States,2700000.0,3600000.0,Music,#blackandwhite#fashionblogger#fashion#holidays...,162000.0,378000.0
...,...,...,...,...,...,...,...,...,...,...,...
995,996,senoritasaeva,https://www.instagram.com/senoritasaeva/,7700000.0,Russia,246600.0,318200.0,Lifestyle,#sunset#likeforlike#instagood,14796.0,34524.0
996,997,manuelneuer,https://www.instagram.com/manuelneuer/,11500000.0,Germany,146500.0,210200.0,Sports with a ball,#summer2022#cricket#sundayfunday#beautifulday#...,8790.0,20510.0
997,998,sahilkhan,https://www.instagram.com/sahilkhan/,10100000.0,India,176500.0,239800.0,Fitness,#fitlife#tbt#fashion#fashionable#smilemore#str...,10590.0,24710.0
998,999,mohanshakti,https://www.instagram.com/mohanshakti/,13700000.0,India,146400.0,175500.0,Art,#foodlife#colorful#fit#lifeisgood#telecom#fami...,8784.0,20496.0


We will now merge both dataframes, so that we can have interactions between both the companies and the users.

In [57]:
df2_co = pd.concat([df_co]*1000, ignore_index=True)

In [58]:
df2_influ = pd.DataFrame(np.repeat(df_influ.values, 100, axis=0), columns=["AccountID", "Account", "Link", "Followers", "Audience Country", "Authentic engagement", "Engagement avg", "Category1", "Hashtags", "Cost Story", "Cost Post"])


In [59]:
# Concat both df
df = pd.concat([df2_influ, df2_co], axis=1)

Now, we will transform the variable hashtags as an intenger, remove the # symbol and return a list of words.

In [60]:
def Convert(string):
    li = list(string.split("#"))
    return li

In [61]:
df['Hashtags'] = df['Hashtags'].astype(str)

In [62]:
h = []
h_co = []

for i in range(len(df['Hashtags'])):
    h.append(Convert(df['Hashtags'][i]))
    h_co.append(Convert(df['Hashtags_co'][i]))

df['Hashtags'] = h
df['Hashtags_co'] = h_co

In [63]:
# Remove first empty element
for i in range(len(df['Hashtags'])):
    df['Hashtags'][i].pop(0)
    df['Hashtags_co'][i].pop(0)

In [64]:
df

Unnamed: 0,AccountID,Account,Link,Followers,Audience Country,Authentic engagement,Engagement avg,Category1,Hashtags,Cost Story,Cost Post,CompanyID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co
0,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608
1,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795
2,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164
3,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384
4,1,cristiano,https://www.instagram.com/cristiano/,400100000.0,India,7800000.0,9500000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,95,Copper Kettle,Undefined,"[baseball, friendshipgoals, skiing]",Thailand,354362
99996,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286
99997,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265
99998,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000.0,Mexico,305600.0,391900.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077


Calculate the different number of points (score) for each pair of user-company depending on the match of different variables.

In [65]:
num_coincidente = df.apply(lambda row: len(set(row['Hashtags']).intersection(set(row['Hashtags_co']))), axis=1)
points_cat = np.where((df['Category1'] == df['Category_co']), 10, 0)
points_country = np.where((df['Audience Country'] == df['Country_co']), 5, 0)
points_eng = df['Authentic engagement']/df['Followers']
df['num_coincidentes'] = num_coincidente
df['points_eng'] = points_eng
df['points_cat'] = points_cat
df['points_country'] = points_country
df['Puntos'] = df['points_cat'] + df['points_country'] + df["num_coincidentes"] + df['points_eng']

df = df.drop(['points_cat', 'points_country', 'num_coincidentes', 'Engagement avg', 'points_eng'], axis=1)


Check and change variables types.

In [66]:
df.dtypes

AccountID               object
Account                 object
Link                    object
Followers               object
Audience Country        object
Authentic engagement    object
Category1               object
Hashtags                object
Cost Story              object
Cost Post               object
CompanyID                int64
Name_co                 object
Category_co             object
Hashtags_co             object
Country_co              object
Followers_co             int64
Puntos                  object
dtype: object

In [67]:
df['Followers'] = df['Followers'].astype(int)
df['Authentic engagement'] = df['Authentic engagement'].astype(float)
df['Cost Story'] = df['Cost Story'].astype(float)
df['Followers_co'] = df['Followers_co'].astype(int)
df['Puntos'] = df['Puntos'].astype(float)
df.dtypes

AccountID                object
Account                  object
Link                     object
Followers                 int32
Audience Country         object
Authentic engagement    float64
Category1                object
Hashtags                 object
Cost Story              float64
Cost Post                object
CompanyID                 int64
Name_co                  object
Category_co              object
Hashtags_co              object
Country_co               object
Followers_co              int32
Puntos                  float64
dtype: object

To fit the lightfm model, we will change the name of some columns to fit the model.

In [68]:
# Change the name of columns to fit the model.
df = df.rename(columns={"AccountID": "itemID", "CompanyID": "userID", "Puntos": "rating"})

Fill None values with 'Unknown'

In [69]:
df['Audience Country'] = df['Audience Country'].fillna('Unknown')

In [70]:
df

Unnamed: 0,itemID,Account,Link,Followers,Audience Country,Authentic engagement,Category1,Hashtags,Cost Story,Cost Post,userID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co,rating
0,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608,0.019495
1,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795,10.019495
2,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164,0.019495
3,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384,0.019495
4,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127,0.019495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,95,Copper Kettle,Undefined,"[baseball, friendshipgoals, skiing]",Thailand,354362,0.049290
99996,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286,0.049290
99997,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265,0.049290
99998,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077,0.049290


To recreate a realistic situation, where we won't have all users matched with all companies, we will remove some pairs and add new ones.

In [71]:
# Remove rows
remove_n = 3246
drop_indices = np.random.choice(df.index, remove_n, replace=False)
df = df.drop(drop_indices)

In [72]:
# Add rows
new_row = {'itemID':1001, 'Account':'jionast', 'Link':'https://www.instagram.com/jionast/', 'Followers':173571, 'Audience Country':'Spain', 'Authentic engagement':113456, 'Category1':'Lifestyle', 'Hashtags':['football', 'entrepreneur'], 'Cost Story':1111, 'Cost Post':1233, 'userID':100, 'Name_co':'Mcdonalds', 'Category_co':'Food', 'Hashtags_co':['football', 'food'], 'Country_co':'Spain', 'Followers_co':123445, 'rating':3}
df = df.append(new_row, ignore_index=True)

  df = df.append(new_row, ignore_index=True)


The following dataframe will be the final dataset that will be used with the different algorithms.

In [73]:
df

Unnamed: 0,itemID,Account,Link,Followers,Audience Country,Authentic engagement,Category1,Hashtags,Cost Story,Cost Post,userID,Name_co,Category_co,Hashtags_co,Country_co,Followers_co,rating
0,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,0,The Little Shop,Cars,"[instadaily, food, followback, instatravel]",Brazil,195608,0.019495
1,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,1,The Cozy Kitchen,Sports with a ball,"[instalike, fashion, holidayseason, skiing, lo...",Egypt,415795,10.019495
2,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,2,The Roost,Art,"[blessings, foodlife, endurance, fitnessmotiva...",Turkey,116164,0.019495
3,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,3,The Wooden Spoon,Computers,"[foodislifee, familytime]",Iran,524384,0.019495
4,1,cristiano,https://www.instagram.com/cristiano/,400100000,India,7800000.0,Sports with a ball,"[football, entrepreneur, foodstylist, love, tr...",468000.0,1092000.0,4,Sunflower Fields,Management,"[health, trendy]",Germany,307127,0.019495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96750,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,96,The Crusty Baguette,Food,"[instamood, education, beautifuldestinations]",China,74286,0.049290
96751,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,97,The Plaid Pail,Luxury,"[goodmorning, cricket, transportation, media, ...",Nigeria,108265,0.049290
96752,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,98,The Sugar House,Business,"[muscle, lifestyle, instatravel, instadaily]",Germany,82077,0.049290
96753,1000,eduincaz,https://www.instagram.com/eduincaz/,6200000,Mexico,305600.0,Lifestyle,[photo],18336.0,42784.0,99,Urban Bites,Shows,"[cricket, lifeisgood]",Tanzania,445319,0.049290
