# Infobae influencer profile analysis

In [None]:
# Folder holding the exported CSV files (a path relative to the working directory).
data_path = '../../Projects/jaifp/influ-post-import/data/analysis/'
# File names of the posts and profiles exports; both are read with sep='|' below.
posts_file = 'kids-infobae_posts.csv'
profile_file = 'kids-infobae_profiles.csv'
# Main language code of this dataset: used below to relabel a misdetected
# profile language and to exclude this language from the translation targets.
lan = 'es'

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install nltk
# !pip install deep-translator
# !pip install seaborn
# !pip install matplotlib
# !pip install plotly

In [None]:
import numpy as np
from numpy import nan
import pandas as pd
from pprint import pprint
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# sns.set_context("talk")

In [None]:
# Plot defaults for the whole notebook.
# The figure size goes through rcParams instead of plt.figure(figsize=...):
# a bare plt.figure() call only creates (and renders) one empty figure in this
# cell and does not size the axes-level plots drawn in later cells.
sns.set(font_scale=0.9, rc={'figure.figsize': (8, 3)})
sns.set_style('whitegrid')

In [None]:
# Load the pipe-separated profile export and report its (rows, columns) shape.
profile_csv = f"{data_path}{profile_file}"
df = pd.read_csv(profile_csv, sep='|')
print(f"dataset {profile_file} dimension: {df.shape}")

In [None]:
# Show one randomly chosen profile row as a quick look at the loaded columns.
df.sample()

Categorical and numerical variables:

In [None]:
# Column names, non-null counts and dtypes of the profile dataframe.
df.info()

In [None]:
# Store the `id` column as strings, then display the resulting dtypes.
df = df.astype({'id': str})
df.dtypes

In [None]:
# List the available profile columns.
df.columns

Missing values:

In [None]:
# Number of missing values per column.
df.isna().sum()

Unique values:

In [None]:
# Object-dtype columns only, with the number of distinct values in each.
cat = df.select_dtypes(include='object')
cat.nunique()

In [None]:
# For each categorical column, print the unique values and their frequency.
# Currently disabled. Kept as real comments rather than a triple-quoted string,
# because a string literal at the end of a cell is echoed as the cell output.
# for col in cat.columns:
#     print(f"Column: {col}\n")
#     print(cat[col].value_counts())
#     print('\n****************')

Shared functions:

In [None]:
def print_value_counts(dataframe, value):
    """Print a column's name followed by the frequency of each of its values.

    Args:
        dataframe: pandas DataFrame that contains the column.
        value: label of the column to summarise.

    Returns:
        None. The header line and the ``value_counts()`` result are printed.
    """
    counts = dataframe[value].value_counts()
    print(f"Column: {value}", counts, sep="\n")

Import posts file:

In [None]:
# Load the pipe-separated posts export.
posts_csv = f"{data_path}{posts_file}"
df_posts = pd.read_csv(posts_csv, sep='|')

## 1. keywords
How many of the scraped accounts use the word "mother" (or any variant such as mum, mummy, mom, motherhood, mama…), and what percentage of the entire set of accounts do they represent?

In [None]:
# Normalise the text columns: lowercase and collapse whitespace runs.
# fullName is excluded because it often contains non-alphabetic characters.
cols = ['username', 'norm_biography']


def res(x):
    """Return ``str(x)`` lowercased, with words re-joined by single spaces.

    Because the value goes through ``str()``, a missing value becomes the
    literal text ``'nan'``.
    """
    return " ".join(str(x).lower().split())


for col in cols:
    df[col] = df[col].map(res)

df[cols].sample()

In [None]:
import nltk
# Fetch the WordNet corpora used below for the synonym lookup
# (nltk stores them in its own data directory).
nltk.download('wordnet')
nltk.download('omw-1.4')  # Open Multilingual Wordnet for Italian and Spanish

In [None]:
from nltk.corpus import wordnet as wn

# Print every language code known to the corpus, together with the type of
# the object returned for its lemma names.
available_langs = sorted(wn.langs())
for lang in available_langs:
    lemma_names = wn.all_lemma_names(lang=lang)
    print(lang, type(lemma_names))

In [None]:
# Frequency of each value in the `language` column of the profile dataframe.
print_value_counts(df, 'language')

In [None]:
# Inspect the rows labelled "pt" before deciding whether to relabel them.
df[df['language'] == 'pt']

In [None]:
# Relabel the first "pt" row (if any) with the main language, then re-check.
# Only the first matching row is changed.
pt_rows = df.index[df['language'] == 'pt']
if len(pt_rows) > 0:
    df.loc[pt_rows[0], 'language'] = lan
print_value_counts(df, 'language')

In [None]:
# Inspect the rows labelled "zh" before deciding whether to relabel them.
df[df['language'] == 'zh']

In [None]:
# Relabel the first "zh" row (if any) as English, then re-check.
# Only the first matching row is changed.
zh_rows = df.index[df['language'] == 'zh']
if len(zh_rows) > 0:
    df.loc[zh_rows[0], 'language'] = 'en'
print_value_counts(df, 'language')

In [None]:
# Languages present in the profiles, without missing values and without the
# main language `lan`; these are the translation targets used further down.
lang_np_arr = df['language'].unique()
lang_list = [
    code
    for code in lang_np_arr.tolist()
    if pd.notnull(code) and code != lan
]
print(lang_list)

In [None]:
# Seed keywords whose synonyms are looked up and then searched for in the text.
# Several words in the same language are listed to refine the match.
keywords_list = ['mamá', 'madre', 'papa', 'padre', 'maternidad', 'paternidad']

def synonym_extractor(keyword, language):
    """Return the distinct WordNet lemma names found for ``keyword``.

    Args:
        keyword: word to look up.
        language: language code passed both to ``wn.synsets`` and to
            ``Synset.lemmas``.

    Returns:
        A list of unique lemma names, in no particular order (it is built
        from a set).
    """
    lemma_names = {
        lemma.name()
        for synset in wn.synsets(keyword, lang=language)
        for lemma in synset.lemmas(language)
    }
    return list(lemma_names)

In [None]:
# Synonyms of every seed keyword (may repeat across keywords) ...
syn_list = [
    synonym
    for word in keywords_list
    for synonym in synonym_extractor(keyword=word, language="spa")
]

# ... and the same words de-duplicated and sorted alphabetically.
unique_synonyms_list = sorted(set(syn_list))

In [None]:
# Manual cleaning of irrelevant words: show all synonyms, then only the ones
# that are not already seed keywords.
# The "added" list is built from the de-duplicated, sorted synonyms, so a word
# returned for several keywords is listed once instead of repeatedly.
diff_list = [x for x in unique_synonyms_list if x not in keywords_list]
print("Synonyms list:")
print(unique_synonyms_list)
print("\nAdded words list:")
print(diff_list)

In [None]:
# Add the hand-picked variants, de-duplicate and sort alphabetically.
keywords_list = sorted(set(keywords_list) | {'mama', 'papá'})
print(keywords_list)

In [None]:
# Translate the keywords into every other language found in the profiles.
from deep_translator import GoogleTranslator

# Start from the source-language keywords; translations are appended below.
complete_keywords_list = list(keywords_list)

# The keywords are written in the notebook's main language `lan`, so that is
# the translation source. The loop variable has its own name so that it does
# not overwrite the `lan` configuration constant.
for target_lan in lang_list:
    print(f"\nLanguage {target_lan}:")
    translator = GoogleTranslator(source=lan, target=target_lan)
    for word in keywords_list:
        translated = translator.translate(word)
        print(word, ' -> ', translated)
        # Skip empty/None results instead of failing on .lower().
        if translated:
            complete_keywords_list.append(translated.lower())

In [None]:
# Words to drop from the final keyword list.
discarted_list = ['pope', 'popes']
# Hand-picked English variants added on top of the translations.
extra_english_keywords = {
    'mum', 'mummy', 'mother', 'motherhood', 'mom', 'parent', 'parenting',
    'parenthood', 'father', 'dad', 'daddy', 'mama', 'dadda',
}
# Union, minus the discarded words, sorted alphabetically.
complete_keywords_list = sorted(
    (set(complete_keywords_list) | extra_english_keywords) - set(discarted_list)
)
print(complete_keywords_list)

In [None]:
# Regex alternation of the keywords, matched case-insensitively as substrings
# of the username or of the normalised biography.
# NOTE(review): this uses `keywords_list`; `complete_keywords_list` built above
# is not used here - confirm whether that is intended.
match_str = "|".join(keywords_list)
in_username = df['username'].str.contains(match_str, case=False)
in_biography = df['norm_biography'].str.contains(match_str, case=False)
df_selected = df.loc[in_username | in_biography]
df_selected_unique = df_selected['username'].unique()
print(f"Number of accounts containing keywords: {len(df_selected_unique)}")

In [None]:
# Show one random matching profile as a spot check of the keyword filter.
df_selected.sample()

In [None]:
# Share of matching rows over all profile rows, as a percentage with one decimal.
# NOTE(review): this counts rows, while the count printed above uses unique
# usernames; the two agree only if usernames are unique - confirm.
print(f"\nProportion: {round(df_selected.shape[0] / df.shape[0] * 100, 1)}%")

## 2. isBusinessAccount
How many of the selected accounts are business accounts, and what proportion of all accounts do they represent? How many of them have a business contact? How many accounts with isBusinessAccount = False have posts with commercial_status = True, is paid partnership = True, or should request ads = True?

In [None]:
print_value_counts(df, 'is_business_account')
print("\nProportion:")
# Count each flag explicitly. value_counts() is sorted by frequency, so taking
# positions [0] and [1] would swap the labels whenever True is the more common
# value (and would fail if only one value is present).
n_accounts = df.shape[0]
for flag in (False, True):
    share = (df['is_business_account'] == flag).sum() / n_accounts * 100
    print(f"{flag}: {round(share, 1)}%")

In [None]:
# Bar chart of the number of profiles per is_business_account value.
sns.countplot(x='is_business_account', data=df)

In [None]:
# Frequency of each business_contact value.
print_value_counts(df, 'business_contact')

In [None]:
# Business accounts whose business_contact is either CALL or TEXT.
has_contact = df['business_contact'].isin(['CALL', 'TEXT'])
is_business = df['is_business_account'] == True
is_ba_and_bc = df[is_business & has_contact].shape[0]
print(f"Number of business accounts that have a business contact: {is_ba_and_bc}")

In [None]:
# Profiles that are not flagged as business accounts.
df_not_ba = df[df['is_business_account'] == False]

In [None]:
# Usernames of the non-business profiles.
not_ba_list = df_not_ba['username'].unique()
# NOTE(review): hard-coded observation, not computed in this cell.
print("commerciality_status is always = not_commercial")
# Each comparison is parenthesised: `&` binds tighter than `==`, so the
# unparenthesised form evaluated `(isin & is_paid) == True` instead.
from_not_ba = df_posts['account'].isin(not_ba_list)
is_paid = df_posts['is_paid_partnership'] == True
not_ba_and_is_paid_posts = df_posts[from_not_ba & is_paid]
print(f"Number of posts of non business account having is_paid_partnership = True is: {not_ba_and_is_paid_posts.shape[0]}")

In [None]:
# Non-business profiles (the `df_is_ba` name is kept as-is), keyed by `account`
# so they can be merged with the per-account post statistics.
df_is_ba = df[df['is_business_account'] == False][['username', 'is_business_account']].reset_index()
df_is_ba = df_is_ba.rename(columns={'username': 'account'})

# Per-account totals computed in one pass over the posts:
#   n_posts   - number of posts (rows), including posts without a caption;
#               counting non-null captions would undercount and inflate `perc`
#   n_is_paid - number of posts with is_paid_partnership == True
is_paid_flag = df_posts['is_paid_partnership'] == True
paid_by_account = is_paid_flag.groupby(df_posts['account'], sort=False)
df_n_posts = paid_by_account.size().rename('n_posts').reset_index()
df_n_is_paid_posts = paid_by_account.sum().rename('n_is_paid').reset_index()

df_is_ba_and_paid = df_is_ba.merge(df_n_posts, on='account').merge(df_n_is_paid_posts, on='account')
# Percentage of each account's posts that are paid partnerships.
df_is_ba_and_paid['perc'] = df_is_ba_and_paid['n_is_paid'] / df_is_ba_and_paid['n_posts'] * 100
perc_is_paid_50 = df_is_ba_and_paid[df_is_ba_and_paid['perc'] > 50]
perc_is_paid_10 = df_is_ba_and_paid[df_is_ba_and_paid['perc'] > 10]
print(f'The number of accounts with is_business_account=False but with more than 50% of the posts with is_paid_partnership=True is {perc_is_paid_50.shape[0]}')
print(f'The number of accounts with is_business_account=False but with more than 10% of the posts with is_paid_partnership=True is {perc_is_paid_10.shape[0]}')
# NOTE(review): the 0.5% figure below is hard-coded, not computed here.
print('(Note: The proportion of is_paid_partnership is just 0.5%)')

## 3. categoryName
What are the most popular categories, and what is the relationship between category and isBusinessAccount: are particular categories more common when isBusinessAccount is True?

In [None]:
# Frequency of each category_name value across all profiles.
print_value_counts(df, 'category_name')

In [None]:
# Frequency of each business_category_name value across all profiles.
print_value_counts(df, 'business_category_name')

In [None]:
# Restrict to business accounts and print both category breakdowns,
# separated by a line of asterisks.
df_is_business_account = df[df['is_business_account'] == True]
print('Categories of accounts with is_business_account=True are:\n')
for position, column in enumerate(('business_category_name', 'category_name')):
    if position > 0:
        print('\n********************************\n')
    print_value_counts(df_is_business_account, column)

## 4. isVerified
How many (and proportion of) are verified accounts.

In [None]:
print_value_counts(df, 'is_verified')
print("\nProportion:")
# Count each flag explicitly. value_counts() is sorted by frequency, so taking
# positions [0] and [1] would swap the labels whenever True is the more common
# value (and would fail if only one value is present).
n_accounts = df.shape[0]
for flag in (False, True):
    share = (df['is_verified'] == flag).sum() / n_accounts * 100
    print(f"{flag}: {round(share, 1)}%")

In [None]:
# Bar chart of the number of profiles per is_verified value.
sns.countplot(x='is_verified', data=df)

## 5. postsCount
Distribution of the number of posts. Are the accounts with the highest number of posts also business accounts? What is the most common category for the accounts with the highest number of posts (we could select the 3rd quartile)?

In [None]:
# Summary statistics of posts_count; the 75% value is used as threshold below.
df['posts_count'].describe()

In [None]:
# Histogram of the number of posts per profile.
sns.displot(df, x="posts_count")

In [None]:
# Account with the largest posts_count (first one if there are ties).
max_posts_count = df['posts_count'].max()
max_posts_acc = df[df['posts_count'] == max_posts_count]['username'].values[0]

# "Highest" accounts = at or above the 3rd quartile of posts_count.
# (A previous df.nlargest(40, ...) call was dropped: its result was discarded.)
threshold = df['posts_count'].quantile(0.75)
df_acc_highest_posts_count = df[df['posts_count'] >= threshold].sort_values(by='posts_count', ascending=False)
highest_posts = df_acc_highest_posts_count.shape[0]
highest_posts_is_business = df_acc_highest_posts_count[df_acc_highest_posts_count['is_business_account'] == True].shape[0]
print(f"The maximum value for posts_count is {max_posts_count} and belongs to account {max_posts_acc}")
print(f"Number of accounts with posts_count >= {threshold} is {highest_posts} ({highest_posts_is_business} of them have is_business_account = True)")
print("For these accounts most common categories are:")
# value_counts() is already sorted by frequency: take the two most common
# categories once and iterate over (label, count) pairs, instead of doing a
# positional lookup on a label-indexed result for every category.
top_categories = df_acc_highest_posts_count['category_name'].value_counts().head(2)
for name, count in top_categories.items():
    print(f"{name}: {count}")

## 6. subscribersCount and Subscribtions
Distribution of the accounts by the number of followers and compared to the subscriptions. Check if those with the highest numbers are business accounts or what is the transparency tag.

In [None]:
# Pairwise plots of subscribers_count and subscribtions (the column name is
# spelled this way in the data), coloured by is_business_account.
sns.pairplot(df, hue='is_business_account', vars=['subscribers_count', 'subscribtions'], height=3.5)

## 7. transparencyProduct
What are the options in this category? At the moment I can only see "STATE_CONTROLLED_MEDIA". How many of our accounts are labeled as "state controlled media"? Are these accounts also business accounts? Are they verified? Do these accounts have the word mummy in the biography?

In [None]:
# Stack the profile exports of the three datasets into one dataframe with a
# fresh 0..n-1 index, then show its shape.
csv_files = ['kids-sole_profiles.csv', 'kids-sky-news_profiles.csv', 'kids-infobae_profiles.csv']
df_list = [pd.read_csv(f"{data_path}{file}", sep='|') for file in csv_files]
df_all = pd.concat(df_list, ignore_index=True)
df_all.shape

In [None]:
# Frequency of the transparency fields over the combined profile dataframe.
print("In the complete profile dataset values are:")
print_value_counts(df_all, 'transparency_product')
print("\n")
print_value_counts(df_all, 'transparency_label')