# Sky News influencer post analysis

In [None]:
# Notebook configuration: location of the exported CSV files and their names.
# data_path must end with a path separator because file names are appended to it
# by plain string concatenation when the files are read.
data_path = '../../Projects/jaifp/influ-post-import/data/analysis/'
posts_file = 'kids-sky-news_posts.csv'
profile_file = 'kids-sky-news_profiles.csv'
# NOTE(review): presumably the language code of the captions; not used in the cells visible here.
lan = 'en'

In [None]:
# !pip install numpy
# !pip install pandas
# !pip install nltk
# !pip install deep-translator
# !pip install seaborn
# !pip install matplotlib
# !pip install plotly

In [None]:
import numpy as np
from numpy import nan
import pandas as pd
from pprint import pprint
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# sns.set_context("talk")

In [None]:
# Global plotting defaults for every chart in the notebook.
# The figure size is set through rcParams: calling plt.figure(figsize=...) here only
# creates (and displays) one empty figure and does not affect the plots drawn later.
sns.set(style='whitegrid', font_scale=0.9, rc={'figure.figsize': (8, 3)})

In [None]:
# Load the pipe-separated posts export and report its (rows, columns) shape.
posts_path = data_path + posts_file
df = pd.read_csv(posts_path, sep='|')
print(f"dataset {posts_file} dimension: {df.shape}")

In [None]:
# Display one randomly chosen row as a quick sanity check of the parsed columns.
df.sample()

Categorical and numerical variables:

In [None]:
# Summary of column dtypes and non-null counts.
df.info()

In [None]:
# Treat owner ids as labels (strings) and parse the raw `date` column into a
# datetime column named `post_date`; then show the resulting dtypes and one
# random row to compare the raw and parsed dates.
df = df.assign(
    owner_id=df['owner_id'].astype(str),
    post_date=pd.to_datetime(df['date']),
)
print(df[['owner_id', 'post_date']].dtypes)
print(df[['date', 'post_date']].sample())

In [None]:
# Remove the raw `date` column; errors='ignore' keeps this cell safe to re-run
# once the column is already gone.
df = df.drop(columns=['date'], errors='ignore')

In [None]:
# Column names of the posts frame at this point in the notebook.
df.columns

Missing values:

In [None]:
# Number of missing values per column.
df.isna().sum()

Unique values:

In [None]:
# Keep only the object-dtype (categorical/text) columns and count the distinct
# values in each of them.
cat = df.select_dtypes(include='object')
cat.nunique()

In [None]:
# For each categorical column the unique values and their frequency.
# Disabled on purpose. Kept as real comments rather than a triple-quoted string:
# a bare string is an expression, so the notebook echoed it as the cell's output.
# for col in cat.columns:
#     print(f"Column: {col}\n")
#     print(cat[col].value_counts())
#     print('\n****************')

Shared functions:

In [None]:
def print_value_counts(dataframe, value):
    """Print a column's name followed by the frequency of each of its values.

    Args:
        dataframe: pandas DataFrame that holds the column.
        value: name of the column to summarise.
    """
    header = f"Column: {value}"
    print(header)
    frequencies = dataframe[value].value_counts()
    print(frequencies)

Import profile file:

In [None]:
# Load the pipe-separated profiles export that sits next to the posts file.
profiles_path = data_path + profile_file
df_profile = pd.read_csv(profiles_path, sep='|')

In [None]:
# Left-join each post to the profile whose `username` equals the post's `account`;
# posts without a matching profile are kept.
df_merged = df.merge(df_profile, how='left', left_on='account', right_on='username')
df_merged.sample()

In [None]:
# (rows, columns) of the merged posts + profiles frame.
df_merged.shape

In [None]:
# Column names available after the merge.
df_merged.columns

## 1. isVideo
How popular is the use of video?

In [None]:
print_value_counts(df, 'is_video')
print("\nProportion:")
# Count each flag by its actual value. Integer keys on value_counts() are positional
# (rank by frequency), so [0]/[1] would put the "False"/"True" labels on the wrong
# numbers whenever videos are the majority, and [1] raises when only one value occurs.
# Rows with a missing flag stay in the denominator (df.shape[0]).
is_video_flag = df['is_video']
print(f"False: {round(is_video_flag.eq(False).sum() / df.shape[0] * 100, 1)}%")
print(f"True: {round(is_video_flag.eq(True).sum() / df.shape[0] * 100, 1)}%")

In [None]:
# Bar chart of the number of rows per is_video value.
sns.countplot(x='is_video', data=df)

In [None]:
# Frequency of each product_type value.
print_value_counts(df, 'product_type')

In [None]:
print("\nProportion:")
# Iterate over (label, count) pairs directly: value_counts() is computed once, and the
# deprecated positional integer lookup on a label-indexed Series is avoided.
# Shares are relative to all rows, including rows with a missing product_type.
for name, count in df['product_type'].value_counts().items():
    print(f"{name}: {round(count / df.shape[0] * 100, 1)}%")

In [None]:
# Bar chart of the number of rows per product_type value.
sns.countplot(x='product_type', data=df)

## 2. Caption
Hashtags and mentions, and also the position where they appear. Is there any difference when the caption contains #ad, #advertisement, #gifted, #advert, #advertising or #sponsored?

In [None]:
# The ten most frequent raw `hashtags` strings (i.e. whole hashtag groups, not
# individual tags), as a one-column frame.
most_10_freq_hs_group = df['hashtags'].value_counts().head(10).to_frame()
most_10_freq_hs_group

In [None]:
# Split each post's hashtag string into a list of individual tags.
# - Commas are replaced by spaces (not removed) so that both "#a, #b" and "#a,#b"
#   split into two tags instead of being glued into "#a#b".
# - Posts without hashtags get an empty list rather than the literal token 'nan'.
df['hashtags_list'] = df['hashtags'].apply(
    lambda x: [] if pd.isna(x) else str(x).replace(",", " ").split()
)

# Flatten the per-post lists into one list of all tag occurrences.
hashtags_list = [tag for tags in df['hashtags_list'] for tag in tags]
# Defensive: drop any stray 'nan' token, as the original cell did.
hashtags_list = [h for h in hashtags_list if h != 'nan']

unique_hashtags_list = list(set(hashtags_list))
print(f"{len(unique_hashtags_list)} unique hashtags found")

In [None]:
import collections

# Count every tag occurrence, then rank tags by descending frequency.
hs_frequency = collections.Counter(hashtags_list)
hs_freq_dict = dict(hs_frequency)
# most_common() with no argument returns all (tag, count) pairs sorted by count,
# highest first; ties keep first-seen order.
sorted_hs_freq = hs_frequency.most_common()
top_20_pairs = sorted_hs_freq[:20]
sorted_hs_freq_dict = dict(top_20_pairs)
print("Most frequent hashtags:")
print(sorted_hs_freq_dict)

In [None]:
# Hashtags that disclose advertising. A trailing space marks a tag that must not be
# the prefix of a longer hashtag (e.g. '#ad ' must not match #adorable).
# '#advertisement ' is the correct spelling; the original misspelled
# '#adverstisement ' is kept so nothing that matched before stops matching.
adv_list = ['#ad ', '#advertisement ', '#adverstisement ', '#gifted', '#collaborazione ', '#advert ',
            '#advertising', '#sponsored']

def check_adv(text, w_list):
    """Return True if `text` contains any of the advertising hashtags in `w_list`.

    Matching rules:
      * Matching is case-insensitive.
      * An entry that ends with whitespace (e.g. '#ad ') must match a whole hashtag:
        it must not be followed by another word character, so '#adorable' is rejected,
        while '#ad' at the end of the text or before a newline or punctuation is
        accepted (a literal trailing space is no longer required).
      * An entry without trailing whitespace (e.g. '#gifted') matches as a plain
        substring, as before.
      * Blank entries are ignored.

    Args:
        text: caption text to search (a string).
        w_list: iterable of hashtag strings.

    Returns:
        bool: True when at least one entry matches, otherwise False.
    """
    import re  # local import: keeps this definition self-contained in its cell

    for word in w_list:
        tag = word.rstrip()
        if not tag:
            continue
        pattern = re.escape(tag)
        if tag != word:
            # Trailing whitespace in the entry means "whole hashtag only".
            pattern += r"(?!\w)"
        if re.search(pattern, text, flags=re.IGNORECASE) is not None:
            return True
    return False

# Flag every post whose caption (converted to a string) contains an advertising
# hashtag, then show how many posts are flagged.
df['caption_adv'] = [check_adv(str(caption), adv_list) for caption in df['caption']]
print_value_counts(df, 'caption_adv')

In [None]:
# Show one random post whose caption was flagged by check_adv.
df[df['caption_adv'] == True].sample()

In [None]:
# Check the first characters because the hashtag can be at the beginning of the caption
# without being the very first word.
def start_with_adv(text, w_list, n_chars=40):
    """Return True if an advertising hashtag ends within the first `n_chars` of `text`.

    Matching rules:
      * Matching is case-insensitive.
      * An entry that ends with whitespace (e.g. '#ad ') must match a whole hashtag:
        it must not be followed by another word character. The following character is
        checked against the full text, so a longer hashtag cut at the `n_chars`
        boundary is not mistaken for a match.
      * An entry without trailing whitespace matches as a plain substring.
      * Blank entries are ignored.

    Args:
        text: caption text to search (a string).
        w_list: iterable of hashtag strings.
        n_chars: size of the leading window, in characters (default 40).

    Returns:
        bool: True when a matching hashtag lies entirely inside the leading window.
    """
    import re  # local import: keeps this definition self-contained in its cell

    for word in w_list:
        tag = word.rstrip()
        if not tag:
            continue
        pattern = re.escape(tag)
        if tag != word:
            # Trailing whitespace in the entry means "whole hashtag only".
            pattern += r"(?!\w)"
        # The leftmost match is also the one that ends earliest, so it is enough
        # to test that one against the window.
        match = re.search(pattern, text, flags=re.IGNORECASE)
        if match is not None and match.end() <= n_chars:
            return True
    return False

# Flag every post whose caption (converted to a string) has an advertising hashtag
# near its start, then show how many posts are flagged.
df['start_with_adv'] = [start_with_adv(str(caption), adv_list) for caption in df['caption']]
print_value_counts(df, 'start_with_adv')

In [None]:
# Show one random post flagged by start_with_adv.
df[df['start_with_adv'] == True].sample()

## 3. Should_request_ads
Should_request_ads field: for how many accounts is this True? Are the posts with this = True from business accounts? Which tags and mentions are the most common?
[Meta - About Advertising Restrictions](https://www.facebook.com/business/help/975570072950669?id=434838534925385)

In [None]:
print_value_counts(df, 'should_request_ads')
print("\nProportion:")
# Count the False rows explicitly. value_counts()[0] is positional (the most frequent
# value), which is only False by coincidence of the data.
# Rows with a missing flag stay in the denominator (df.shape[0]).
print(f"False: {round(df['should_request_ads'].eq(False).sum() / df.shape[0] * 100, 1)}%")

## 4. Commerciality_status
For how many of them is this True? Are the posts with this = True from business accounts? Is any advertisement hashtag used when this is True?

In [None]:
print_value_counts(df, 'commerciality_status')
print("\nProportion:")
# Report every status under its real label. The previous value_counts()[0] lookup was
# positional (the most frequent status) and printed it under a hard-coded
# "Not commercial" label regardless of which status that actually was.
# Shares are relative to all rows, including rows with a missing status.
for status, count in df['commerciality_status'].value_counts().items():
    print(f"{status}: {round(count / df.shape[0] * 100, 1)}%")

## 5. is_paid_partnership
For how many of them is this True? Of those with this = True, how many also have commerciality_status = commercial, and how many have should_request_ads = True? How many are business accounts? Is any advertisement hashtag used when this is True? How many have the word mum in the bio/username?

In [None]:
print_value_counts(df, 'is_paid_partnership')
print("\nProportion:")
# Count each flag by its actual value. Integer keys on value_counts() are positional
# (rank by frequency), so [0]/[1] could swap the "False"/"True" labels, and [1] raises
# when the column holds a single value.
# Rows with a missing flag stay in the denominator (df.shape[0]).
paid_flag = df['is_paid_partnership']
print(f"False: {round(paid_flag.eq(False).sum() / df.shape[0] * 100, 1)}%")
print(f"True: {round(paid_flag.eq(True).sum() / df.shape[0] * 100, 1)}%")

In [None]:
# Posts flagged as paid partnerships, plus one random example.
is_pp_mask = df['is_paid_partnership'].eq(True)
pp_accounts = df[is_pp_mask]
pp_accounts.sample()

In [None]:
# Distinct accounts that have at least one paid-partnership post.
pp_accounts_list = pp_accounts['account'].unique()
n_pp_accounts = len(pp_accounts_list)
print(f"Number of unique accounts having is_paid_partnership = True is: {n_pp_accounts}")

In [None]:
# Count profiles that both appear in the paid-partnership account list and are
# business accounts. The comparison is parenthesised because `&` binds tighter than
# `==`: without the parentheses the expression is `(isin & is_business_account) == True`,
# which applies `&` to the raw column before it has been turned into a boolean mask.
is_pp_and_ba = df_profile[
    df_profile['username'].isin(pp_accounts_list)
    & (df_profile['is_business_account'] == True)
].shape[0]
print(f"Number of accounts having both is_paid_partnership and is_business_account = True is: {is_pp_and_ba}")