# Data Analysis

## Dataset - Facebook metrics Data Set

(Dataset: http://archive.ics.uci.edu/ml/machine-learning-databases/00368/Facebook_metrics.zip)

In [2]:
import pandas as pd

In [16]:
file_name = 'dataset_Facebook.csv'  # dataset filename, expected next to the notebook
# Load the semicolon-separated dataset into `df`.
# If the file is missing, stop here with an actionable message: merely printing
# the error would leave `df` undefined and make the following cells fail with
# an unrelated NameError.
try:
    df = pd.read_csv(file_name, delimiter=';')  # read data from csv file
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"{err} -- place '{file_name}' in the notebook directory and re-run this cell."
    ) from err

In [21]:
# Report the dataset dimensions; DataFrame.shape is (rows, columns).
n_rows, n_cols = df.shape
print(f'Rows number: {n_rows}.')
print(f'Columns number: {n_cols}.')

Rows number: 500.
Columns number: 19.


In [24]:
# Preview the first ten records of the dataset.
df.head(n=10)

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
0,139441,Photo,2,12,4,3,0.0,2752,5091,178,109,159,3078,1640,119,4,79.0,17.0,100
1,139441,Status,2,12,3,10,0.0,10460,19057,1457,1361,1674,11710,6112,1108,5,130.0,29.0,164
2,139441,Photo,3,12,3,3,0.0,2413,4373,177,113,154,2812,1503,132,0,66.0,14.0,80
3,139441,Photo,2,12,2,10,1.0,50128,87991,2211,790,1119,61027,32048,1386,58,1572.0,147.0,1777
4,139441,Photo,2,12,2,3,0.0,7244,13594,671,410,580,6228,3200,396,19,325.0,49.0,393
5,139441,Status,2,12,1,9,0.0,10472,20849,1191,1073,1389,16034,7852,1016,1,152.0,33.0,186
6,139441,Photo,3,12,1,3,1.0,11692,19479,481,265,364,15432,9328,379,3,249.0,27.0,279
7,139441,Photo,3,12,7,9,1.0,13720,24137,537,232,305,19728,11056,422,0,325.0,14.0,339
8,139441,Status,2,12,7,3,0.0,11844,22538,1530,1407,1692,15220,7912,1250,0,161.0,31.0,192
9,139441,Photo,3,12,6,10,0.0,4694,8668,280,183,250,4309,2324,199,3,113.0,26.0,142


## Statistics calculation for numerical features (all columns except "Type")

In [36]:
# Columns to summarise: everything except the categorical 'Type' column.
# The column index is iterated directly; no empty frame (df.head(0)) is needed.
req_cols = [col for col in df.columns if col != 'Type']
indices = ['Mean', 'Max', 'Min', 'Median', 'Mode']

# Each reduction below runs over all selected columns at once and returns one
# value per column, so no per-column Python loop is required.
numeric_part = df[req_cols]
stat_rows = [
    numeric_part.mean(),
    numeric_part.max(),
    numeric_part.min(),
    numeric_part.median(),
    # DataFrame.mode() lists each column's modes in ascending order, so row 0
    # holds the smallest mode of every column (what Series.mode()[0] returns).
    numeric_part.mode().iloc[0],
]

# Assemble the summary: one row per statistic (in `indices` order),
# one column per dataset column (in `req_cols` order).
description1 = pd.DataFrame(dict(zip(indices, stat_rows))).T[req_cols]

In [37]:
# Display the summary table: one row per statistic, one column per dataset column other than 'Type'.
description1

Unnamed: 0,Page total likes,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
Mean,123194.176,1.88,7.038,4.15,7.84,0.278557,13903.36,29585.948,920.344,798.772,1415.13,16766.376,6585.488,609.986,7.482,177.945892,27.266129,212.12
Max,139441.0,3.0,12.0,7.0,23.0,1.0,180480.0,1110282.0,11452.0,11328.0,19779.0,1107833.0,51456.0,4376.0,372.0,5172.0,790.0,6334.0
Min,81370.0,1.0,1.0,1.0,1.0,0.0,238.0,570.0,9.0,9.0,9.0,567.0,236.0,9.0,0.0,0.0,0.0,0.0
Median,129600.0,2.0,7.0,4.0,9.0,0.0,5281.0,9051.0,625.5,551.5,851.0,6255.5,3417.0,412.0,3.0,101.0,19.0,123.5
Mode,136393.0,1.0,10.0,7.0,3.0,0.0,677.0,4372.0,537.0,182.0,431.0,1210.0,690.0,403.0,0.0,98.0,13.0,0.0


## Statistics calculation for "Type" (non-numerical feature)

In [53]:
# Summary of the categorical 'Type' column: the number of distinct values and
# the values that occur most / least often.
non_numerical_indices = ['Unique elements', 'The most freqent elements',
                         'The least frequent elements']
the_only_column = 'Type'
TOP_N = 3  # how many values to list at each end of the frequency ranking

# value_counts() is ordered from the most to the least frequent value;
# compute it once and reuse it for every statistic below.
type_counts = df[the_only_column].value_counts()

num_unique = len(type_counts)  # the number of unique elements

# Slicing (rather than indexing fixed positions) never raises IndexError when
# the column has fewer than TOP_N distinct values.
most_freq = ', '.join(map(str, type_counts.index[:TOP_N]))  # most frequent elements

# Read the ranking from its tail so the rarest value comes first; slicing from
# the end stays correct for any number of distinct values.
least_freq = ', '.join(map(str, type_counts.index[::-1][:TOP_N]))  # least frequent elements

signature = [num_unique, most_freq, least_freq]
# One-column frame: rows are the statistics above, the column is 'Type'.
description2 = pd.DataFrame({the_only_column: signature}, index=non_numerical_indices)

In [54]:
# Display the summary table for the 'Type' column.
description2

Unnamed: 0,Type
Unique elements,4
The most freqent elements,"Photo, Status, Link"
The least frequent elements,"Video, Link, Status"


## The most popular post in the dataset

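I decided that the popularity of a post can be estimated as number_of_likes + number_of_shares + number_of_comments. The dataset already stores this sum in the `Total Interactions` column (for example, the first row: 4 + 79 + 17 = 100), so the most popular post is the row with the largest `Total Interactions` value, shown below.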

In [57]:
# Keep every row whose 'Total Interactions' equals the column maximum (ties are all returned).
interactions = df['Total Interactions']
df[interactions == interactions.max()]

Unnamed: 0,Page total likes,Type,Category,Post Month,Post Weekday,Post Hour,Paid,Lifetime Post Total Reach,Lifetime Post Total Impressions,Lifetime Engaged Users,Lifetime Post Consumers,Lifetime Post Consumptions,Lifetime Post Impressions by people who have liked your Page,Lifetime Post reach by people who like your Page,Lifetime People who have liked your Page and engaged with your post,comment,like,share,Total Interactions
244,130791,Photo,2,7,3,5,1.0,180480,319133,8072,4010,6242,108752,51456,3316,372,5172.0,790.0,6334


So, the most popular object is a photo with 5172 likes, 790 shares and 372 comments.