In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
internet=pd.read_csv("./Data/datacamp Internet News and Consumer Engagement.csv")

## Data dictionary

|    | Variable                        | Description                                                                  |
|---:|:--------------------------------|:-----------------------------------------------------------------------------|
|  0 | source_id                       | publisher unique identifier                                                  |
|  1 | source_name                     | human-readable publisher name                                                |
|  2 | author                          | article author                                                               |
|  3 | title                           | article headline                                                             |
|  4 | description                     | article short description                                                    |
|  5 | url                             | article URL from publisher website                                           |
|  6 | url_to_image                    | url to main image associated with the article                                |
|  7 | published_at                    | exact time and date of publishing the article                                |
|  8 | content                         | unformatted content of the article truncated to 260 characters               |
|  9 | top_article                     | value indicating if article was listed as a top article on publisher website |
| 10 | engagement_reaction_count       | users reactions count for posts on Facebook involving article URL            |
| 11 | engagement_comment_count        | users comments count for posts on Facebook involving article URL             |
| 12 | engagement_share_count          | users shares count for posts on Facebook involving article URL               |
| 13 | engagement_comment_plugin_count | Users comments count for Facebook comment plugin on article website          |

[Source](https://www.kaggle.com/szymonjanowski/internet-articles-data-with-users-engagement) of dataset.

In [8]:
internet.head(2)

Unnamed: 0,source_id,source_name,author,title,description,url,url_to_image,published_at,content,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
0,reuters,Reuters,Reuters Editorial,NTSB says Autopilot engaged in 2018 California...,"""The National Transportation Safety Board said...",https://www.reuters.com/article/us-tesla-crash...,https://s4.reutersmedia.net/resources/r/?m=02&...,2019-09-03T16:22:20Z,"""WASHINGTON (Reuters) - The National Transport...",0.0,0.0,0.0,2528.0,0.0
1,the-irish-times,The Irish Times,Eoin Burke-Kennedy,Unemployment falls to post-crash low of 5.2%,Latest monthly figures reflect continued growt...,https://www.irishtimes.com/business/economy/un...,https://www.irishtimes.com/image-creator/?id=1...,2019-09-03T10:32:28Z,"""The States jobless rate fell to 5.2 per cent ...",0.0,6.0,10.0,2.0,0.0


In [9]:
internet.shape

(1428, 14)

In [11]:
internet.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   source_id                        1428 non-null   object 
 1   source_name                      1428 non-null   object 
 2   author                           1317 non-null   object 
 3   title                            1428 non-null   object 
 4   description                      1424 non-null   object 
 5   url                              1427 non-null   object 
 6   url_to_image                     1327 non-null   object 
 7   published_at                     1427 non-null   object 
 8   content                          1262 non-null   object 
 9   top_article                      1426 non-null   float64
 10  engagement_reaction_count        1414 non-null   float64
 11  engagement_comment_count         1414 non-null   float64
 12  engagement_share_cou

In [10]:
internet.describe()

Unnamed: 0,top_article,engagement_reaction_count,engagement_comment_count,engagement_share_count,engagement_comment_plugin_count
count,1426.0,1414.0,1414.0,1414.0,1414.0
mean,0.112903,395.947666,152.398868,229.954031,0.015559
std,0.316585,3018.573037,1126.310461,1331.978488,0.404904
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,1.0,0.0
50%,0.0,1.0,0.0,7.0,0.0
75%,0.0,33.75,10.0,44.0,0.0
max,1.0,60117.0,22147.0,36720.0,15.0


In [12]:
internet.isnull().sum()

source_id                            0
source_name                          0
author                             111
title                                0
description                          4
url                                  1
url_to_image                       101
published_at                         1
content                            166
top_article                          2
engagement_reaction_count           14
engagement_comment_count            14
engagement_share_count              14
engagement_comment_plugin_count     14
dtype: int64

In [24]:
post_with_wam_shape=internet['engagement_share_count'].max()
publisher=internet[internet['engagement_share_count']==post_with_wam_shape]['source_id']
author=internet[internet['engagement_share_count']==post_with_wam_shape]['author']
print(publisher)
print(author)

203    bbc-news
Name: source_id, dtype: object
203    BBC News
Name: author, dtype: object


In [42]:
top=internet.sort_values(by=['engagement_share_count']).head(5)
top5_publishers=top['source_id'].tolist()
top5_authors=top['author'].tolist()

In [46]:
data['publishers']=top5_publishers
data['authors']=top5_authors
print(data)

{'publishers': ['al-jazeera-english', 'abc-news', 'reuters', 'the-irish-times', 'newsweek'], 'authors': ['Al Jazeera', 'The Associated Press', 'Brendan Pierson', 'Barry Roche', 'Dan Cancian']}


In [49]:
df = pd.DataFrame(data)
df

Unnamed: 0,publishers,authors
0,al-jazeera-english,Al Jazeera
1,abc-news,The Associated Press
2,reuters,Brendan Pierson
3,the-irish-times,Barry Roche
4,newsweek,Dan Cancian


The Top 5  publishers and authors

- 📊 **Visualize**: Create two words clouds for the title and description of the articles to find the most popular words. Make sure to remove stop words!

In [51]:
def common_member(a, b):
	result = [i for i in a if i in b]
	return result

a = [1, 2, 3, 4, 5]
b = [5, 6, 7, 8]

print("The common elements in the two lists are: ")
print(common_member(a, b))


The common elements in the two lists are: 
[5]


In [60]:
d = {'word': ['he'], 'number': [3]}
words = pd.DataFrame(data=d)
words

Unnamed: 0,word,number
0,he,3


In [61]:
words['word']=

0    he
Name: word, dtype: object