In [2]:
import pandas as pd
from urllib.parse import urlparse
from src.loader import load_data
# Specify the path to the csv files
article_path = '../data/data.csv'
traffic_path = '../data/traffic.csv'
rating_path = '../data/rating.csv'

# Load the JSON file into a DataFrame
df = load_data(article_path)
rating_df = load_data(rating_path)
traffic_df = load_data(traffic_path)
df['Domain'] = df['url'].apply(lambda x: urlparse(x).netloc)
# Display the DataFrame
merged_df = pd.merge(df, traffic_df, left_on='Domain', right_on='Domain', how='left')
# Group by 'source_name' and sum the traffic metrics (e.g., 'RefIPs')
most_visited_news_sites = merged_df.groupby('source_name')['RefIPs'].sum().sort_values(ascending=False)
# Convert the Series to a DataFrame
most_visited_news_sites_df = most_visited_news_sites.reset_index()
# Rename the columns for clarity
most_visited_news_sites_df.columns = ['source_name', 'RefIPs']
# Print the top 10 news sites with the highest traffic
print(most_visited_news_sites_df.head(10))


          source_name       RefIPs
0  The Times of India  135680860.0


In [35]:
# Sentiment

# Group data by 'source_name' and 'title_sentiment'
grouped_data = rating_df.groupby(['source_name', 'title_sentiment']).size()

# Calculate mean for each
mean_statistics = grouped_data.groupby(['source_name', 'title_sentiment']).mean()

# Get the top 10 websites with mean for positive
top_positive_websites = mean_statistics.loc[(slice(None), 'Positive')].nlargest(10)

# Get the top 10 websites with mean for neutral
top_neutral_websites = mean_statistics.loc[(slice(None), 'Neutral')].nlargest(10)

# Get the top 10 websites with mean for negative
top_negative_websites = mean_statistics.loc[(slice(None), 'Negative')].nlargest(10)

# Print the top 10 websites based on mean for positive
print("Top 10 Mean for Positive")
print(top_positive_websites)
print()

# Print the top 10 websites based on mean for neutral sentiment
print("Top 10 Mean for Neutral")
print(top_neutral_websites)
print()

# Print the top 10 websites based on mean for negative sentiment
print("Top 10 Mean Negative")
print(top_negative_websites)

Top 10 Mean for Positive
source_name
The Times of India    1145.0
ETF Daily News         992.0
GlobeNewswire          905.0
Forbes                 635.0
Digital Trends         406.0
Android Central        272.0
Business Insider       237.0
Boing Boing            170.0
Marketscreener.com     153.0
Phys.Org               143.0
dtype: float64

Top 10 Mean for Neutral
source_name
ETF Daily News        15194.0
The Times of India     5149.0
GlobeNewswire          4491.0
Globalsecurity.org     2255.0
Forbes                 1933.0
The Punch              1229.0
ABC News               1193.0
BBC News               1185.0
Marketscreener.com     1167.0
Al Jazeera English      927.0
dtype: float64

Top 10 Mean Negative
source_name
The Times of India              1210.0
Business Insider                 890.0
BBC News                         854.0
Globalsecurity.org               747.0
ABC News                         726.0
Al Jazeera English               706.0
ETF Daily News                   560.0

In [26]:
print(traffic_df.columns)


Index(['GlobalRank', 'TldRank', 'Domain', 'TLD', 'RefSubNets', 'RefIPs',
       'IDN_Domain', 'IDN_TLD', 'PrevGlobalRank', 'PrevTldRank',
       'PrevRefSubNets', 'PrevRefIPs'],
      dtype='object')


In [6]:
column = df.columns
column

Index(['article_id', 'source_id', 'source_name', 'author', 'title',
       'description', 'url', 'url_to_image', 'published_at', 'content',
       'category', 'full_content'],
      dtype='object')

In [14]:
top_10_websites = df['source_name'].value_counts().head(10)
print(top_10_websites)

source_name
ETF Daily News        16631
The Times of India     7629
GlobeNewswire          5485
Forbes                 4423
Biztoc.com             3968
BBC News               3342
Globalsecurity.org     3093
Business Insider       2746
ABC News               2188
Marketscreener.com     1948
Name: count, dtype: int64


In [15]:
least_10_websites = df['source_name'].value_counts(ascending=True).head(10)
print(least_10_websites)

source_name
Omnigroup.com            1
Fresno Bee               1
Cs.ru.nl                 1
Labnol.org               1
Theedgemalaysia.com      1
Schwab.com               1
Amd.com                  1
Bubbyandbean.com         1
Thriftydecorchick.com    1
Phpclasses.org           1
Name: count, dtype: int64


In [9]:
top = df.head

<bound method NDFrame.head of         article_id source_id                   source_name  \
0            89541       NaN  International Business Times   
1            89542       NaN                    Prtimes.jp   
2            89543       NaN                      VOA News   
3            89545       NaN            The Indian Express   
4            89547       NaN           The Times of Israel   
...            ...       ...                           ...   
105370      781108       NaN            The Indian Express   
105371      781129       NaN                        Forbes   
105372      781235       NaN                           NPR   
105373      781240       NaN                        Forbes   
105374      781308       NaN                        Forbes   

                                                   author  \
0                                          Paavan MATHEMA   
1                                                     NaN   
2              webdesk@voanews.com (Agence

In [18]:
traffic_df

Unnamed: 0,GlobalRank,TldRank,Domain,TLD,RefSubNets,RefIPs,IDN_Domain,IDN_TLD,PrevGlobalRank,PrevTldRank,PrevRefSubNets,PrevRefIPs
0,1,1,google.com,com,471274,2151358,google.com,com,1,1,471248,2147402
1,2,2,facebook.com,com,465424,2250631,facebook.com,com,2,2,465297,2247135
2,3,3,youtube.com,com,420579,1852995,youtube.com,com,3,3,420483,1848940
3,4,4,twitter.com,com,404985,1757127,twitter.com,com,4,4,404998,1753939
4,5,5,instagram.com,com,364746,1598085,instagram.com,com,5,5,364618,1594892
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999996,485324,kireie.com,com,222,280,kireie.com,com,-1,-1,-1,-1
999996,999997,485325,mt-lock.com,com,222,280,mt-lock.com,com,952633,461429,232,284
999997,999998,485326,pinkwater.com,com,222,280,pinkwater.com,com,-1,-1,-1,-1
999998,999999,485327,soderhomes.com,com,222,280,soderhomes.com,com,-1,-1,-1,-1


In [19]:
traffic_column = traffic_df.columns
traffic_column

Index(['GlobalRank', 'TldRank', 'Domain', 'TLD', 'RefSubNets', 'RefIPs',
       'IDN_Domain', 'IDN_TLD', 'PrevGlobalRank', 'PrevTldRank',
       'PrevRefSubNets', 'PrevRefIPs'],
      dtype='object')

In [20]:
df_sorted_by_unique_visitors = traffic_df.sort_values(by='RefIPs', ascending=False)

# Print the website with the highest number of unique visitors
print(df_sorted_by_unique_visitors[['Domain', 'RefIPs']].head(10))

                  Domain   RefIPs
1           facebook.com  2250631
0             google.com  2151358
2            youtube.com  1852995
3            twitter.com  1757127
4          instagram.com  1598085
5           linkedin.com  1319082
8   googletagmanager.com   863843
10         wordpress.org   835948
9          wikipedia.org   831384
6              apple.com   804726
