In [1]:
# Environment setup commands, left commented out. Uncommenting them installs
# pandas, matplotlib and seaborn and then writes the output of `pip freeze`
# to ../requirements.txt.
# !pip install pandas matplotlib seaborn
# !pip freeze > ../requirements.txt


In [7]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Build the path to the cleaned analyst-ratings CSV (relative to this notebook)
# and read it into the working DataFrame.
dataset_path = os.path.join(
    r"../datasets/cleaned",
    "analyst_ratings_cleaned.csv",
)
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Print column dtypes, non-null counts and memory usage.
df.info()

2024-12-13 11:29:56,622 - INFO - Loading analyst ratings from ../datasets/cleaned\analyst_ratings_cleaned.csv
2024-12-13 11:30:05,353 - INFO - Analyst ratings loaded successfully.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   headline   1407328 non-null  object
 1   url        1407328 non-null  object
 2   publisher  1407328 non-null  object
 3   date       1407328 non-null  object
 4   stock      1407328 non-null  object
dtypes: object(5)
memory usage: 53.7+ MB


In [None]:
# Display the loaded DataFrame as the cell output.
df

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

In [None]:
# Distinct publisher names and how many there are.
# nunique() skips NaN by default, whereas unique() would list it as a value.
unique_publishers = df['publisher'].unique()
unique_publishers_num = df['publisher'].nunique(dropna=True)
(unique_publishers_num, unique_publishers)

In [None]:
# Distinct stock symbols and how many there are.
# nunique() skips NaN by default, whereas unique() would list it as a value.
unique_stocks = df['stock'].unique()
unique_stocks_num = df['stock'].nunique(dropna=True)
(unique_stocks_num, unique_stocks)


In [None]:
# Summary statistics computed by pandas over the DataFrame's columns.
df.describe()

In [None]:
# Count the missing (NaN/None) entries in each column.
df.isna().sum()

In [None]:
# Count rows that are exact repeats of an earlier row
# (the first occurrence of each row is not counted).
df.duplicated(keep='first').sum()

# Statistics summary

In [None]:
# Summary statistics restricted to the headline, url, publisher, date and stock columns.
df.loc[:, ['headline', 'url', 'publisher', 'date', 'stock']].describe()

In [None]:
# Number of articles for each stock symbol, most-covered stock first.
articles_per_stocks = df['stock'].value_counts(sort=True, ascending=False)
articles_per_stocks

In [None]:
# How many stocks appear in exactly one article.
articles_per_stocks.eq(1).sum()

In [None]:
# Articles per publisher as a two-column frame (publisher name, article count),
# ordered from the most to the least active publisher.
articles_per_publisher = (
    df['publisher']
    .value_counts()
    .reset_index()
    .set_axis(['publisher', 'num_articles_published'], axis=1)
)

# The ten most active publishers are simply the first ten rows.
top_publishers = articles_per_publisher.iloc[:10]
top_publishers

In [None]:
# Analyze the type of news reported by top publishers by printing a sample of
# five headlines for each of them.
# `top_publishers` has a plain 0..9 integer index (it comes from reset_index()),
# so the names must be read from its 'publisher' column; iterating the index
# would compare publisher strings against integers and match nothing.
for publisher in top_publishers['publisher']:
    publisher_articles = df[df['publisher'] == publisher]
    print(f"\nArticles by {publisher}:")
    print(publisher_articles['headline'].head(5))

In [None]:
# Number of publishers that published exactly one article.
# Only the count column is compared; testing the whole frame against 1 would
# also compare the publisher-name strings to an integer.
(articles_per_publisher['num_articles_published'] == 1).sum()

In [None]:
# Check if email addresses are used as publisher names: keep the rows whose
# publisher contains '@', then tally the domain part that follows the '@'.
email_publishers = df.loc[df['publisher'].str.contains('@', na=False)]
unique_domains = (
    email_publishers['publisher']
    .str.extract(r'@([\w\.-]+)')[0]
    .value_counts()
)
unique_domains

# Publication Time Analysis

In [None]:
# Parse the mixed-format date strings into timezone-aware UTC timestamps so
# every row is on the same clock.
df['date'] = pd.to_datetime(df['date'], format='mixed', utc=True)

# Find calendar days with no articles at all.
# Both sides are normalized to midnight first: a daily range anchored at the
# earliest raw timestamp keeps that timestamp's time-of-day, and comparing it
# against full timestamps would flag almost every day as missing.
publication_days = df['date'].dt.normalize()
date_range = pd.date_range(
    start=publication_days.min(),
    end=publication_days.max(),
    freq='D',
)
missing_dates = date_range.difference(publication_days)
missing_dates, df['date'].min(), df['date'].max()

In [19]:
# Break each publication timestamp into calendar components so that trends
# can be grouped by year, month, day of month, weekday and hour.
_date_parts = df['date'].dt
df['publication_year'] = _date_parts.year
df['publication_month'] = _date_parts.month
df['publication_day'] = _date_parts.day
df['publication_weekday'] = _date_parts.dayofweek
df['publication_weekday_name'] = _date_parts.day_name()
df['publication_hour'] = _date_parts.hour

In [None]:
# Article count for each publication year, in chronological order.
articles_per_year = (
    df['publication_year']
    .value_counts()
    .sort_index(ascending=True)
)
articles_per_year

In [None]:
# Article count for each calendar month (all years pooled), ordered by month number.
articles_per_month = (
    df['publication_month']
    .value_counts()
    .sort_index(ascending=True)
)
articles_per_month

In [None]:
# Article count for each day of the month, ordered by day number.
articles_per_day = (
    df['publication_day']
    .value_counts()
    .sort_index(ascending=True)
)
articles_per_day

In [None]:
# Article count for each weekday number as produced by `.dt.dayofweek`,
# ordered by that number.
articles_per_weekday = (
    df['publication_weekday']
    .value_counts()
    .sort_index(ascending=True)
)
articles_per_weekday

In [None]:
# Articles per day of the week, labelled by weekday name.
# The result is reindexed into calendar order (Monday..Sunday); sorting the
# index would order the names alphabetically (Friday, Monday, Saturday, ...)
# and scramble every weekday chart built from this series. Weekdays with no
# articles are kept with a count of 0.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
articles_per_weekday_name = (
    df['publication_weekday_name']
    .value_counts()
    .reindex(weekday_order, fill_value=0)
)
articles_per_weekday_name

In [None]:
# Article count for each hour of the day, ordered by hour.
articles_per_hour = (
    df['publication_hour']
    .value_counts()
    .sort_index(ascending=True)
)
articles_per_hour

In [None]:

# Plot the trends as three stacked bar charts on one figure:
# articles per year, per month and per weekday name.
fig, (year_ax, month_ax, weekday_ax) = plt.subplots(nrows=3, ncols=1, figsize=(12, 6))

articles_per_year.plot(kind='bar', title='Articles per Year', ax=year_ax)
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Articles')

articles_per_month.plot(kind='bar', title='Articles per Month', ax=month_ax)
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of Articles')

articles_per_weekday_name.plot(kind='bar', title='Articles per Weekday', ax=weekday_ax)
weekday_ax.set_xlabel('Day of the Week')
weekday_ax.set_ylabel('Number of Articles')

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

In [None]:
# Year x month table of article counts: one row per year, one column per
# month, with 0 where a year/month combination has no articles.
articles_per_month_year = (
    df.groupby(['publication_year', 'publication_month'])
    .size()
    .unstack(level='publication_month', fill_value=0)
)
articles_per_month_year


In [None]:
# Heatmap of the year x month article counts, with each cell annotated by its
# integer count.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(articles_per_month_year, annot=True, fmt="d", cmap="YlGnBu", ax=ax)
ax.set_title('Number of Articles per Month of Each Year')
ax.set_xlabel('Month')
ax.set_ylabel('Year')
plt.show()



In [None]:
# Alternative plot for the number of articles per month of each year:
# one stacked bar per year, with one coloured segment per month.
# The axes are created explicitly and handed to pandas. Without `ax`,
# DataFrame.plot opens its own new figure, which would leave a separately
# created 12x8 figure empty and draw the chart at the default size instead.
fig, ax = plt.subplots(figsize=(12, 8))
articles_per_month_year.plot(kind='bar', stacked=True, colormap='tab20', ax=ax)
ax.set_title('Number of Articles per Month of Each Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Articles')
ax.legend(title='Month')
fig.tight_layout()
plt.show()

In [None]:
# Same three trends as line charts with point markers:
# articles per year, per month and per weekday name.
fig, (year_ax, month_ax, weekday_ax) = plt.subplots(nrows=3, ncols=1, figsize=(12, 6))

articles_per_year.plot(kind='line', marker='o', title='Articles per Year', ax=year_ax)
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Articles')

articles_per_month.plot(kind='line', marker='o', title='Articles per Month', ax=month_ax)
month_ax.set_xlabel('Month')
month_ax.set_ylabel('Number of Articles')

articles_per_weekday_name.plot(kind='line', marker='o', title='Articles per Weekday', ax=weekday_ax)
weekday_ax.set_xlabel('Day')
weekday_ax.set_ylabel('Number of Articles')

# Keep titles and axis labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()


In [None]:
# Line chart of the number of articles published in each year.
ax = articles_per_year.plot.line(marker='o', color='blue')
ax.set_title('Distribution of Articles by years')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Articles')
plt.show()

In [None]:
# Line chart of the number of articles published in each calendar month.
ax = articles_per_month.plot.line(marker='o', color='blue')
ax.set_title('Distribution of Articles by months')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Articles')
plt.show()

In [None]:
# Line chart of the number of articles published on each day of the month.
ax = articles_per_day.plot.line(marker='o', color='purple')
ax.set_title('Distribution of Articles by days of the month')
ax.set_xlabel('day')
ax.set_ylabel('Number of Articles')
plt.show()

In [None]:
# Bar chart of the number of articles published on each weekday (by name).
ax = articles_per_weekday_name.plot.bar(color='skyblue')
ax.set_title('Distribution of Articles by days of the week')
ax.set_xlabel('Weekday')
ax.set_ylabel('Number of Articles')
# Tilt the weekday labels so they do not collide.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the number of articles published in each hour of the day.
fig, ax = plt.subplots(figsize=(12, 6))
articles_per_hour.plot(kind='bar', color='green', ax=ax)
ax.set_title('Distribution of Articles Times by Hour of the Day')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Show the per-hour article counts as a table alongside the bar chart.
articles_per_hour

In [None]:
# Extract the year-month period from each publication timestamp.
# Periods cannot carry a timezone, so the timezone is dropped explicitly
# first; calling to_period on tz-aware values makes pandas discard it
# implicitly and emit a UserWarning. The resulting periods are the same.
df['year_month'] = df['date'].dt.tz_localize(None).dt.to_period('M')

# Group by year-month to get the number of articles published in each month.
publication_counts = df.groupby('year_month').size()
publication_counts

In [None]:
# Line chart of the monthly article counts across the whole date span.
fig, ax = plt.subplots(figsize=(14, 7))
publication_counts.plot(ax=ax)
ax.set_title('Article Publication Frequency Over Time')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Articles')
ax.grid(True)
plt.show()

In [None]:
# Identify spikes in article publications: months whose count lies more than
# two standard deviations above the mean monthly count.
spike_threshold = publication_counts.mean() + 2 * publication_counts.std()
spikes = publication_counts.loc[publication_counts.gt(spike_threshold)]
spikes