# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import yfinance as yf

# Load the data
# NOTE(review): 'path_to_your_data.csv' looks like a placeholder path --
# confirm it points at the real dataset before running.
df = pd.read_csv('path_to_your_data.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its summary directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would append a stray
# "None" line to the output.
df.info()

# Display the first few rows
print(df.head())

# Basic statistics
print(df.describe())

# Check for missing values (count of null cells per column)
print(df.isnull().sum())

# Headline length analysis: store the character count of every headline in a
# new column and show how those counts are distributed.
df['headline_length'] = df['headline'].str.len()
length_fig, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['headline_length'], ax=length_ax)
length_ax.set_title('Distribution of Headline Lengths')
length_ax.set_xlabel('Length')
length_ax.set_ylabel('Count')
plt.show()

# Article volume per publisher: value_counts() yields one row per publisher,
# largest count first, so the leading ten rows are the ten busiest publishers.
publisher_counts = df['publisher'].value_counts()
publisher_fig, publisher_ax = plt.subplots(figsize=(12, 6))
publisher_counts.head(10).plot.bar(ax=publisher_ax)
publisher_ax.set_title('Top 10 Publishers by Article Count')
publisher_ax.set_xlabel('Publisher')
publisher_ax.set_ylabel('Number of Articles')
# Slant the publisher names so long labels do not overlap.
plt.setp(publisher_ax.get_xticklabels(), rotation=45, ha='right')
publisher_fig.tight_layout()
plt.show()

# Publication timing: parse the date column into datetimes, derive each
# article's weekday name, and chart how many articles fall on each weekday.
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.day_name()
# value_counts() orders the weekdays by article count (largest first),
# not by calendar order.
day_counts = df['day_of_week'].value_counts()
weekday_fig, weekday_ax = plt.subplots(figsize=(10, 6))
day_counts.plot.bar(ax=weekday_ax)
weekday_ax.set_title('Article Publication by Day of Week')
weekday_ax.set_xlabel('Day of Week')
weekday_ax.set_ylabel('Number of Articles')
weekday_fig.tight_layout()
plt.show()

# Perform basic sentiment analysis
def get_sentiment(text):
    """Return the TextBlob polarity score of *text*.

    Args:
        text: The text to score. Expected to be a string; any other value
            (for example the float NaN pandas uses for a missing cell) is
            treated as missing.

    Returns:
        The ``sentiment.polarity`` value TextBlob computes for *text*, or
        NaN when *text* is not a string.
    """
    # TextBlob rejects non-string input with a TypeError, which would abort
    # an entire Series.apply() on the first missing headline. Returning NaN
    # keeps the row but marks its sentiment as unknown rather than neutral.
    if not isinstance(text, str):
        return float('nan')
    return TextBlob(text).sentiment.polarity

# Score every headline with get_sentiment and keep the result in a new column.
df['sentiment'] = df['headline'].apply(get_sentiment)

# Histogram of the scores with a kernel density estimate overlaid.
sentiment_fig, sentiment_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['sentiment'], kde=True, ax=sentiment_ax)
sentiment_ax.set_title('Distribution of Headline Sentiment')
sentiment_ax.set_xlabel('Sentiment Score')
sentiment_ax.set_ylabel('Count')
plt.show()

# Analyze sentiment by publisher
# Restrict the plot to the publishers with the most articles so the data
# matches the "Top Publishers" title. (Sorting rows alphabetically by
# publisher and taking the first 50 rows would only show whichever
# publishers happen to sort first, regardless of their article volume.)
top_publishers = df['publisher'].value_counts().head(10).index
top_publisher_rows = df[df['publisher'].isin(top_publishers)]

plt.figure(figsize=(12, 6))
# order= keeps the boxes sorted from most to fewest articles.
sns.boxplot(x='publisher', y='sentiment', data=top_publisher_rows,
            order=list(top_publishers))
plt.title('Sentiment Distribution by Top Publishers')
plt.xlabel('Publisher')
plt.ylabel('Sentiment Score')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Time series of publication frequency
# Count articles per calendar day. normalize() resets the time-of-day part of
# each timestamp to midnight, so articles published at different times on the
# same day are counted together; for date-only data it changes nothing.
articles_per_day = df['date'].dt.normalize().value_counts().sort_index()
articles_per_day.plot(figsize=(12, 6))
plt.title('Publication Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.tight_layout()
plt.show()