### Publisher_analysis
Load and Prepare Data


In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("../datasets/raw_analyst_ratings.csv")

# Display the first few rows to understand the structure
print(df.head())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00 

Count Articles per Publisher


In [3]:
# Count the number of articles per publisher
publisher_counts = df["publisher"].value_counts()

# Display the top publishers
print(publisher_counts)

publisher
Paul Quintaro                      228373
Lisa Levin                         186979
Benzinga Newsdesk                  150484
Charles Gross                       96732
Monica Gerson                       82380
                                    ...  
Shazir Mucklai - Imperium Group         1
Laura Jennings                          1
Eric Martin                             1
Jose Rodrigo                            1
Jeremie Capron                          1
Name: count, Length: 1034, dtype: int64


Analyze the Type of News Reported by Each Publisher


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load dataset
df = pd.read_csv("../datasets/raw_analyst_ratings.csv")

# Preprocess text: Tokenize and remove stopwords
def preprocess_text(text):
    return " ".join([word.lower() for word in text.split() if word.isalnum()])

# Apply preprocessing
df["clean_headline"] = df["headline"].apply(preprocess_text)

# Vectorize the headlines
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["clean_headline"])

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Convert sparse matrix to DataFrame (if necessary, e.g., for small datasets)
# features_df = pd.DataFrame(X.toarray(), columns=feature_names, index=df.index)

# Group by publisher and sum the counts of each word
def get_word_counts(publisher_group):
    # Extract the rows corresponding to this publisher
    word_counts = np.asarray(X[publisher_group.index].sum(axis=0)).flatten()
    return pd.Series(word_counts, index=feature_names)

publisher_word_counts = df.groupby("publisher").apply(get_word_counts).fillna(0)

# Display the top words for each publisher
for publisher in publisher_word_counts.index:
    word_counts = publisher_word_counts.loc[publisher]
    top_words = word_counts.sort_values(ascending=False).head(10)
    print(f"Top words for {publisher}:")
    print(top_words)
    print()


  publisher_word_counts = df.groupby("publisher").apply(get_word_counts).fillna(0)


Top words for 47ertrends:
trends      18
february    18
stock       18
market      18
2012        18
000          0
optune       0
opxa         0
opw          0
opvido       0
Name: 47ertrends, dtype: int64

Top words for AARP:
ways          7
businesses    5
right         5
help          5
small         5
emergency     5
worker        2
home          2
money         2
earn          2
Name: AARP, dtype: int64

Top words for ABNNewswire:
limited       8
nl            4
resources     3
celamin       3
million       3
update        3
companies     3
projects      3
activities    3
holdings      3
Name: ABNNewswire, dtype: int64

Top words for Aakin:
report      2
airport     2
security    2
sony        1
dslr        1
live        1
ups         1
ante        1
latest      1
audiovox    1
Name: Aakin, dtype: int64

Top words for Aaron Jackson.Ed:
week           15
things          9
read            9
reads           6
best            5
financial       4
blogosphere     4
stocks          1
pr

Identify Unique Domains from Email Addresses


In [11]:
import re


# Function to extract domain from email address
def extract_domain(email):
    match = re.search(r"@(\S+)", email)
    return match.group(1) if match else None


# Apply domain extraction
df["domain"] = df["publisher"].apply(extract_domain)

# Count the number of articles per domain
domain_counts = df["domain"].value_counts()

# Display the top domains
print(domain_counts)

domain
benzinga.com              7937
gmail.com                  139
andyswan.com                 5
investdiva.com               2
tothetick.com                2
eosdetroit.io                1
forextraininggroup.com       1
stockmetrix.net              1
Name: count, dtype: int64
