In [1]:
import pandas as pd

# 1. Load the raw data (no validation)
df_raw = pd.read_csv("../data/raw/raw_analyst_ratings.csv")

# 2. Try to parse the date column
parsed_dates = pd.to_datetime(df_raw['date'], errors='coerce')

# 3. Find and print problematic dates
bad_dates = df_raw.loc[parsed_dates.isnull(), 'date'].unique()
print("Unparseable dates:", bad_dates)

# 4. Drop rows with unparseable dates
df_clean = df_raw[parsed_dates.notnull()].copy()
df_clean['date'] = pd.to_datetime(df_clean['date'], errors='coerce')

# 5. (Optional) Drop rows with missing values in critical columns
df_clean = df_clean.dropna(subset=['headline', 'stock', 'date'])

# 6. Save the cleaned data
df_clean.to_csv("../data/processed/clean_analyst_ratings.csv", index=False)
print("Cleaned data saved to data/processed/clean_analyst_ratings.csv")


Unparseable dates: ['2020-05-22 00:00:00' '2020-05-21 00:00:00' '2020-05-18 00:00:00' ...
 '2012-02-18 00:00:00' '2010-02-07 00:00:00' '2017-08-26 00:00:00']
Cleaned data saved to data/processed/clean_analyst_ratings.csv
