# Task-1: Exploratory Data Analysis (EDA) 

In [15]:
# This file demonstrates how the notebook would look.
# It imports functions from eda_functions.py and runs the analysis.

# -------------------------------------------------
# 1. Import Libraries and Functions
# -------------------------------------------------


import os
import sys
import pandas as pd

# Make the local 'src' package importable from inside a notebook session by
# putting the project root (the parent of the current working directory) at
# the front of sys.path, so that 'import src.*' resolves.
# Change the relative hop below if the notebook lives in another subfolder.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.eda_functions import (load_data, clean_data,
                                 headline_length_stats, 
                                 plot_headline_length,
                                 publisher_counts, 
                                 plot_top_publishers,
                                 save_processed,monthly_trends,
                                 plot_monthly_trends)

# -------------------------------------------------
# 2. File Paths
# -------------------------------------------------
# Input and output CSV locations. The defaults are machine-specific absolute
# Windows paths; set the RAW_RATINGS_FILE / PROCESSED_RATINGS_FILE environment
# variables to point the notebook elsewhere without editing this cell.
raw_file = os.environ.get(
    "RAW_RATINGS_FILE",
    r"D:\Python\Week-1\Data-Week-1\raw_analyst_ratings.csv",
)
out_file = os.environ.get(
    "PROCESSED_RATINGS_FILE",
    r"D:\Python\Week-1\Data-Week-1\processed_analyst_ratings.csv",
)


## Load and Clean the Dataset

In [None]:

# -------------------------------------------------
# 3. Load Dataset
# -------------------------------------------------
# Loading strategy, in order of preference:
#   1. reuse a DataFrame named 'df' that is already in the session,
#   2. load_data(raw_file) from src.eda_functions,
#   3. chunked read with the Python parser engine,
#   4. single read with the Python engine and every column as str.
# After a fallback read (3 or 4), clean_data is attempted once on a
# best-effort basis; a failure there is reported instead of being swallowed.
print("Loading dataset...")

# If df is already present in the notebook session, reuse it to avoid
# reloading a large file.
if isinstance(globals().get('df'), pd.DataFrame):
    print("DataFrame 'df' already in memory. Using existing df.")
else:
    try:
        # Primary (preferred) loading path using the helper from src.
        df = load_data(raw_file)
    except (pd.errors.ParserError, MemoryError, OverflowError) as load_err:
        # Fallback: read in chunks with the Python engine, which is more
        # forgiving than the C tokenizer that raised here.
        print(f"Standard load failed: {load_err}")
        print("Falling back to chunked read using engine='python'...")
        try:
            chunk_reader = pd.read_csv(raw_file, engine='python', chunksize=50000)
            df = pd.concat(chunk_reader, ignore_index=True)
        except Exception as chunk_err:
            # Last resort: read everything as strings. An error raised here
            # is allowed to propagate because no further fallback exists.
            print(f"Chunked read failed: {chunk_err}")
            print("Attempting low-memory read with dtype=str...")
            df = pd.read_csv(raw_file, engine='python', dtype=str)

        # Best-effort cleaning of the fallback result. If clean_data cannot
        # handle this frame, keep the raw dataframe but say so explicitly.
        try:
            df = clean_data(df)
        except Exception as clean_err:
            print(f"Warning: clean_data failed on fallback data ({clean_err}); "
                  "continuing with the raw dataframe.")

print("Dataset loaded. Shape:", df.shape)
display(df.head())


Loading dataset...


ParserError: Error tokenizing data. C error: out of memory

In [None]:

# -------------------------------------------------
# 4. Clean Dataset
# -------------------------------------------------
# Pass the loaded frame through the shared cleaning helper and report the
# resulting shape.
# NOTE(review): the load cell's fallback path may already have called
# clean_data on df -- confirm that clean_data is safe to apply twice.
print("Cleaning dataset...")
df = clean_data(df)
print(f"Cleaned dataset shape: {df.shape}")


## Descriptive Statistics
### Headline Length Statistics

In [None]:

# -------------------------------------------------
# 5. Descriptive Statistics
# -------------------------------------------------
# Summary statistics, per-column null counts, the columns whose null share
# exceeds null_threshold, and headline-length statistics with a plot.
display(df.describe())

# Build the null mask once and reuse it for both the counts and the ratios
# instead of rescanning the whole frame for each expression.
null_mask = df.isna()
print(null_mask.sum())

null_threshold = 0.05
null_fraction = null_mask.mean()
cols_with_nulls = null_fraction[null_fraction > null_threshold].index.tolist()
# The label is derived from null_threshold so the printed percentage cannot
# drift from the value actually used in the filter above.
print(f"Columns with >{null_threshold:.0%} nulls:", cols_with_nulls)
print("Headline Length Statistics:")
print(headline_length_stats(df))

# Plot headline length
plot_headline_length(df)


### Top Publishers

In [None]:

# Publisher counts: print the leading rows of the table, then draw the chart
# using the same cut-off for both.
top_n = 20
print("Top Publishers:")
publisher_table = publisher_counts(df)
print(publisher_table.head(top_n))

plot_top_publishers(df, top_n=top_n)


### Monthly Publication Trends

In [None]:

# -------------------------------------------------
# 6. Publication Date Trends
# -------------------------------------------------
# Print the last rows returned by monthly_trends, then plot the trend.
print("Monthly Trends:")
trend_table = monthly_trends(df)
print(trend_table.tail())

plot_monthly_trends(df)


### Save Processed Dataset

In [None]:

# -------------------------------------------------
# 7. Save Processed Dataset
# -------------------------------------------------
# Hand the current frame to save_processed with out_file as the target,
# then echo that target.
print("Saving processed dataset...")
save_processed(df, out_file)
print(f"Saved at: {out_file}")
