
## This notebook explores the distribution of sequence lengths in the dataset, the number of articles per year, and the sequence lengths of the titles and excerpts

In [None]:
import pandas as pd 
# Load the article table from the parquet file (path is relative to the working directory).
df = pd.read_parquet(path="nyt_data.parquet")
# Preview the first five rows as the cell output.
df.head(n=5)

In [68]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def plot_year_dist(df, title="articles for each year", tick_step=5):
    """Draw a bar chart of the number of rows per value of the ``year`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``year`` column; rows are counted per distinct year.
    title : str
        Title shown above the chart.
    tick_step : int
        Label every ``tick_step``-th bar on the x-axis (default 5). Must be a
        positive integer.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    counts = df.groupby("year").size()

    fig, ax = plt.subplots(figsize=(12, 6))
    counts.plot(kind="bar", width=0.8, title=title, ax=ax)

    # The bars sit at positions 0..n-1 (one per distinct year), not at the year
    # values themselves. Set the tick positions and their labels together so
    # each remaining tick is guaranteed to show the year of the bar underneath,
    # instead of relying on how a swapped-in locator interacts with the labels
    # pandas installed (that mapping has differed between matplotlib versions).
    # NOTE(review): if some years are absent from df, every tick_step-th bar is
    # not exactly tick_step calendar years apart -- confirm the data is gap-free.
    tick_positions = list(range(0, len(counts), tick_step))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels([str(counts.index[i]) for i in tick_positions], rotation=90)

    # Show the y-axis in thousands (1000 -> "1K"), rounded to whole thousands.
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y_value, pos: '{:.0f}K'.format(y_value / 1000))
    )

    ax.set_xlabel("Year")
    ax.set_ylabel("Count")
    fig.tight_layout()
    plt.show()

In [69]:
def filter_on_str_length(df, column_key, max_len=1):
    """Return the rows of ``df`` whose string in ``column_key`` is longer than ``max_len``.

    Despite its name, ``max_len`` is used as an exclusive lower bound: a row is
    kept only when ``len(value) > max_len``. Rows for which that comparison is
    not true (including missing values, whose length is undefined) are dropped.
    The input frame itself is not modified.
    """
    lengths = df[column_key].str.len()
    is_long_enough = lengths > max_len
    return df[is_long_enough]

In [70]:
# Rows whose excerpt has more than 1 character (drops missing, empty and single-character excerpts).
df_excerpt = filter_on_str_length(df, column_key="excerpt", max_len=1)
# Rows whose title is longer than 10 characters.
df_title = filter_on_str_length(df, column_key="title", max_len=10)

In [None]:
# Articles per year, restricted to rows that passed the title-length filter.
plot_year_dist(df=df_title, title="News headlines/Year")

In [None]:
# Articles per year, restricted to rows that passed the excerpt-length filter.
plot_year_dist(df=df_excerpt, title="News headlines/Year, with excerpts")

In [None]:
# Fraction of the original rows that survive the title-length filter.
df_title.shape[0] / df.shape[0]
# Same ratio for the excerpt filter (currently disabled):
# df_excerpt.shape[0] / df.shape[0]