In [29]:
import pandas as pd
import numpy as np
import os
import glob

In [13]:
from datasets import load_dataset

ds = load_dataset("gopalkalpande/bbc-news-summary")

In [14]:
ds

DatasetDict({
    train: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 2224
    })
})

In [19]:
from datasets import Dataset

In [24]:
df = ds["train"].to_pandas()

In [25]:
df

Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...,...
2219,business,India opens skies to competition..India will a...,India will allow domestic commercial airlines ...
2220,business,Yukos bankruptcy 'not US matter'..Russian auth...,Yukos says a US court was entitled to declare ...
2221,business,Survey confirms property slowdown..Government ...,House prices were 11.8% higher on the year in ...
2222,business,High fuel prices hit BA's profits..British Air...,"Rod Eddington, BA's chief executive, said the ..."


In [27]:
os.listdir('BBC News Summary/Summaries')

['business', 'entertainment', 'politics', 'sport', 'tech']

In [28]:
articles_path = 'BBC News Summary/News Articles'
summaries_path = 'BBC News Summary/Summaries'
categories_list = ['politics', 'sport', 'tech', 'entertainment', 'business']

In [30]:
def read_files_from_folders(articles_path, summaries_path, categories_list=['tech', 'sport'], encoding = "ISO-8859-1"):
    articles = []
    summaries = []
    categories = []
    for category in categories_list:
        article_paths = glob.glob(os.path.join(articles_path, category, '*.txt'), recursive=True)
        summary_paths = glob.glob(os.path.join(summaries_path, category, '*.txt'), recursive=True)
        
        if len(article_paths) != len(summary_paths):
            print('number of files is not equal')
            return
        for i in range(len(article_paths)):
            categories.append(category)
            with open(article_paths[i], mode='r', encoding = encoding) as file:
                articles.append(file.read())
            
            with open(summary_paths[i], mode='r', encoding = encoding) as file:
                summaries.append(file.read())
    return articles, summaries, categories

In [31]:
articles, summaries, categories = read_files_from_folders(articles_path, summaries_path, categories_list)

In [36]:
df1 = pd.DataFrame({'Articles':articles, 'Summaries':summaries, 'File_path':categories})


In [37]:
df1

Unnamed: 0,Articles,Summaries,File_path
0,Labour plans maternity pay rise\n\nMaternity p...,She said her party would boost maternity pay i...,politics
1,Watchdog probes e-mail deletions\n\nThe inform...,All e-mails are subject to the freedom of info...,politics
2,Hewitt decries 'career sexism'\n\nPlans to ext...,Ms Hewitt also announced a new drive to help w...,politics
3,Labour chooses Manchester\n\nThe Labour Party ...,The Labour Party will hold its 2006 autumn con...,politics
4,Brown ally rejects Budget spree\n\nChancellor ...,"But Mr Balls, a prospective Labour MP, said he...",politics
...,...,...,...
2220,Trial begins of Spain's top banker\n\nThe tria...,Both executives helped Mr Botin orchestrate Sp...,business
2221,UK economy ends year with spurt\n\nThe UK econ...,"Simon Rubinsohn, chief economist at Gerrard, s...",business
2222,HealthSouth ex-boss goes on trial\n\nThe forme...,Several former HealthSouth employees have alre...,business
2223,Euro firms miss out on optimism\n\nMore than 9...,"Possibly as a result, the worry about low-cost...",business


In [38]:
result_df = pd.concat([df, df1], ignore_index=True)

In [39]:
result_df

Unnamed: 0,File_path,Articles,Summaries
0,politics,Budget to set scene for election..Gordon Brown...,- Increase in the stamp duty threshold from £6...
1,politics,Army chiefs in regiments decision..Military ch...,"""They are very much not for the good and will ..."
2,politics,Howard denies split over ID cards..Michael How...,Michael Howard has denied his shadow cabinet w...
3,politics,Observers to monitor UK election..Ministers wi...,The report said individual registration should...
4,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g..."
...,...,...,...
4444,business,Trial begins of Spain's top banker\n\nThe tria...,Both executives helped Mr Botin orchestrate Sp...
4445,business,UK economy ends year with spurt\n\nThe UK econ...,"Simon Rubinsohn, chief economist at Gerrard, s..."
4446,business,HealthSouth ex-boss goes on trial\n\nThe forme...,Several former HealthSouth employees have alre...
4447,business,Euro firms miss out on optimism\n\nMore than 9...,"Possibly as a result, the worry about low-cost..."


In [41]:
result_df.groupby("File_path").size()

File_path
business         1020
entertainment     772
politics          834
sport            1021
tech              802
dtype: int64

In [42]:
df2 = pd.read_excel("dataset.xlsx")

In [43]:
df2

Unnamed: 0.1,Unnamed: 0,id,human_summary,publication,author,date,year,month,theme,content
0,0,17283,In successfully seeking a temporary halt in th...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,politics,WASHINGTON — Congressional Republicans have...
1,0,17284,Officers put her in worse danger some months l...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,crime,"After the bullet shells get counted, the blood..."
2,0,17285,The film striking appearance had been created ...,New York Times,Margalit Fox,2017-01-06,2017.0,1.0,entertainment,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,0,17286,The year was only days old when the news came ...,New York Times,William McDonald,2017-04-10,2017.0,4.0,entertainment,"Death may be the great equalizer, but it isn’t..."
4,0,17287,If North Korea conducts a test in coming month...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,politics,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...,...,...,...,...,...,...,...,...
995,259,18460,more than 500 rescuers tried frantically to se...,New York Times,Gerry Mullany,2017-03-14,2017.0,3.0,accidents,HONG KONG — Hundreds of pilot whales that s...
996,259,18461,Signing balotelli was not just a way to garner...,New York Times,Rory Smith,2017-02-10,2017.0,2.0,sports,"NICE, France — Rivère accepts the complim..."
997,259,18462,Although there was no evidence of that the bun...,New York Times,Jack Ewing,2017-02-09,2017.0,2.0,business,FRANKFURT — Germans who never really warmed...
998,259,18463,He questioned why any n. b. a. free agent woul...,New York Times,Scott Cacciola,2017-02-10,2017.0,2.0,sports,Charles Oakley has strong feelings about compe...


In [47]:
tempdf = df2.drop(columns="id")

In [51]:
df2 = tempdf.drop(columns="Unnamed: 0")

In [52]:
tempdf = df2.drop(columns="publication")

In [53]:
df2 = tempdf.drop(columns="date")

In [54]:
tempdf = df2.drop(columns="year")

In [55]:
df2 = tempdf.drop(columns="month")

In [56]:
tempdf = df2.drop(columns="author")

In [58]:
df3 =tempdf.rename(columns={'human_summary': 'Summaries', 'theme': 'File_path','content': 'Articles'})

In [59]:
df3

Unnamed: 0,Summaries,File_path,Articles
0,In successfully seeking a temporary halt in th...,politics,WASHINGTON — Congressional Republicans have...
1,Officers put her in worse danger some months l...,crime,"After the bullet shells get counted, the blood..."
2,The film striking appearance had been created ...,entertainment,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,The year was only days old when the news came ...,entertainment,"Death may be the great equalizer, but it isn’t..."
4,If North Korea conducts a test in coming month...,politics,"SEOUL, South Korea — North Korea’s leader, ..."
...,...,...,...
995,more than 500 rescuers tried frantically to se...,accidents,HONG KONG — Hundreds of pilot whales that s...
996,Signing balotelli was not just a way to garner...,sports,"NICE, France — Rivère accepts the complim..."
997,Although there was no evidence of that the bun...,business,FRANKFURT — Germans who never really warmed...
998,He questioned why any n. b. a. free agent woul...,sports,Charles Oakley has strong feelings about compe...


In [60]:
result_df2 = pd.concat([result_df, df3], ignore_index=True)

In [62]:
result_df2.groupby("File_path").size()

File_path
accidents           4
architecture        4
art                 2
business         1228
crime             110
entertainment     925
environment         1
health              2
law                41
lifestyle          78
politics         1158
science            25
sport            1021
sports             30
tech              802
technology         18
dtype: int64

In [63]:
result_df2.to_csv("merge_df.csv")