In [1]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'CS221/Project/'
FOLDERNAME = 'CS221/CS221Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

Mounted at /content/drive


In [11]:
import os
import re
import pandas as pd

# Part 0: define the names of directories, output files, etc.

In [12]:
directory_text = '/content/drive/My Drive/CS221/CS221Project/Data/BBC News Summary/News Articles/'
directory_summary = '/content/drive/My Drive/CS221/CS221Project/Data/BBC News Summary/Summaries/'
output_csv = 'bbc_data_clean_summary.csv'

For the output csv files:
- bbc_data_original.csv : both the *text* and *summary* are using the original data from the txt files.
- bbc_data_clean_summary.csv : *text* is using the original data; while the *summary* has been transformed to lowercase and removed the special characters and remain A-Z, a-z, 0-9, and space.
- bbc_data_clean_text_and_summary.csv : both the *text* and *summary* have been transformed to lowercase and removed the special characters and remain A-Z, a-z, 0-9, and space.

Note that if you need to get different combinations, play with the `clean_text` parameter in the `generate_data()` method, and you can set the parameters of it at `merge_multiple_data_frames()` method.

# Part 1 : Collecting data

In [13]:
def remove_special_characters_and_to_lowercase(s):
    # remove special characters except A-Z, a-z, 0-9, and space.
    s = re.sub(r'[^A-Za-z0-9 ]+', '', s)
    # to lowercase
    return s.lower()

In [17]:
def generate_data(folder, sect, clean_text=True):

    print("Collecting data.... ", end='')
    data = []
    count = 0
    
    for file in os.listdir(folder):
        if file == sect:
            for file in os.listdir(folder + sect):
                try:
                    text = ''
                    name = file
                    myfile = open(folder+sect+'/'+file, "r")
                    text = myfile.read()
                    if clean_text == True:
                        text = remove_special_characters_and_to_lowercase(text)
                    mylist = [name, text]
                    count +=1
                    data.append(mylist)
                except:
                    continue

    print("collected!")
    print(str(count) + " text files found in "+ sect + " folder.")
    print("Data generated")
    return (data, count)

# Part 2 and 3 : Match Data and convert to csv file

In [18]:
def match_data(data_text, data_summary, count, name):
    df_text = pd.DataFrame(data_text, columns = ['File', 'Text'])
    df_sum = pd.DataFrame(data_summary, columns = ['File', 'Summary'])
    df_final = pd.merge(df_text, df_sum, on='File')
    return df_final

In [20]:
def merge_multiple_data_frames(clean_text=False, clean_summary=True):
    print("Creating dataframe.....", end='')
    df = pd.DataFrame(columns = ['File','Text', 'Summary'])

    print("Merging dataframes.....", end='')
    dirs = ['business', 'entertainment', 'politics', 'sport', 'tech']

    for dir in dirs:
        # set the clean_data = true / false here
        data_text, count = generate_data(directory_text, dir, clean_text)
        data_summary, count = generate_data(directory_summary, dir, clean_summary)
        df_matched = match_data(data_text, data_summary, count, dir)
        df = df.append(df_matched, ignore_index=True)

    df.to_csv(output_csv)

    print('csv saved!')

In [21]:
merge_multiple_data_frames(clean_text=False, clean_summary=True)

Creating dataframe.....Merging dataframes.....Collecting data.... collected!
510 text files found in business folder.
Data generated
Collecting data.... collected!
510 text files found in business folder.
Data generated
Collecting data.... collected!
386 text files found in entertainment folder.
Data generated
Collecting data.... collected!
386 text files found in entertainment folder.
Data generated
Collecting data.... collected!
417 text files found in politics folder.
Data generated
Collecting data.... collected!
417 text files found in politics folder.
Data generated
Collecting data.... collected!
510 text files found in sport folder.
Data generated
Collecting data.... collected!
511 text files found in sport folder.
Data generated
Collecting data.... collected!
401 text files found in tech folder.
Data generated
Collecting data.... collected!
401 text files found in tech folder.
Data generated
csv saved!
