** **
# Step 1: Load Packages
** **

In [20]:
import numpy as np 
import pandas as pd 
import requests
import io

** **
# Step 2: Load Data
** **

In [22]:
# Download the filtered guideline dataset (CSV) from the project's GitHub repository
url = ("https://raw.githubusercontent.com/Kensuzuki95/Corporate_AI_Ethics_Guideline_Analysis/main/Dataset/Dataset_Filtered.csv")

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx) here instead of letting
# read_csv try to parse an error page as if it were the dataset.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

# The raw bytes are decoded as UTF-8 and parsed from an in-memory text buffer
dataset = pd.read_csv(io.StringIO(download.decode('utf-8')))

dataset.head()

Unnamed: 0,No.,Company Name,Country,Industry,Published Year,Last Revised,Link,Document Name,Main Text,Comment
0,1,Accenture,Ireland,Consulting,03-30-2021,03-30-2021,https://www.accenture.com/content/dam/accentur...,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,Addtional Details: https://www.accenture.com/u...
1,2,Adobe,United States of America,Software,,,https://www.adobe.com/content/dam/cc/en/ai-eth...,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",Addtional Details: https://www.adobe.com/conte...
2,3,Alphabet,United States of America,Software,,,https://ai.google/responsibilities/responsible...,Responsible AI practices,Responsible AI practices\r\nThe development of...,Addtional Information: https://ai.google/princ...
3,4,Amazon,United States of America,Software,,,https://d1.awsstatic.com/responsible-machine-l...,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",
4,5,Atos,France,Consulting,,,https://atos.net/en/lp/cybersecurity-magazine-...,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,


## Clean the Dataset Format

In [30]:
# Check for unnecessary columns
dataset.columns

Index(['No.', 'Company Name', 'Country', 'Industry', 'Published Year',
       'Last Revised', 'Link', 'Document Name', 'Main Text', 'Comment'],
      dtype='object')

In [35]:
# Drop the metadata fields that are not needed for topic modeling, leaving only
# 'Company Name', 'Document Name', and 'Main Text'. drop() returns a new frame,
# so the original `dataset` is left untouched.
text_data = dataset.drop(
    columns=[
        'No.',
        'Country',
        'Industry',
        'Published Year',
        'Last Revised',
        'Link',
        'Comment',
    ]
)

# Summary of the remaining columns, followed by a preview of the first rows
text_data.info()
text_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   49 non-null     object
 1   Document Name  49 non-null     object
 2   Main Text      49 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


Unnamed: 0,Company Name,Document Name,Main Text
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o..."
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,..."
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...


** **
# Step 3: Data Cleaning
** **

Since the goal of this analysis is to perform topic modeling, we focus solely on the text of each document and drop the other metadata columns.

## Remove punctuation/lower casing

Next, let’s perform some simple preprocessing on the content of the `Main Text` column to make it more amenable to analysis and to produce more reliable results. To do that, we’ll use a regular expression to remove punctuation and then lowercase the text.

In [36]:
# Load the regular expression library
import re

# Punctuation characters stripped from the text. A raw string is used because
# '\.' in a normal string literal is an invalid escape sequence (it triggers a
# warning on recent Python versions); inside a character class '.' needs no escape.
punctuation_pattern = re.compile(r'[,.!?]')

# Remove punctuation and convert to lowercase in one pass using pandas'
# vectorized string methods. Unlike a per-row lambda, these propagate missing
# values as NaN instead of raising on them.
text_data['main_text_processed'] = (
    text_data['Main Text']
    .str.replace(punctuation_pattern, '', regex=True)
    .str.lower()
)

# Show the first rows of the processed text
text_data['main_text_processed'].head()

0    responsible ai\r\nfrom principles to practice\...
1    adobe’s commitment to ai ethics\r\nat adobe ou...
2    responsible ai practices\r\nthe development of...
3    responsible use of machine learning\r\nat aws ...
4    ai is a broad topic encompassing many differen...
Name: main_text_processed, dtype: object

## Tokenize words and further clean-up text

Let’s tokenize each document into a list of words, removing punctuation and unnecessary characters altogether.

In [37]:
# Preview the frame, which now includes the 'main_text_processed' column
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text,main_text_processed
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,responsible ai\r\nfrom principles to practice\...
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",adobe’s commitment to ai ethics\r\nat adobe ou...
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...,responsible ai practices\r\nthe development of...
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",responsible use of machine learning\r\nat aws ...
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,ai is a broad topic encompassing many differen...


In [38]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` into a list of word tokens.

    Each item is coerced to ``str`` and passed to gensim's ``simple_preprocess``
    with ``deacc=True``. This is a generator: one token list is yielded per
    input item, in input order.
    """
    for text in sentences:
        # deacc=True strips accent marks from tokens; punctuation is discarded
        # by simple_preprocess's tokenizer itself.
        tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
        yield tokens

# Pull the cleaned documents out of the frame as a plain Python list
data = text_data['main_text_processed'].tolist()

# Tokenize every document eagerly: one list of tokens per document
data_words = [tokens for tokens in sent_to_words(data)]

# Sanity check: show the first 30 tokens of the first document
print(data_words[0][:30])

['responsible', 'ai', 'from', 'principles', 'to', 'practice', 'contents', 'responsible', 'ai', 'in', 'practice', 'essential', 'but', 'not', 'easy', 'growing', 'imperative', 'practitioners', 'insights', 'the', 'realities', 'of', 'responsible', 'ai', 'moving', 'from', 'principles', 'to', 'practice', 'the']
