** **
# Load Packages
** **

In [5]:
import numpy as np 
import pandas as pd 

In [6]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This depends on the google.colab package, so the cell only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


** **
# Load Data
** **

In [13]:
# Read the raw dataset from the mounted Drive folder and preview the first rows.
dataset_path = '/content/drive/MyDrive/Research/Dataset/Dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,No.,Company Name,Country,Industry,Published Year,Last Revised,Link,Document Name,Main Text,Comment
0,1,Sony Group,25-09-2018,,26-09-2018,01-04-2021,https://www.sony.com/en/SonyInfo/csr_report/hu...,Sony Group AI Ethics Guidelines,AI Engagement within Sony Group\nThrough the u...,
1,2,Samsung,,,,,https://www.samsungsds.com/en/digital_responsi...,AI Ethics Principles,AI Ethics Principles\nAI is a rapidly developi...,
2,4,Accenture,Ireland,Consulting,30-03-2021,31-03-2021,https://www.accenture.com/content/dam/accentur...,Responsible AI From principles to practice,Responsible AI\nFrom principles to practice\nC...,Addtional Details: https://www.accenture.com/u...
3,5,Acer,Taiwan,,,,,,,
4,6,Adobe,United States of America,,,,https://www.adobe.com/content/dam/cc/en/ai-eth...,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\nAt Adobe, our...",Addtional Details: https://www.adobe.com/conte...


In [12]:
# Discard the metadata columns; only the company name, document name and
# main text remain for the text analysis.
metadata_columns = [
    'No.', 'Country', 'Industry', 'Published Year',
    'Last Revised', 'Link', 'Comment',
]
text_data = data.drop(columns=metadata_columns)
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text
0,Sony Group,Sony Group AI Ethics Guidelines,AI Engagement within Sony Group\nThrough the u...
1,Samsung,AI Ethics Principles,AI Ethics Principles\nAI is a rapidly developi...
2,Accenture,Responsible AI From principles to practice,Responsible AI\nFrom principles to practice\nC...
3,Acer,,
4,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\nAt Adobe, our..."


In [15]:
# Exclude firms without an AI Ethics or Responsible AI guideline document:
# any row with a missing value in one of the remaining columns is removed.
text_data = text_data.dropna(axis='index', how='any')
text_data

Unnamed: 0,Company Name,Document Name,Main Text
0,Sony Group,Sony Group AI Ethics Guidelines,AI Engagement within Sony Group\nThrough the u...
1,Samsung,AI Ethics Principles,AI Ethics Principles\nAI is a rapidly developi...
2,Accenture,Responsible AI From principles to practice,Responsible AI\nFrom principles to practice\nC...
4,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\nAt Adobe, our..."
8,Alphabet,Responsible AI practices,Responsible AI practices\nThe development of A...


** **
# Step 2: Data Cleaning
** **

Since the goal of this analysis is to perform topic modeling, we focus solely on the text of each company's guideline document and drop the other metadata columns.

## Remove punctuation/lower casing

Next, let’s perform some simple preprocessing on the content of the `Main Text` column to make it more amenable to analysis and to obtain more reliable results. To do that, we’ll use a regular expression to remove common punctuation (commas, periods, exclamation marks and question marks), and then lowercase the text.

In [22]:
# Load the regular expression library
import re

# Remove commas, periods, exclamation marks and question marks.
# The pattern is a raw string: writing '\.' in a plain string literal is an
# invalid escape sequence that newer Python versions warn about, and inside a
# character class '.' is already literal, so no backslash is needed.
text_data['main_text_processed'] = text_data['Main Text'].map(lambda x: re.sub(r'[,.!?]', '', x))

# Convert the text to lowercase
text_data['main_text_processed'] = text_data['main_text_processed'].map(lambda x: x.lower())

# Preview the first rows of the processed text
text_data['main_text_processed'].head()

0    ai engagement within sony group\nthrough the u...
1    ai ethics principles\nai is a rapidly developi...
2    responsible ai\nfrom principles to practice\nc...
4    adobe’s commitment to ai ethics\nat adobe our ...
8    responsible ai practices\nthe development of a...
Name: main_text_processed, dtype: object

## Tokenize words and further clean-up text

Let’s tokenize each document into a list of words, removing punctuation and other unnecessary characters along the way.

In [23]:
# Preview the frame, which now includes the main_text_processed column, before tokenizing.
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text,main_text_processed
0,Sony Group,Sony Group AI Ethics Guidelines,AI Engagement within Sony Group\nThrough the u...,ai engagement within sony group\nthrough the u...
1,Samsung,AI Ethics Principles,AI Ethics Principles\nAI is a rapidly developi...,ai ethics principles\nai is a rapidly developi...
2,Accenture,Responsible AI From principles to practice,Responsible AI\nFrom principles to practice\nC...,responsible ai\nfrom principles to practice\nc...
4,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\nAt Adobe, our...",adobe’s commitment to ai ethics\nat adobe our ...
8,Alphabet,Responsible AI practices,Responsible AI practices\nThe development of A...,responsible ai practices\nthe development of a...


In [24]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Lazily tokenize every text in ``sentences``.

    Each item is converted to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one list of tokens
    is yielded per input item, in input order.

    Note: per the gensim documentation, ``deacc=True`` strips accent marks
    from tokens. Punctuation is discarded by the tokenizer itself, regardless
    of this flag.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# NOTE: `data` is rebound here from the raw DataFrame loaded earlier to a plain
# list of strings; cells that expect the DataFrame cannot be re-run after this one.
data = text_data['main_text_processed'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Sanity check: show the first 30 tokens of the first document
print(data_words[0][:30])

['ai', 'engagement', 'within', 'sony', 'group', 'through', 'the', 'utilization', 'of', 'artificial', 'intelligence', 'ai', 'sony', 'aims', 'to', 'contribute', 'to', 'the', 'development', 'of', 'peaceful', 'and', 'sustainable', 'society', 'while', 'delivering', 'kando', 'sense', 'of', 'excitement']
