** **
# Step 1: Load Packages
** **

In [1]:
import numpy as np 
import pandas as pd 
import requests
import io

** **
# Step 2: Load Data
** **

In [2]:
# Download the filtered dataset (CSV) from the project's GitHub repository
url = "https://raw.githubusercontent.com/Kensuzuki95/Corporate_AI_Ethics_Guideline_Analysis/main/Dataset/Dataset_Filtered.csv"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV below.
response = requests.get(url, timeout=30)
response.raise_for_status()
download = response.content

dataset = pd.read_csv(io.StringIO(download.decode('utf-8')))

dataset.head()

Unnamed: 0,No.,Company Name,Country,Industry,Published Year,Last Revised,Link,Document Name,Main Text,Comment
0,1,Accenture,Ireland,Consulting,03-30-2021,03-30-2021,https://www.accenture.com/content/dam/accentur...,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,Addtional Details: https://www.accenture.com/u...
1,2,Adobe,United States of America,Software,,,https://www.adobe.com/content/dam/cc/en/ai-eth...,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",Addtional Details: https://www.adobe.com/conte...
2,3,Alphabet,United States of America,Software,,,https://ai.google/responsibilities/responsible...,Responsible AI practices,Responsible AI practices\r\nThe development of...,Addtional Information: https://ai.google/princ...
3,4,Amazon,United States of America,Software,,,https://d1.awsstatic.com/responsible-machine-l...,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",
4,5,Atos,France,Consulting,,,https://atos.net/en/lp/cybersecurity-magazine-...,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,


## Clean the Dataset Format

In [3]:
# Check for unnecessary columns
dataset.columns

Index(['No.', 'Company Name', 'Country', 'Industry', 'Published Year',
       'Last Revised', 'Link', 'Document Name', 'Main Text', 'Comment'],
      dtype='object')

In [4]:
# Metadata columns that are not needed for the text analysis; only the company
# name, document name and main text are kept.
metadata_columns = ['No.', 'Country', 'Industry', 'Published Year', 'Last Revised', 'Link', 'Comment']
text_data = dataset.drop(columns=metadata_columns)

# Summarise the remaining columns, then preview the first rows
text_data.info()
text_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   49 non-null     object
 1   Document Name  49 non-null     object
 2   Main Text      49 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


Unnamed: 0,Company Name,Document Name,Main Text
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o..."
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,..."
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...


** **
# Step 3: Data Cleaning
** **

Since this analysis is based solely on the text of each document, we keep only the text columns and drop the other metadata columns (as done in the previous section).

## Remove punctuation/lower casing

Next, let’s perform some simple preprocessing on the content of the `Main Text` column to make it more amenable to analysis and to obtain more reliable results. To do that, we’ll use a regular expression to remove punctuation, then lowercase the text and split it into tokens.

In [13]:
# Load the regular expression library and NLTK
import re
import nltk

# Punctuation characters stripped before tokenization. A raw string is used and
# the dot is left unescaped: inside a character class "." is already literal,
# and "\." in a normal string literal is an invalid escape sequence.
PUNCTUATION_PATTERN = re.compile(r'[,.!?()]')

# Remove punctuation
text_data['Main_Text_Processed'] = text_data['Main Text'].map(
    lambda text: PUNCTUATION_PATTERN.sub('', text)
)

# Convert the text to lowercase
text_data['Main_Text_Processed'] = text_data['Main_Text_Processed'].str.lower()

# Tokenize each processed document into a list of word tokens.
# NOTE(review): recent NLTK releases may look up the 'punkt_tab' resource instead
# of 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')
text_data['Main_Text_Tokenized'] = text_data['Main_Text_Processed'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Tokenize words and further clean-up text

The text has already been tokenized into lists of words above. Next, let’s clean up the tokens further by removing stopwords and then stemming the remaining words.

In [7]:
# Preview the frame, including the processed and tokenized text columns
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text,Main_Text_Processed,Main_Text_Tokenized
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,responsible ai\r\nfrom principles to practice\...,"[responsible, ai, from, principles, to, practi..."
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",adobe’s commitment to ai ethics\r\nat adobe ou...,"[adobe, ’, s, commitment, to, ai, ethics, at, ..."
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...,responsible ai practices\r\nthe development of...,"[responsible, ai, practices, the, development,..."
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",responsible use of machine learning\r\nat aws ...,"[responsible, use, of, machine, learning, at, ..."
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,ai is a broad topic encompassing many differen...,"[ai, is, a, broad, topic, encompassing, many, ..."


In [9]:
# Download the NLTK stopword corpus and load the English stopword list
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, stopword_list=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.
    stopword_list : iterable of str, optional
        Words to filter out. Defaults to the module-level ``stopwords`` list.

    Returns
    -------
    list of str
        The tokens of ``text`` in their original order, with stopwords removed.
    """
    if stopword_list is None:
        stopword_list = stopwords
    # Build the set once per call so each membership test is O(1) instead of a
    # linear scan of the list for every token.
    stopword_set = set(stopword_list)
    return [token for token in text if token not in stopword_set]

# Filter the stopwords out of every tokenized document
tokenized_documents = text_data['Main_Text_Tokenized']
text_data['Main_text_without_stopwords'] = tokenized_documents.apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
# Import the Porter stemmer class from the NLTK library
from nltk.stem.porter import PorterStemmer

# Create the stemmer instance used by the stemming step below
porter_stemmer = PorterStemmer()

# Define a function for stemming
def stemming(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. Defaults to the module-level ``porter_stemmer``.

    Returns
    -------
    list of str
        One stemmed token per input token, in the original order.
    """
    if stemmer is None:
        stemmer = porter_stemmer
    return [stemmer.stem(word) for word in text]

# Stem every stopword-free token list, then preview the result
filtered_documents = text_data['Main_text_without_stopwords']
text_data['Main_text_stemmed'] = filtered_documents.apply(stemming)
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text,Main_Text_Processed,Main_Text_Tokenized,Main_text_without_stopwords,Main_text_stemmed
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,responsible ai\r\nfrom principles to practice\...,"[responsible, ai, from, principles, to, practi...","[responsible, ai, principles, practice, conten...","[respons, ai, principl, practic, content, resp..."
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",adobe’s commitment to ai ethics\r\nat adobe ou...,"[adobe, ’, s, commitment, to, ai, ethics, at, ...","[adobe, ’, commitment, ai, ethics, adobe, purp...","[adob, ’, commit, ai, ethic, adob, purpos, ser..."
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...,responsible ai practices\r\nthe development of...,"[responsible, ai, practices, the, development,...","[responsible, ai, practices, development, ai, ...","[respons, ai, practic, develop, ai, creat, new..."
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",responsible use of machine learning\r\nat aws ...,"[responsible, use, of, machine, learning, at, ...","[responsible, use, machine, learning, aws, pro...","[respons, use, machin, learn, aw, proud, suppo..."
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,ai is a broad topic encompassing many differen...,"[ai, is, a, broad, topic, encompassing, many, ...","[ai, broad, topic, encompassing, many, differe...","[ai, broad, topic, encompass, mani, differ, fa..."


** **
# Step 4: Measure Text Similarity
** **



## Principle 1

In [55]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [56]:
# Reference text for Principle 1. It is placed at index 0 of `sentences`, ahead
# of the documents, which follow in the row order of `text_data`.
principle_1_text = 'It should be recognized that AI technologies do not necessarily, per se, ensure human and environmental and ecosystem flourishing. Furthermore, none of the processes related to the AI system life cycle shall exceed what is necessary to achieve legitimate aims or objectives and should be appropriate to the context. In the event of possible occurrence of any harm to human beings, human rights and fundamental freedoms, communities and society at large or the environment and ecosystems, the implementation of procedures for risk assessment and the adoption of measures in order to preclude the occurrence of such harm should be ensured.\nThe choice to use AI systems and which AI method to use should be justified in the following ways: (a) the AI method chosen should be appropriate and proportional to achieve a given legitimate aim; (b) the AI method chosen should not infringe upon the foundational values captured in this document, in particular, its use must not violate or abuse human rights; and (c) the AI method should be appropriate to the context and should be based on rigorous scientific foundations. In scenarios where decisions are understood to have an impact that is irreversible or difficult to reverse or may involve life and death decisions, final human determination should apply. In particular, AI systems should not be used for social scoring or mass surveillance purposes.'
sentences = [principle_1_text] + text_data['Main Text'].tolist()

In [57]:
# Encode the reference text (index 0) and every document into embedding vectors.
# NOTE(review): the model presumably truncates inputs longer than its maximum
# sequence length (384 word pieces per the all-mpnet-base-v2 model card), so long
# documents may be represented only by their opening section - confirm.
sentence_embeddings = model.encode(sentences)

In [58]:
# Sanity check: one embedding row per entry in `sentences` (reference text + documents)
sentence_embeddings.shape

(50, 768)

In [59]:
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
# Compare the reference embedding (row 0) against every document embedding (rows 1 onward)
reference_embedding = sentence_embeddings[:1]
document_embeddings = sentence_embeddings[1:]
similarity_matrix = cosine_similarity(reference_embedding, document_embeddings)

# The matrix has a single row; flatten it into a plain list with one score per document
results = similarity_matrix[0].tolist()
len(results)

49

In [61]:
# Attach the scores to the documents. results[i] belongs to row i of text_data
# because `sentences` was built from text_data['Main Text'] in row order, with
# the reference text at index 0 excluded from the comparison targets.
text_data['Similarity_Score'] = results
text_data.head()

Unnamed: 0,Company Name,Document Name,Main Text,Main_Text_Processed,Main_Text_Tokenized,Main_text_without_stopwords,Main_text_stemmed,Similarity_Score
0,Accenture,Responsible AI From principles to practice,Responsible AI\r\nFrom principles to practice\...,responsible ai\r\nfrom principles to practice\...,"[responsible, ai, from, principles, to, practi...","[responsible, ai, principles, practice, conten...","[respons, ai, principl, practic, content, resp...",0.717751
1,Adobe,Adobe’s Commitment to AI Ethics,"Adobe’s Commitment to AI Ethics\r\nAt Adobe, o...",adobe’s commitment to ai ethics\r\nat adobe ou...,"[adobe, ’, s, commitment, to, ai, ethics, at, ...","[adobe, ’, commitment, ai, ethics, adobe, purp...","[adob, ’, commit, ai, ethic, adob, purpos, ser...",0.566864
2,Alphabet,Responsible AI practices,Responsible AI practices\r\nThe development of...,responsible ai practices\r\nthe development of...,"[responsible, ai, practices, the, development,...","[responsible, ai, practices, development, ai, ...","[respons, ai, practic, develop, ai, creat, new...",0.630594
3,Amazon,Responsible Use of Machine Learning,"Responsible Use of Machine Learning\r\nAt AWS,...",responsible use of machine learning\r\nat aws ...,"[responsible, use, of, machine, learning, at, ...","[responsible, use, machine, learning, aws, pro...","[respons, use, machin, learn, aw, proud, suppo...",0.596675
4,Atos,The Atos Blueprint for Responsible AI,AI is a broad topic encompassing many differen...,ai is a broad topic encompassing many differen...,"[ai, is, a, broad, topic, encompassing, many, ...","[ai, broad, topic, encompassing, many, differe...","[ai, broad, topic, encompass, mani, differ, fa...",0.709803
5,Capgemini,Our Code of Ethics for AI,Our Code of Ethics for AI\r\nAI is a general-p...,our code of ethics for ai\r\nai is a general-p...,"[our, code, of, ethics, for, ai, ai, is, a, ge...","[code, ethics, ai, ai, general-purpose, techno...","[code, ethic, ai, ai, general-purpos, technolo...",0.664051
6,Cisco,The Cisco Responsible AI Framework,The Cisco Responsible\r\nAI Framework\r\nSecur...,the cisco responsible\r\nai framework\r\nsecur...,"[the, cisco, responsible, ai, framework, secur...","[cisco, responsible, ai, framework, security, ...","[cisco, respons, ai, framework, secur, design,...",0.660496
7,Facebook,Facebook’s five pillars of Responsible AI,Facebook’s five pillars of Responsible AI\r\nA...,facebook’s five pillars of responsible ai\r\na...,"[facebook, ’, s, five, pillars, of, responsibl...","[facebook, ’, five, pillars, responsible, ai, ...","[facebook, ’, five, pillar, respons, ai, ai, t...",0.668631
8,FUJIFILM,Fujifilm Group AI Policy,Fujifilm Group AI Policy\r\nThe Fujifilm Group...,fujifilm group ai policy\r\nthe fujifilm group...,"[fujifilm, group, ai, policy, the, fujifilm, g...","[fujifilm, group, ai, policy, fujifilm, group,...","[fujifilm, group, ai, polici, fujifilm, group,...",0.600612
9,Fujitsu Ltd.,Fujitsu Group AI Commitment,Fujitsu Group AI Commitment\r\nProgress and in...,fujitsu group ai commitment\r\nprogress and in...,"[fujitsu, group, ai, commitment, progress, and...","[fujitsu, group, ai, commitment, progress, inn...","[fujitsu, group, ai, commit, progress, innov, ...",0.649162


## Principle 2