<a href="https://colab.research.google.com/github/Kensuzuki95/Corporate_AI_Ethics_Guideline_Analysis/blob/main/AI_Ethics_Guideline_Classification_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

** **
# Step 1: Load Packages
** **

In [1]:
import numpy as np 
import pandas as pd 
import requests
import io

** **
# Step 2: Create Training Data
** **

## Step 2-1: Download Data

In [2]:
# Download the UNESCO AI Ethics Principles as a CSV file from the GitHub repository.
url = "https://raw.githubusercontent.com/Kensuzuki95/Corporate_AI_Ethics_Guideline_Analysis/main/Dataset/UNESCO_AI_Ethics_Principles.csv"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of handing an error page to read_csv.
response.raise_for_status()
download = response.content

# Decode the payload explicitly as UTF-8 and parse it into a DataFrame.
training_data = pd.read_csv(io.StringIO(download.decode('utf-8')))

# Schema summary, followed by a preview of the first rows as the cell output.
training_data.info()
training_data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   No.             10 non-null     int64 
 1   Principle Name  10 non-null     object
 2   Content         10 non-null     object
dtypes: int64(1), object(2)
memory usage: 368.0+ bytes


Unnamed: 0,No.,Principle Name,Content
0,1,Proportionality and Do No Harm,It should be recognized that AI technologies d...
1,2,Safety and security,"Unwanted harms (safety risks), as well as vuln..."
2,3,Fairness and non-discrimination,AI actors should promote social justice and sa...
3,4,Sustainability,The development of sustainable societies relie...
4,5,"Right to Privacy, and Data Protection","Privacy, a right essential to the protection o..."
5,6,Human oversight and determination,Member States should ensure that it is always ...
6,7,Transparency and explainability,The transparency and explainability of AI syst...
7,8,Responsibility and accountability,"AI actors and Member States should respect, pr..."
8,9,Awareness and literacy,Learning about the impact of AI systems should...
9,10,Multi-stakeholder and adaptive governance and ...,International law and national sovereignty mus...


## Step 2-2: Preprocessing

In [4]:
# Load the regular expression library
import re
import nltk


# Remove the punctuation characters , . ! ? ( ) and convert the text to lowercase.
# The pattern is a raw string: a plain literal containing "\." is an invalid
# escape sequence, and "." needs no escaping inside a character class anyway.
training_data['Content_Processed'] = (
    training_data['Content']
    .str.replace(r'[,.!?()]', '', regex=True)
    .str.lower()
)

# Tokenizer data: recent NLTK releases load 'punkt_tab' while older ones load
# 'punkt', so fetch both to keep this cell working on a fresh kernel.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Split each cleaned text into a list of word tokens.
training_data['Content_Tokenized'] = training_data['Content_Processed'].map(nltk.word_tokenize)

# Uncomment to preview the first rows of the training data:
# training_data.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,No.,Principle Name,Content,Content_Processed,Content_Tokenized
0,1,Proportionality and Do No Harm,It should be recognized that AI technologies d...,it should be recognized that ai technologies d...,"[it, should, be, recognized, that, ai, technol..."
1,2,Safety and security,"Unwanted harms (safety risks), as well as vuln...",unwanted harms safety risks as well as vulnera...,"[unwanted, harms, safety, risks, as, well, as,..."
2,3,Fairness and non-discrimination,AI actors should promote social justice and sa...,ai actors should promote social justice and sa...,"[ai, actors, should, promote, social, justice,..."
3,4,Sustainability,The development of sustainable societies relie...,the development of sustainable societies relie...,"[the, development, of, sustainable, societies,..."
4,5,"Right to Privacy, and Data Protection","Privacy, a right essential to the protection o...",privacy a right essential to the protection of...,"[privacy, a, right, essential, to, the, protec..."


In [5]:
# Fetch the stopword corpus first: without it the lookup below raises a
# LookupError on a fresh kernel.
nltk.download('stopwords')

# English stopword list consulted by remove_stopwords.
stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, stopword_list=None):
    """Return the tokens of ``text`` that are not stopwords.

    Parameters
    ----------
    text : iterable
        Tokens to filter (hashable values such as strings).
    stopword_list : iterable, optional
        Words to drop. When omitted, the module-level ``stopwords`` list is used.

    Returns
    -------
    list
        The tokens of ``text`` in their original order, stopwords removed.
    """
    # Build a set once per call so every membership test is a hash lookup
    # instead of a scan over the whole stopword list.
    excluded = set(stopwords if stopword_list is None else stopword_list)
    return [token for token in text if token not in excluded]

# Filter every tokenized text and store the result in a new column.
training_data['Content_without_stopwords'] = training_data['Content_Tokenized'].apply(remove_stopwords)
# Uncomment to preview the new column:
# training_data.head()

Unnamed: 0,No.,Principle Name,Content,Content_Processed,Content_Tokenized,Content_without_stopwords
0,1,Proportionality and Do No Harm,It should be recognized that AI technologies d...,it should be recognized that ai technologies d...,"[it, should, be, recognized, that, ai, technol...","[recognized, ai, technologies, necessarily, pe..."
1,2,Safety and security,"Unwanted harms (safety risks), as well as vuln...",unwanted harms safety risks as well as vulnera...,"[unwanted, harms, safety, risks, as, well, as,...","[unwanted, harms, safety, risks, well, vulnera..."
2,3,Fairness and non-discrimination,AI actors should promote social justice and sa...,ai actors should promote social justice and sa...,"[ai, actors, should, promote, social, justice,...","[ai, actors, promote, social, justice, safegua..."
3,4,Sustainability,The development of sustainable societies relie...,the development of sustainable societies relie...,"[the, development, of, sustainable, societies,...","[development, sustainable, societies, relies, ..."
4,5,"Right to Privacy, and Data Protection","Privacy, a right essential to the protection o...",privacy a right essential to the protection of...,"[privacy, a, right, essential, to, the, protec...","[privacy, right, essential, protection, human,..."


In [6]:
# Import the Porter stemmer class from the nltk library
from nltk.stem.porter import PorterStemmer

# Create a single stemmer instance; the stemming function below uses it
porter_stemmer = PorterStemmer()

# Stem every token of a tokenized text
def stemming(text, stemmer=None):
    """Return the stem of every token in ``text``.

    Parameters
    ----------
    text : iterable
        Tokens to stem.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply. When omitted, the module-level ``porter_stemmer``
        instance is used.

    Returns
    -------
    list
        One stemmed token per input token, in the original order.
    """
    if stemmer is None:
        stemmer = porter_stemmer
    return [stemmer.stem(word) for word in text]

# Stem each stopword-free token list, then preview the resulting frame.
training_data['Content_Stemmed'] = training_data['Content_without_stopwords'].apply(stemming)
training_data.head()

Unnamed: 0,No.,Principle Name,Content,Content_Processed,Content_Tokenized,Content_without_stopwords,Content_Stemmed
0,1,Proportionality and Do No Harm,It should be recognized that AI technologies d...,it should be recognized that ai technologies d...,"[it, should, be, recognized, that, ai, technol...","[recognized, ai, technologies, necessarily, pe...","[recogn, ai, technolog, necessarili, per, se, ..."
1,2,Safety and security,"Unwanted harms (safety risks), as well as vuln...",unwanted harms safety risks as well as vulnera...,"[unwanted, harms, safety, risks, as, well, as,...","[unwanted, harms, safety, risks, well, vulnera...","[unwant, harm, safeti, risk, well, vulner, att..."
2,3,Fairness and non-discrimination,AI actors should promote social justice and sa...,ai actors should promote social justice and sa...,"[ai, actors, should, promote, social, justice,...","[ai, actors, promote, social, justice, safegua...","[ai, actor, promot, social, justic, safeguard,..."
3,4,Sustainability,The development of sustainable societies relie...,the development of sustainable societies relie...,"[the, development, of, sustainable, societies,...","[development, sustainable, societies, relies, ...","[develop, sustain, societi, reli, achiev, comp..."
4,5,"Right to Privacy, and Data Protection","Privacy, a right essential to the protection o...",privacy a right essential to the protection of...,"[privacy, a, right, essential, to, the, protec...","[privacy, right, essential, protection, human,...","[privaci, right, essenti, protect, human, dign..."
