# Import and create dataframe containing official objectives for each SDGs 
This page contains all the algorithms we used to increase the number of text in our initial database :

- Natural Language Generation (NLG) - Markov Chain
- Generate synonyms of each SDG keywords and get its definition with Wikipedia API
- Presentation of a procedure to delete most common word in each SDG class text and none-sense word 

In [6]:
# SDG_Objectives contains all the targets for each SDG 
SDG_Obj = pd.read_csv('SDG_Objectives.csv',sep=";")

In [7]:
# Fill Nan cells by 0
SDG_Obj = SDG_Obj.fillna(0)

# Shuffle order to mix SDGs numbers
random.seed(30)
SDG_Obj = SDG_Obj.sample(frac=1)

In [8]:
# Change float format to int format 
cols=[i for i in SDG_Obj.columns if i not in ["Text"]]
for col in cols:
    SDG_Obj[col]=pd.to_numeric(SDG_Obj[col], downcast='integer')
    
# Check
SDG_Obj.dtypes
SDG_Obj.tail(2)

Unnamed: 0,ID,Text,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16,SDG_17_
468,469,While there has been an increase in the number...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
549,550,"11.7 By 2030, provide universal access to safe...",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


## Natural Language Generation (Markov Chain)

In [9]:
# Select text for the 17 SDGs categories

SDG_1 = SDG_Obj['SDG_1']==1
SDG1 = SDG_Obj[SDG_1]
SDG1 = SDG1.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_2 = SDG_Obj['SDG_2']==1
SDG2 = SDG_Obj[SDG_2]
SDG2 = SDG2.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_3 = SDG_Obj['SDG_3']==1
SDG3 = SDG_Obj[SDG_3]
SDG3 = SDG3.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_4 = SDG_Obj['SDG_4']==1
SDG4 = SDG_Obj[SDG_4]
SDG4 = SDG4.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_5 = SDG_Obj['SDG_5']==1
SDG5 = SDG_Obj[SDG_5]
SDG5 = SDG5.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_6 = SDG_Obj['SDG_6']==1
SDG6 = SDG_Obj[SDG_6]
SDG6 = SDG6.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_7 = SDG_Obj['SDG_7']==1
SDG7 = SDG_Obj[SDG_7]
SDG7 = SDG7.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_8 = SDG_Obj['SDG_8']==1
SDG8 = SDG_Obj[SDG_8]
SDG8 = SDG8.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_9 = SDG_Obj['SDG_9']==1
SDG9 = SDG_Obj[SDG_9]
SDG9 = SDG9.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_10 = SDG_Obj['SDG_10']==1
SDG10 = SDG_Obj[SDG_10]
SDG10 = SDG10.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_11 = SDG_Obj['SDG_11']==1
SDG11 = SDG_Obj[SDG_11]
SDG11 = SDG11.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_12 = SDG_Obj['SDG_12']==1
SDG12 = SDG_Obj[SDG_12]
SDG12 = SDG12.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_13 = SDG_Obj['SDG_13']==1
SDG13 = SDG_Obj[SDG_13]
SDG13 = SDG13.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_14 = SDG_Obj['SDG_14']==1
SDG14 = SDG_Obj[SDG_14]
SDG14 = SDG14.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_15 = SDG_Obj['SDG_15']==1
SDG15 = SDG_Obj[SDG_15]
SDG15 = SDG15.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_16 = SDG_Obj['SDG_16']==1
SDG16 = SDG_Obj[SDG_16]
SDG16 = SDG16.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

SDG_17_ = SDG_Obj['SDG_17_']==1
SDG17 = SDG_Obj[SDG_17_]
SDG17 = SDG17.drop(['ID', 'SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7',
                 'SDG_8','SDG_9','SDG_10','SDG_11','SDG_12', 'SDG_13','SDG_14',
                 'SDG_15','SDG_16','SDG_17_'], axis=1)

## Generate text for each SDG

### SDG 1

In [10]:
import markovify
# Apply Markov Chain to original text

text_model = markovify.NewlineText(SDG1.Text, state_size = 1)

# Create a fonction "f" with make_sentence (Markovify) to genete text 
def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_1 = f()
#print(sentences_1)

# convert list to dataframe
S1 = pd.DataFrame(sentences_1) 
S1.columns = ['Text']

# Assign 1 to all SGD1 texts
S1['SDG_1'] = 1

# 0 for others SDGs
for newcol in ['SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S1[newcol]= 0

### SDG 2

In [11]:
text_model = markovify.NewlineText(SDG2.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_2 = f()
#print(sentences_2)

# convert list to dataframe
S2 = pd.DataFrame(sentences_2) 
S2.columns = ['Text']

# 0 for others SDGs
for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S2[newcol]= 0
    
# Assign 1 to all SGD2 texts
S2['SDG_2'] = 1

### SDG 3

In [12]:
text_model = markovify.NewlineText(SDG3.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_3 = f()
#print(sentences_3)

S3 = pd.DataFrame(sentences_3) 
S3.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S3[newcol]= 0
    
S3['SDG_3'] = 1

### SDG 4

In [13]:
text_model = markovify.NewlineText(SDG4.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_4 = f()
#print(sentences_4)

S4 = pd.DataFrame(sentences_4)
S4.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S4[newcol]= 0
    
S4['SDG_4'] = 1

### SDG 5

In [14]:
text_model = markovify.NewlineText(SDG5.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_5 = f()
#print(sentences_5)

S5 = pd.DataFrame(sentences_5) 
S5.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S5[newcol]= 0
    
S5['SDG_5'] = 1

### SDG 6

In [15]:
text_model = markovify.NewlineText(SDG6.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_6 = f()
#print(sentences_6)

S6 = pd.DataFrame(sentences_6) 
S6.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S6[newcol]= 0
    
S6['SDG_6'] = 1

### SDG 7

In [16]:
text_model = markovify.NewlineText(SDG7.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_7 = f()
#print(sentences_7)

S7 = pd.DataFrame(sentences_7)
S7.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S7[newcol]= 0
    
S7['SDG_7'] = 1

### SDG 8

In [17]:
text_model = markovify.NewlineText(SDG8.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_8 = f()
#print(sentences_8)

S8 = pd.DataFrame(sentences_8)
S8.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S8[newcol]= 0
    
S8['SDG_8'] = 1

### SDG 9

In [18]:
text_model = markovify.NewlineText(SDG9.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_9 = f()
#print(sentences_9)

S9 = pd.DataFrame(sentences_9)
S9.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S9[newcol]= 0
    
S9['SDG_9'] = 1

### SDG 10

In [19]:
text_model = markovify.NewlineText(SDG10.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_10 = f()
#print(sentences_10)

S10 = pd.DataFrame(sentences_10)
S10.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S10[newcol]= 0
    
S10['SDG_10'] = 1

### SDG 11

In [20]:
text_model = markovify.NewlineText(SDG11.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_11 = f()
#print(sentences_11)

S11 = pd.DataFrame(sentences_11)
S11.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S11[newcol]= 0
    
S11['SDG_11'] = 1

### SDG 12

In [21]:
text_model = markovify.NewlineText(SDG12.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_12 = f()
#print(sentences_12)

S12 = pd.DataFrame(sentences_12)
S12.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S12[newcol]= 0
    
S12['SDG_12'] = 1

### SDG 13

In [22]:
text_model = markovify.NewlineText(SDG13.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_13 = f()
#print(sentences_13)

S13 = pd.DataFrame(sentences_13) 
S13.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S13[newcol]= 0
    
S13['SDG_13'] = 1

### SDG 14

In [23]:
text_model = markovify.NewlineText(SDG14.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_14 = f()
#print(sentences_14)

S14 = pd.DataFrame(sentences_14)
S14.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S14[newcol]= 0
    
S14['SDG_14'] = 1

### SDG 15

In [24]:
text_model = markovify.NewlineText(SDG15.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_15 = f()
#print(sentences_15)

S15 = pd.DataFrame(sentences_15)
S15.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S15[newcol]= 0
    
S15['SDG_15'] = 1

### SDG 16

In [25]:
text_model = markovify.NewlineText(SDG16.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_16 = f()
#print(sentences_16)

S16 = pd.DataFrame(sentences_16)
S16.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S16[newcol]= 0
    
S16['SDG_16'] = 1

### SDG 17

In [26]:
text_model = markovify.NewlineText(SDG17.Text, state_size = 1)

def f():
    result = [] 
    for i in range(300): # Generate 300 sentences 
        result.append(text_model.make_sentence())
    return result

sentences_17 = f()
#print(sentences_17)

S17 = pd.DataFrame(sentences_17) 
S17.columns = ['Text']

for newcol in ['SDG_1','SDG_2','SDG_3','SDG_4','SDG_5','SDG_6','SDG_7', 'SDG_8','SDG_9','SDG_10',
   'SDG_11','SDG_12', 'SDG_13','SDG_14','SDG_15','SDG_16','SDG_17_']:
    S17[newcol]= 0
    
S17['SDG_17_'] = 1

### Concataining generated texts in one dataframe

In [27]:
# Concat all the SGDs new text in a single dataframe 
New_text = pd.concat([S1, S2, S3, S4, S5, S6, S7, S8, S9, S10, S11, S12, S13, S14, S15, S16, S17], axis=0)
New_text["ID"] = np.arange(len(New_text))

In [28]:
# shuffle dataframe before split it in train test 
New_text = New_text.sample(frac=1)

# Split New_text in train and test (80/20) sample (Sklearn function)
#train, test = train_test_split(New_text, test_size=0.2)

In [29]:
New_text['Text'] = New_text['Text'].astype(str)

In [30]:
cols=[i for i in New_text.columns if i not in ["Text"]]
for col in cols:
    New_text[col]=pd.to_numeric(New_text[col], downcast='integer')

#### Concatane SDG_objectives to Markov chain generated texts :

In [34]:
frames = [SDG_Obj, New_text]
res = pd.concat(frames)

In [35]:
# shuffle dataframe before split it in train test 
#res = res.sample(frac=1)

In [36]:
res["ID"] = np.arange(len(res))

### Process to delete most common words between SDGs and none-sense words 
See script SDGs Text Analysis for word occurence for each SDG

In [38]:
# SDG1
SDG_1 = res['SDG_1']==1
SDG1 = res[SDG_1]

In [39]:
# Words to be delete 
remove_words = ['extreme', 'countries', 'world', 'population', 'development', 'including',
                'paper', 'standard', 'national','well', 'basic', 'trends','africa', 'years',
                'persons','declined', 'reduction', 'oecd','global','OECD','estimates','also',
                'across','reduce','since','large','many','Africa','women']

In [40]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [41]:
pat

'\\b(?:extreme|countries|world|population|development|including|paper|standard|national|well|basic|trends|africa|years|persons|declined|reduction|oecd|global|OECD|estimates|also|across|reduce|since|large|many|Africa|women)\\b'

In [42]:
SDG1['Text'] = SDG1['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [43]:
import numpy as np
import nltk
import collections
import re
from nltk.corpus import stopwords
#from gensim.models import Word2Vec

#Remove everythings exepct alphabet 
SDG1['clean_text'] = SDG1['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG1['clean_text'] = SDG1['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG1['clean_text'] =SDG1['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_1 = SDG1['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_1 = tokenized_doc_1.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stop

In [44]:
list = tokenized_doc_1.tolist()

new_list = []
for words in list:
    new_list += words

In [45]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('poverty', 692), ('social', 234), ('children', 161), ('protection', 154), ('living', 144), ('income', 125), ('people', 115), ('vulnerable', 91), ('poor', 89), ('child', 81), ('inequality', 72), ('benefits', 69), ('level', 57), ('levels', 56), ('access', 55), ('regions', 52), ('families', 52), ('agenda', 51), ('dimensions', 51), ('services', 50), ('support', 50), ('policy', 48), ('progress', 47), ('economic', 46), ('programmes', 46), ('cash', 46), ('monetary', 46), ('data', 45), ('significant', 45), ('receive', 45), ('forms', 44), ('adequate', 44), ('provide', 43), ('provides', 39), ('indicators', 39)]


In [46]:
# SDG2
SDG_2 = res['SDG_2']==1
SDG2 = res[SDG_2]

In [47]:
# Words to be delete 
remove_words = ['million', 'countries', 'world', 'poverty', 'sustainable', 'people',
               'access','children','development','systems','small','policy', 'cent',
               'resilient','international','ensure','also','related','security','Targets'
               'Climate','address','women','Change','global','agenda']

In [48]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [49]:
SDG2['Text'] = SDG2['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [50]:
#Remove everythings exepct alphabet 
SDG2['clean_text'] = SDG2['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG2['clean_text'] = SDG2['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG2['clean_text'] =SDG2['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_2 = SDG2['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_2 = tokenized_doc_2.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [51]:
list = tokenized_doc_2.tolist()

new_list = []
for words in list:
    new_list += words

In [52]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('food', 679), ('agriculture', 313), ('agricultural', 264), ('hunger', 236), ('including', 190), ('malnutrition', 160), ('resources', 127), ('production', 124), ('nutrition', 120), ('markets', 119), ('targets', 118), ('productivity', 107), ('efforts', 106), ('farmers', 101), ('climate', 99), ('rural', 99), ('water', 97), ('land', 94), ('genetic', 94), ('progress', 93), ('national', 92), ('productive', 92), ('producers', 92), ('many', 90), ('scale', 86), ('risk', 85), ('policies', 83), ('change', 81), ('achieving', 80), ('needed', 79), ('extreme', 78), ('overweight', 76), ('increased', 74), ('health', 73), ('increase', 71)]


In [53]:
# SDG3
SDG_3 = res['SDG_3']==1
SDG3 = res[SDG_3]

In [54]:
# Words to be delete 
remove_words = ['people', 'countries', 'well', 'access', 'million', 'communicable',
               'development','child','coverage','women','children','estimated', 'OECD',
               'oecd','income','data','effective','universal','however','However'
               'globally','rights','also','among','including','girls','Globally','Target','target',
               'Targets','targets','Africa','Saharan','education']

In [55]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [56]:
SDG3['Text'] = SDG3['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
#Remove everythings exepct alphabet 
SDG3['clean_text'] = SDG3['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG3['clean_text'] = SDG3['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG3['clean_text'] =SDG3['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_3 = SDG3['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_3 = tokenized_doc_3.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
list = tokenized_doc_3.tolist()

new_list = []
for words in list:
    new_list += words

In [59]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('health', 875), ('diseases', 201), ('mental', 195), ('care', 192), ('deaths', 164), ('mortality', 159), ('global', 141), ('services', 139), ('burden', 94), ('maternal', 85), ('population', 80), ('progress', 79), ('risk', 76), ('reproductive', 73), ('still', 68), ('tuberculosis', 67), ('disease', 67), ('malaria', 64), ('available', 63), ('treatment', 62), ('births', 60), ('world', 60), ('however', 59), ('agenda', 59), ('system', 59), ('interventions', 59), ('public', 58), ('developing', 57), ('financing', 57), ('across', 56), ('conditions', 55), ('least', 55), ('number', 55), ('tobacco', 54), ('death', 53)]


In [60]:
# SDG4
SDG_4 = res['SDG_4']==1
SDG4 = res[SDG_4]

In [61]:
# Words to be delete 
remove_words = ['women', 'gender', 'countrie', 'including', 'development', 'data', 
               'girls','implementation','access','global','national','early','Rights'
               'care', 'human','also','provide','across','Saharan','Women']

In [62]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [63]:
SDG4['Text'] = SDG4['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [64]:
#Remove everythings exepct alphabet 
SDG4['clean_text'] = SDG4['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG4['clean_text'] = SDG4['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG4['clean_text'] =SDG4['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_4 = SDG4['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_4 = tokenized_doc_4.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [65]:
list = tokenized_doc_4.tolist()

new_list = []
for words in list:
    new_list += words

In [66]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('education', 581), ('learning', 217), ('countries', 182), ('children', 181), ('skills', 147), ('equality', 140), ('school', 133), ('secondary', 129), ('primary', 121), ('work', 90), ('quality', 90), ('vocational', 88), ('students', 84), ('training', 83), ('participation', 82), ('levels', 80), ('opportunities', 78), ('care', 74), ('labour', 73), ('sustainable', 72), ('educational', 69), ('support', 69), ('system', 69), ('adult', 67), ('many', 66), ('youth', 64), ('efforts', 63), ('ensure', 62), ('services', 62), ('health', 58), ('years', 58), ('reading', 57), ('united', 56), ('nations', 56), ('developing', 56)]


In [67]:
# SDG5
SDG_5 = res['SDG_5']==1
SDG5 = res[SDG_5]

In [68]:
# Words to be delete 
remove_words = ['data', 'including', 'countries', 'implementation', 'development', 'health', 
                'across','access','care','national','resources','OECD','oecd','sustainable',
                'action','services','policies','public','making','agenda','also','political',
                'governments','level','levels','programme','positions','global',
                'policy','CHAPTER','Chapter','chapter','mechanisms','average',
                'international', 'Programme','decision','effective','Mexico'
                'address','responsive','monitoring','agenda','private','continue','address']

In [69]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [70]:
SDG5['Text'] = SDG5['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [71]:
#Remove everythings exepct alphabet 
SDG5['clean_text'] = SDG5['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG5['clean_text'] = SDG5['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG5['clean_text'] =SDG5['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_5 = SDG5['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_5 = tokenized_doc_5.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [72]:
list = tokenized_doc_5.tolist()

new_list = []
for words in list:
    new_list += words

In [73]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('women', 862), ('gender', 486), ('girls', 304), ('equality', 288), ('violence', 151), ('work', 140), ('rights', 137), ('sexual', 111), ('unpaid', 104), ('social', 97), ('empowerment', 94), ('participation', 93), ('marriage', 77), ('domestic', 76), ('progress', 75), ('economic', 74), ('reproductive', 70), ('equal', 70), ('legal', 64), ('discrimination', 64), ('laws', 62), ('accountability', 61), ('leadership', 60), ('barriers', 59), ('human', 59), ('local', 59), ('gaps', 57), ('education', 57), ('efforts', 55), ('opportunities', 55), ('child', 55), ('protection', 54), ('achieve', 54), ('forms', 52), ('achieving', 52)]


In [74]:
# SDG6
SDG_6 = res['SDG_6']==1
SDG6 = res[SDG_6]

In [75]:
# Words to be delete 
remove_words = ['management', 'protection', 'countries', 'related', 'climate', 'challenges', 
                'presents','billion','managed','central','government','affects','achieve',
                'cooperation','governance','universal','facilities','efficiency','improve',
                'globally','support','challenge','planning','security','Nothern','service',
                'services','particular','public','human','natural','basic','development','world',
                'sustainable','sector','CHAPTER','Chapter','chapter','including','including','change',
                'population','also','Asia','levels','health','economic','increase','global',
                'well','essential','Africa','need','growth','people'
                ]

In [76]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [77]:
SDG6['Text'] = SDG6['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [78]:
#Remove everythings exepct alphabet 
SDG6['clean_text'] = SDG6['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG6['clean_text'] = SDG6['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG6['clean_text'] =SDG6['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_6 = SDG6['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_6 = tokenized_doc_6.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [79]:
list = tokenized_doc_6.tolist()

new_list = []
for words in list:
    new_list += words

In [80]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('water', 1037), ('sanitation', 225), ('wastewater', 107), ('quality', 95), ('resources', 94), ('drinking', 86), ('stress', 82), ('access', 68), ('supply', 66), ('availability', 60), ('irrigation', 57), ('ecosystems', 54), ('freshwater', 53), ('national', 52), ('treatment', 51), ('safely', 50), ('ensure', 48), ('transboundary', 43), ('recycling', 42), ('scarcity', 41), ('clean', 41), ('least', 36), ('france', 36), ('address', 35), ('integrated', 35), ('many', 34), ('local', 34), ('crucial', 34), ('hygiene', 33), ('northern', 33), ('infrastructure', 33), ('across', 33), ('life', 33), ('major', 32), ('impact', 32)]


In [81]:
# SDG7
SDG_7 = res['SDG_7']==1
SDG7 = res[SDG_7]

In [82]:
# Words to be delete 
remove_words = ['countries', 'policies', 'regional', 'provides', 'climate', 'challenges', 
                'challenges','increased','infrastructure','services','solutions', 'cooperation',
                'developing','improvement','universal','including','improved','international',
                'generation','integrated','population','financial','continues','improvements',
                'economic','policy','efficiency','CHAPTER','Chapter','chapter','sustainable'
                'recent','change','development','billion','people','still','without','needs',
                'however','Recent','years','private','since','issues','agenda','increase',
                'technology','ensure','Agenda','however','However','also','share'
                ]

In [83]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [84]:
SDG7['Text'] = SDG7['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [85]:
#Remove everythings exepct alphabet 
SDG7['clean_text'] = SDG7['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG7['clean_text'] = SDG7['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG7['clean_text'] =SDG7['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_7 = SDG7['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_7 = tokenized_doc_7.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
list = tokenized_doc_7.tolist()

new_list = []
for words in list:
    new_list += words

In [87]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('energy', 960), ('access', 205), ('renewable', 193), ('global', 185), ('investment', 130), ('growth', 129), ('clean', 107), ('electricity', 91), ('sustainable', 90), ('consumption', 87), ('rate', 84), ('modern', 84), ('power', 81), ('cooking', 74), ('fossil', 70), ('recent', 62), ('sector', 60), ('fuels', 54), ('resources', 54), ('technologies', 53), ('sectors', 53), ('progress', 50), ('solar', 48), ('security', 44), ('fuel', 42), ('measures', 42), ('cost', 42), ('transition', 42), ('support', 40), ('subsidies', 40), ('green', 39), ('trends', 39), ('country', 39), ('system', 37), ('wind', 36)]


In [88]:
# SDG8
SDG_8 = res['SDG_8']==1
SDG8 = res[SDG_8]

In [89]:
# Words to be delete 
remove_words = ['countries','including','significant','inclusive','institutions','empirical',
                'discusses','environment','enhancing','environments','infrastructure']

In [90]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [91]:
SDG8['Text'] = SDG8['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [92]:
#Remove everythings exepct alphabet 
SDG8['clean_text'] = SDG8['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG8['clean_text'] = SDG8['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG8['clean_text'] =SDG8['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_8 = SDG8['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_8 = tokenized_doc_8.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
list = tokenized_doc_8.tolist()

new_list = []
for words in list:
    new_list += words

In [94]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('growth', 283), ('employment', 257), ('labour', 246), ('work', 164), ('economic', 157), ('workers', 138), ('jobs', 126), ('productivity', 124), ('market', 121), ('opportunities', 121), ('social', 113), ('developed', 100), ('also', 98), ('policy', 91), ('world', 83), ('skills', 83), ('least', 82), ('chapter', 81), ('youth', 78), ('improve', 77), ('economy', 76), ('people', 73), ('rate', 71), ('education', 68), ('capita', 66), ('policies', 65), ('training', 65), ('young', 63), ('global', 63), ('financial', 63), ('quality', 63), ('working', 61), ('real', 61), ('increase', 60), ('high', 59)]


In [95]:
# SDG9
SDG_9 = res['SDG_9']==1
SDG9 = res[SDG_9]

In [96]:
# Words to be delete 
remove_words = ['international','particular','respective','population',
               'governments','additional','significantly','sustainable',
               'development','country','countries','policy','policies','inclusive','increased','also','CHAPTER','Chapter',
               'chapter','many','global','added','high','access','including','developed','social','well',
               'challenges','however','However','discusses','least','growth']

In [97]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [98]:
SDG9['Text'] = SDG9['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [99]:
#Remove everythings exepct alphabet 
SDG9['clean_text'] = SDG9['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG9['clean_text'] = SDG9['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG9['clean_text'] =SDG9['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_9 = SDG9['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_9 = tokenized_doc_9.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
list = tokenized_doc_9.tolist()

new_list = []
for words in list:
    new_list += words

In [101]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('innovation', 323), ('infrastructure', 315), ('manufacturing', 171), ('economic', 153), ('developing', 143), ('support', 141), ('investment', 110), ('industrial', 108), ('industrialization', 104), ('business', 102), ('technologies', 99), ('research', 93), ('invention', 92), ('sectors', 90), ('value', 87), ('trade', 85), ('technological', 76), ('digital', 76), ('technology', 73), ('share', 73), ('firms', 72), ('goal', 72), ('process', 67), ('transformation', 66), ('industry', 64), ('provision', 64), ('promote', 62), ('made', 57), ('increase', 56), ('smes', 56), ('services', 55), ('green', 55), ('capabilities', 53), ('future', 51), ('resilient', 51)]


In [102]:
# SDG10
SDG_10 = res['SDG_10']==1
SDG10 = res[SDG_10]

In [103]:
# Words to be delete 
remove_words = ['significantly','uncertainty','representation','population',
               'countries','policies','OECD','developing','across','particular','paper',
               'including','well','based','international','among','billion','Denmark','also',
               'developed','development','world','ensure']

In [104]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [105]:
SDG10['Text'] = SDG10['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [106]:
#Remove everythings exepct alphabet 
SDG10['clean_text'] = SDG10['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG10['clean_text'] = SDG10['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG10['clean_text'] =SDG10['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_10 = SDG10['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_10 = tokenized_doc_10.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [107]:
list = tokenized_doc_10.tolist()

new_list = []
for words in list:
    new_list += words

In [108]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('income', 375), ('inequality', 293), ('growth', 181), ('social', 140), ('transfers', 113), ('impact', 111), ('reforms', 108), ('household', 107), ('distribution', 106), ('structural', 99), ('incomes', 96), ('global', 94), ('redistribution', 93), ('economic', 87), ('reduce', 80), ('average', 74), ('labour', 73), ('trade', 71), ('institutions', 65), ('decline', 63), ('assistance', 62), ('high', 62), ('mobility', 62), ('taxes', 62), ('least', 61), ('migration', 55), ('level', 54), ('inequalities', 53), ('market', 53), ('financial', 53), ('redistributive', 50), ('long', 48), ('goal', 46), ('higher', 45), ('benefits', 44)]


In [109]:
# SDG11
SDG_11 = res['SDG_11']==1
SDG11 = res[SDG_11]

In [110]:
# Words to be delete 
remove_words = ['substantially','implementation','highlighting','unprecedented', 'significance',
               'agricultural','implementing','institutional','increasingly','intensifying','efficiencies',
               'international','consequences','participation','nhistorically','sustainable','well','also',
               'planning','Chapter','CHAPTER','chapter','billion','water','children','global','world','countries',
               'inclusive','number','policies','economic','including','better','access','many','target',
               'sector','goals','agenda','growth']

In [111]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [112]:
SDG11['Text'] = SDG11['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [113]:
#Remove everythings exepct alphabet 
SDG11['clean_text'] = SDG11['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG11['clean_text'] = SDG11['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG11['clean_text'] =SDG11['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_11 = SDG11['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_11 = tokenized_doc_11.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [114]:
list = tokenized_doc_11.tolist()

new_list = []
for words in list:
    new_list += words

In [115]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('urban', 560), ('cities', 311), ('development', 241), ('areas', 190), ('policy', 154), ('transport', 135), ('services', 130), ('housing', 127), ('waste', 126), ('rural', 123), ('people', 112), ('city', 110), ('management', 107), ('safe', 103), ('environmental', 98), ('urbanization', 97), ('population', 91), ('public', 91), ('persons', 80), ('capital', 79), ('state', 68), ('examines', 67), ('progress', 65), ('regional', 63), ('human', 59), ('affordable', 59), ('basic', 58), ('indicators', 58), ('attention', 57), ('increased', 55), ('rapid', 54), ('infrastructure', 54), ('reviews', 53), ('framework', 52), ('quality', 52)]


In [116]:
# SDG12
SDG_12 = res['SDG_12']==1
SDG12 = res[SDG_12]

In [117]:
# Words to be delete 
remove_words = ['international','substantially','significantly','fundamentally','countries',
               'disadvantaged','macroeconomic','transnational','participation','development',
               'management','natural','Chapter','CHAPTER','chapter','related','policies','developing',
               'well','value','growth','policy','national','frameworks','needs','global','towards'
               'provides','efficiency','along']

In [118]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [119]:
SDG12['Text'] = SDG12['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [120]:
#Remove everythings exepct alphabet 
SDG12['clean_text'] = SDG12['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG12['clean_text'] = SDG12['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG12['clean_text'] =SDG12['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_12 = SDG12['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_12 = tokenized_doc_12.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [121]:
list = tokenized_doc_12.tolist()

new_list = []
for words in list:
    new_list += words

In [122]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(35)
print(top)

[('waste', 275), ('consumption', 199), ('sustainable', 190), ('production', 179), ('material', 178), ('food', 147), ('economy', 145), ('circular', 131), ('resources', 131), ('economic', 130), ('towards', 115), ('environmental', 102), ('resource', 96), ('transition', 86), ('materials', 82), ('sustainability', 79), ('footprint', 76), ('tons', 67), ('impacts', 63), ('trade', 59), ('practices', 56), ('products', 55), ('quality', 55), ('efficient', 55), ('used', 52), ('chemicals', 51), ('life', 50), ('recycling', 49), ('patterns', 49), ('reduction', 47), ('guidance', 46), ('billion', 46), ('extraction', 44), ('capita', 43), ('amount', 43)]


In [123]:
# SDG13
SDG_13 = res['SDG_13']==1
SDG13 = res[SDG_13]

In [124]:
# Words to be delete 
remove_words = ['countries','development','global','national','developing','developed',
               'levels','well','least','governments','Chapter','chapter','CHAPTER','OECD','related','Framework','parties','states',
               'many','million','sustainable','framework','policies','addition','women','often','particularly',
               'current','infrastructure','government','much','April','operation','article','people','nationally','address','Current']

In [125]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [126]:
SDG13['Text'] = SDG13['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [127]:
#Remove everythings exepct alphabet 
SDG13['clean_text'] = SDG13['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG13['clean_text'] = SDG13['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG13['clean_text'] =SDG13['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_13 = SDG13['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_13 = tokenized_doc_13.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [128]:
list = tokenized_doc_13.tolist()

new_list = []
for words in list:
    new_list += words

In [129]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(50)
print(top)

[('climate', 618), ('change', 391), ('disasters', 145), ('adaptation', 135), ('agreement', 116), ('action', 114), ('risk', 112), ('natural', 99), ('disaster', 97), ('emissions', 85), ('paris', 85), ('resilience', 82), ('impact', 80), ('strategies', 79), ('resilient', 69), ('actions', 67), ('hazards', 66), ('reduction', 60), ('capacity', 56), ('warming', 56), ('united', 51), ('nations', 51), ('convention', 51), ('mitigation', 50), ('planning', 45), ('states', 45), ('increase', 45), ('report', 45), ('vulnerable', 44), ('help', 43), ('relationship', 42), ('greenhouse', 41), ('effects', 40), ('goals', 40), ('goal', 40), ('small', 39), ('looks', 38), ('first', 38), ('management', 37), ('island', 37), ('local', 37), ('sector', 37), ('implement', 37), ('carbon', 37), ('reduce', 37), ('programmes', 37), ('pathways', 37), ('adaptive', 36), ('determined', 36), ('contributions', 36)]


In [130]:
# SDG14
SDG_14 = res['SDG_14']==1
SDG14 = res[SDG_14]

In [131]:
# Words to be delete 
remove_words = ['sustainable','countries','small','well','development','global','role','international','International',
               'management','developed','levels','least','within','scale','contribute','developing','economics','trends','increase',
               'states','implementation','health','based','year','also','economic']

In [132]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [133]:
SDG14['Text'] = SDG14['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [134]:
#Remove everythings exepct alphabet 
SDG14['clean_text'] = SDG14['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG14['clean_text'] = SDG14['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG14['clean_text'] =SDG14['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_14 = SDG14['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_14 = tokenized_doc_14.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [135]:
list = tokenized_doc_14.tolist()

new_list = []
for words in list:
    new_list += words

In [136]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(50)
print(top)

[('marine', 477), ('oceans', 159), ('pollution', 158), ('ocean', 150), ('fisheries', 143), ('food', 130), ('resources', 128), ('fishing', 124), ('coastal', 115), ('fish', 97), ('biodiversity', 91), ('conservation', 81), ('stocks', 66), ('land', 62), ('areas', 54), ('life', 54), ('subsidies', 53), ('ecosystems', 53), ('seas', 52), ('livelihoods', 51), ('unregulated', 49), ('including', 47), ('water', 46), ('order', 43), ('indonesia', 42), ('webs', 42), ('small', 41), ('states', 41), ('illegal', 41), ('unreported', 41), ('current', 40), ('waters', 40), ('sector', 40), ('security', 40), ('island', 39), ('chapter', 39), ('earth', 38), ('world', 38), ('policy', 37), ('since', 37), ('policies', 37), ('address', 37), ('provides', 37), ('increased', 36), ('industrial', 36), ('large', 35), ('increasing', 35), ('depend', 35), ('convention', 35), ('biotechnology', 35)]


In [137]:
# SDG15
SDG_15 = res['SDG_15']==1
SDG15 = res[SDG_15]

In [138]:
# Words to be delete 
remove_words = ['countries','water','development','across','services','management','including',
               'chapter','Chapter','CHAPTER','covered','benefits','support','also','important','trades','national','food',
               'approaches','continues','policies','million','sector','global','international','policy','world','trade',
               ]

In [139]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [140]:
SDG15['Text'] = SDG15['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [141]:
#Remove everythings exepct alphabet 
SDG15['clean_text'] = SDG15['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG15['clean_text'] = SDG15['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG15['clean_text'] =SDG15['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_15 = SDG15['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_15 = tokenized_doc_15.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [142]:
list = tokenized_doc_15.tolist()

new_list = []
for words in list:
    new_list += words

In [143]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(50)
print(top)

[('biodiversity', 469), ('forest', 306), ('species', 235), ('land', 227), ('areas', 207), ('ecosystems', 197), ('wildlife', 187), ('sustainable', 162), ('loss', 160), ('forests', 142), ('conservation', 137), ('terrestrial', 107), ('resources', 102), ('ecosystem', 95), ('degradation', 94), ('trafficking', 91), ('protected', 76), ('poaching', 74), ('trends', 68), ('environmental', 67), ('area', 65), ('protecting', 65), ('provide', 65), ('mountain', 64), ('deforestation', 63), ('economic', 62), ('progress', 60), ('natural', 60), ('green', 60), ('earth', 59), ('restore', 58), ('increased', 58), ('well', 58), ('protect', 57), ('restoration', 54), ('instruments', 53), ('illegal', 53), ('hectares', 53), ('many', 52), ('however', 51), ('diversity', 50), ('related', 50), ('illicit', 49), ('action', 49), ('efforts', 48), ('promote', 47), ('degraded', 47), ('ensure', 47), ('private', 47), ('communities', 46)]


In [144]:
# SDG16
SDG_16 = res['SDG_16']==1
SDG16 = res[SDG_16]

In [145]:
# Words to be delete 
remove_words = ['countries','development','access','framework','inclusive','sustainable','developing','also','Chapter',
               'effective','people','international','levels','report','Chapter','children','human', 'services',
               'ensure','data','national','public','growth','making','CHAPTER','many','OECD','based','Africa'
               'agenda','Agenda','well','number','increased','involved','important','strengthen']

In [146]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [147]:
SDG16['Text'] = SDG16['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [148]:
#Remove everythings exepct alphabet 
SDG16['clean_text'] = SDG16['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG16['clean_text'] = SDG16['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG16['clean_text'] =SDG16['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_16 = SDG16['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_16 = tokenized_doc_16.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [149]:
list = tokenized_doc_16.tolist()

new_list = []
for words in list:
    new_list += words

In [150]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(50)
print(top)

[('justice', 200), ('corruption', 164), ('violence', 151), ('institutions', 139), ('legal', 137), ('rights', 111), ('peace', 108), ('conflict', 85), ('africa', 77), ('homicide', 75), ('forms', 73), ('foreign', 70), ('laws', 69), ('progress', 69), ('regions', 66), ('accountable', 65), ('illicit', 65), ('including', 63), ('security', 63), ('societies', 59), ('information', 58), ('available', 58), ('different', 57), ('implementation', 57), ('years', 56), ('flows', 52), ('role', 52), ('without', 51), ('crime', 51), ('judge', 51), ('strengthening', 50), ('actors', 49), ('policies', 48), ('economies', 46), ('goal', 45), ('decision', 45), ('armed', 45), ('civil', 45), ('society', 45), ('governance', 44), ('economic', 43), ('increasing', 43), ('violent', 42), ('could', 42), ('needs', 42), ('remained', 41), ('financial', 41), ('efforts', 41), ('convention', 40), ('rule', 40)]


In [151]:
# SDG17
SDG_17 = res['SDG_17_']==1
SDG17 = res[SDG_17]

In [152]:
# Words to be delete 
remove_words = ['countries','sustainable','least','developed','developing','enhance','based','international','policy',
               'including','national','chapter','agenda','global','world','increase','effective','South','available','capacity'
                ,'data','Sustainable','Agenda','level','billion','Enhance','applied','particular',
               'existing','Asia','need','chapter','Chapter','country','However','however']

In [153]:
pat = r'\b(?:{})\b'.format('|'.join(remove_words))

In [154]:
SDG17['Text'] = SDG17['Text'].str.replace(pat, '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [155]:
#Remove everythings exepct alphabet 
SDG17['clean_text'] = SDG17['Text'].str.replace("[^a-zA-Z#]", " ")

# Remove null fields
SDG17['clean_text'] = SDG17['clean_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# Make all text lowercase
SDG17['clean_text'] =SDG17['clean_text'].apply(lambda x: x.lower())

# Tokenization
tokenized_doc_17 = SDG17['clean_text'].apply(lambda x: x.split())

# Delete stop-words
nltk.download('stopwords')
#stop_words = set(stopwords.words('english'))

stop_words = nltk.corpus.stopwords.words('english')
newStopWords = ['cent']
stop_words.extend(newStopWords)
tokenized_doc_17 = tokenized_doc_17.apply(lambda x: [item for item in x if item not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jguisiano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [156]:
list = tokenized_doc_17.tolist()

new_list = []
for words in list:
    new_list += words

In [157]:
from collections import Counter
word_counts = Counter(new_list)
top = word_counts.most_common(50)
print(top)

[('development', 421), ('financing', 116), ('support', 112), ('partnerships', 112), ('science', 103), ('goals', 102), ('technology', 85), ('triangular', 75), ('partnership', 73), ('operation', 72), ('actors', 70), ('assistance', 64), ('building', 58), ('debt', 57), ('domestic', 55), ('registration', 54), ('finance', 53), ('financial', 52), ('resources', 52), ('factors', 52), ('among', 52), ('knowledge', 52), ('statistical', 51), ('public', 51), ('united', 50), ('plans', 49), ('share', 49), ('income', 48), ('trade', 48), ('cooperation', 47), ('practical', 47), ('achieve', 47), ('around', 46), ('well', 46), ('action', 43), ('partners', 42), ('instruments', 42), ('private', 41), ('costs', 41), ('high', 40), ('inclusive', 39), ('means', 38), ('sources', 38), ('gross', 37), ('implementation', 37), ('policies', 37), ('strategies', 37), ('promote', 36), ('also', 35), ('implement', 35)]


In [158]:
# Concat all modified text (SDG1 ...) and delete text_clean for all 

In [207]:
# Concat all the SDG in a single dataframe 
res = pd.concat([SDG1, SDG2, SDG3, SDG4, SDG5, SDG6, SDG7, SDG8, SDG9, SDG10, SDG11, SDG12, SDG13, SDG14, SDG15, SDG16, SDG17], axis=0)
res = res.sample(frac=1)

In [208]:
res["ID"] = np.arange(len(res))

In [209]:
res['Text'] = res['Text'].astype(str)

In [210]:
res = res.drop('clean_text', axis=1)

In [211]:
res.Text = res.Text.replace('\s+', ' ', regex=True)

In [212]:
res.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6016 entries, 203 to 50
Data columns (total 19 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       6016 non-null   int64 
 1   Text     6016 non-null   object
 2   SDG_1    6016 non-null   int8  
 3   SDG_2    6016 non-null   int8  
 4   SDG_3    6016 non-null   int8  
 5   SDG_4    6016 non-null   int8  
 6   SDG_5    6016 non-null   int8  
 7   SDG_6    6016 non-null   int8  
 8   SDG_7    6016 non-null   int8  
 9   SDG_8    6016 non-null   int8  
 10  SDG_9    6016 non-null   int8  
 11  SDG_10   6016 non-null   int8  
 12  SDG_11   6016 non-null   int8  
 13  SDG_12   6016 non-null   int8  
 14  SDG_13   6016 non-null   int8  
 15  SDG_14   6016 non-null   int8  
 16  SDG_15   6016 non-null   int8  
 17  SDG_16   6016 non-null   int8  
 18  SDG_17_  6016 non-null   int8  
dtypes: int64(1), int8(17), object(1)
memory usage: 240.9+ KB


In [213]:
res.to_csv('res.csv')

In [None]:
del list

## Get the synonyms of each SDG keywords and obtains its definition 

In [None]:
##  Generate synonyms for each SDG keywords with GTP-2
import requests
r = requests.post(
    "https://api.deepai.org/api/text-generator",
    data={
        'text': 'End poverty, protection for the poor',
    },
    headers={'api-key': 'quickstart-QUdJIGlzIGNvbWluZy4uLi4K'}
)
print(r.json())

In [None]:
## Wikipedia articles 

In [None]:
# Search synonyms for each SDG keywords by using NLTK
import nltk 
from nltk.corpus import wordnet 

synonyms = [] 
antonyms = [] 
SD1 =[] 

for syn in wordnet.synsets("Poverty"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD1 = set(synonyms)
SD1 = pd.DataFrame(SD1) 
SD1.columns = ['text']
SD1['col'] = 'SDG1'

In [None]:
synonyms = [] 
antonyms = [] 
SD2 =[] 

for syn in wordnet.synsets("Hunger"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD2 = set(synonyms)
SD2 = pd.DataFrame(SD2) 
SD2.columns = ['text']
SD2['col'] = 'SDG2'

In [None]:
synonyms = [] 
antonyms = [] 
SD3 =[] 

for syn in wordnet.synsets("Health"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD3 = set(synonyms)
SD3 = pd.DataFrame(SD3) 
SD3.columns = ['text']
SD3['col'] = 'SDG3'

In [None]:
synonyms = [] 
antonyms = [] 
SD31 =[] 

for syn in wordnet.synsets("Well-being"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD31 = set(synonyms)
SD31 = pd.DataFrame(SD31) 
SD31.columns = ['text']
SD31['col'] = 'SDG3'

In [None]:
synonyms = [] 
antonyms = [] 
SD4 =[] 

for syn in wordnet.synsets("Education"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD4 = set(synonyms)
SD4 = pd.DataFrame(SD4) 
SD4.columns = ['text']
SD4['col'] = 'SDG4'

In [None]:
synonyms = [] 
antonyms = [] 
SD5 =[] 

for syn in wordnet.synsets("Gender"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD5 = set(synonyms)
SD5 = pd.DataFrame(SD5) 
SD5.columns = ['text']
SD5['col'] = 'SDG5'

In [None]:
synonyms = [] 
antonyms = [] 
SD6 =[] 

for syn in wordnet.synsets("Sanitation"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD6 = set(synonyms)
SD6 = pd.DataFrame(SD6) 
SD6.columns = ['text']
SD6['col'] = 'SDG6'

In [None]:
synonyms = [] 
antonyms = [] 
SD61 =[] 

for syn in wordnet.synsets("Water"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD61 = set(synonyms)
SD61 = pd.DataFrame(SD61) 
SD61.columns = ['text']
SD61['col'] = 'SDG6'

In [None]:
synonyms = [] 
antonyms = [] 
SD7 =[] 

for syn in wordnet.synsets("Energy"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD7 = set(synonyms)
SD7 = pd.DataFrame(SD7)
SD7.columns = ['text']
SD7['col'] = 'SDG7'

In [None]:
synonyms = [] 
antonyms = [] 
SD8 =[] 

for syn in wordnet.synsets("Economy"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD8 = set(synonyms)
SD8 = pd.DataFrame(SD8) 
SD8.columns = ['text']
SD8['col'] = 'SDG8'

In [None]:
synonyms = [] 
antonyms = [] 
SD81 =[] 

for syn in wordnet.synsets("Work"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD81 = set(synonyms)
SD81 = pd.DataFrame(SD81) 
SD81.columns = ['text']
SD81['col'] = 'SDG8'

In [None]:
synonyms = [] 
antonyms = [] 
SD9 =[] 

for syn in wordnet.synsets("Industry"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD9 = set(synonyms)
SD9 = pd.DataFrame(SD9) 
SD9.columns = ['text']
SD9['col'] = 'SDG9'

In [None]:
synonyms = [] 
antonyms = [] 
SD91 =[] 

for syn in wordnet.synsets("Innovation"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD91 = set(synonyms)
SD91 = pd.DataFrame(SD91) 
SD91.columns = ['text']
SD91['col'] = 'SDG9'

In [None]:
synonyms = [] 
antonyms = [] 
SD92 =[] 

for syn in wordnet.synsets("Infrastructure"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD92 = set(synonyms)
SD92 = pd.DataFrame(SD92) 
SD92.columns = ['text']
SD92['col'] = 'SDG9'

In [None]:
synonyms = [] 
antonyms = [] 
SD10 =[] 

for syn in wordnet.synsets("Inequality"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD10 = set(synonyms)
SD10 = pd.DataFrame(SD10) 
SD10.columns = ['text']
SD10['col'] = 'SDG10'

In [None]:
synonyms = [] 
antonyms = [] 
SD11 =[] 

for syn in wordnet.synsets("Cities"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD11 = set(synonyms)
SD11 = pd.DataFrame(SD11) 
SD11.columns = ['text']
SD11['col'] = 'SDG11'

In [None]:
synonyms = [] 
antonyms = [] 
SD111 =[] 

for syn in wordnet.synsets("Communities"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD111 = set(synonyms)
SD111 = pd.DataFrame(SD111) 
SD111.columns = ['text']
SD111['col'] = 'SDG11'

In [None]:
synonyms = [] 
antonyms = [] 
SD12 =[] 

for syn in wordnet.synsets("Consumption"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD12 = set(synonyms)
SD12 = pd.DataFrame(SD12) 
SD12.columns = ['text']
SD12['col'] = 'SDG12'

In [None]:
synonyms = [] 
antonyms = [] 
SD121 =[] 

for syn in wordnet.synsets("Production"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD121 = set(synonyms)
SD121 = pd.DataFrame(SD121) 
SD121.columns = ['text']
SD121['col'] = 'SDG12'

In [None]:
synonyms = [] 
antonyms = [] 
SD13 =[] 

for syn in wordnet.synsets("Climate"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD13 = set(synonyms)
SD13 = pd.DataFrame(SD13) 
SD13.columns = ['text']
SD13['col'] = 'SDG13'

In [None]:
synonyms = [] 
antonyms = [] 
SD14 =[] 

for syn in wordnet.synsets("Marine"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD14 = set(synonyms)
SD14 = pd.DataFrame(SD14) 
SD14.columns = ['text']
SD14['col'] = 'SDG14'

In [None]:
synonyms = [] 
antonyms = [] 
SD141 =[] 

for syn in wordnet.synsets("Coastal"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD141 = set(synonyms)
SD141 = pd.DataFrame(SD141) 
SD141.columns = ['text']
SD141['col'] = 'SDG14'

In [None]:
synonyms = [] 
antonyms = [] 
SD15 =[] 

for syn in wordnet.synsets("Terrestrial"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD15 = set(synonyms)
SD15 = pd.DataFrame(SD15) 
SD15.columns = ['text']
SD15['col'] = 'SDG15'

In [None]:
synonyms = [] 
antonyms = [] 
SD151 =[] 

for syn in wordnet.synsets("Forests"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD151 = set(synonyms)
SD151 = pd.DataFrame(SD151) 
SD151.columns = ['text']
SD151['col'] = 'SDG15'

In [None]:
synonyms = [] 
antonyms = [] 
SD152 =[] 

for syn in wordnet.synsets("Biodiversity"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD152 = set(synonyms)
SD152 = pd.DataFrame(SD152) 
SD152.columns = ['text']
SD152['col'] = 'SDG15'

In [None]:
synonyms = [] 
antonyms = [] 
SD153 =[] 

for syn in wordnet.synsets("land"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD153 = set(synonyms)
SD153 = pd.DataFrame(SD153) 
SD153.columns = ['text']
SD153['col'] = 'SDG15'

In [None]:
synonyms = [] 
antonyms = [] 
SD16 =[] 

for syn in wordnet.synsets("Peace"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD16 = set(synonyms)
SD16 = pd.DataFrame(SD16) 
SD16.columns = ['text']
SD16['col'] = 'SDG16'

In [None]:
synonyms = [] 
antonyms = [] 
SD161 =[] 

for syn in wordnet.synsets("Justice"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD161 = set(synonyms)
SD161 = pd.DataFrame(SD161) 
SD161.columns = ['text']
SD161['col'] = 'SDG16'

In [None]:
synonyms = [] 
antonyms = [] 
SD17 =[] 

for syn in wordnet.synsets("Trade"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD17 = set(synonyms)
SD17 = pd.DataFrame(SD17) 
SD17.columns = ['text']
SD17['col'] = 'SDG17'

In [None]:
synonyms = [] 
antonyms = [] 
SD171 =[] 

for syn in wordnet.synsets("Technology"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD171 = set(synonyms)
SD171 = pd.DataFrame(SD171) 
SD171.columns = ['text']
SD171['col'] = 'SDG17'

In [None]:
synonyms = [] 
antonyms = [] 
SD172 =[] 

for syn in wordnet.synsets("Finance"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD172 = set(synonyms)
SD172 = pd.DataFrame(SD172) 
SD172.columns = ['text']
type(SD172)
SD172['col'] = 'SDG17'

In [None]:
synonyms = [] 
antonyms = [] 
SD173 =[] 

for syn in wordnet.synsets("Partnership"): 
    for l in syn.lemmas(): 
        synonyms.append(l.name()) 
        if l.antonyms(): 
            antonyms.append(l.antonyms()[0].name()) 
  
SD173 = set(synonyms)
SD173 = pd.DataFrame(SD173) 
SD173.columns = ['text']
SD173['col'] = 'SDG17'

In [None]:
# Concat all in a single dataframe 
Syn = pd.concat([SD1, SD2, SD3, SD31, SD4, SD5, SD6, SD61, SD7, SD8, SD81, SD9, SD91, SD92, SD10, SD11, SD111, SD12, SD121, SD13, SD14, SD141, SD15, SD151, SD152, SD153, SD16, SD161, SD17, SD171, SD172, SD173], axis=0)
Syn["ID"] = np.arange(len(Syn))

In [None]:
#pandas.set_option('display.max_rows', 300)
Syn.set_index('ID')

In [None]:
# Save syn in CSV and manually drop unuseful synonyms for each SDG
#Syn.to_csv('Syn.csv')

In [None]:
# Load clean Syn CSV file in dataframe
Syn = pd.read_csv('Syn.csv',sep=";")

In [None]:
Synonyms = Syn['text'].to_list()

In [None]:
Synonyms

In [None]:
### Generate a list of synonyms and use it in a loop for wikpedia

In [None]:
!pip install wikipedia

In [None]:
import wikipedia

Def = []
for i in Synonyms:
    Def.append((i, wikipedia.summary(i)))

In [None]:
Def

In [None]:
Def was exported to CSV file, then we also filter his results and add it to the initial database SDG_Objectives

### PDF text extraction 
We also add a selection of PDF files for each SDG sort by UN expert that we added in our database 

In [None]:
!pip install PyPDF2

In [None]:
!pip install tika

### SDG1

In [None]:
from tika import parser 
# Extract PDF text 
raw = parser.from_file('Flagship reports for SDG analysis/SDG 1 - No poverty/2017 HLPF Thematic review SDG1.pdf')
SDG1_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG1_1 = SDG1_1.splitlines()   

# Delete empty string
SDG1_1 = [x for x in SDG1_1 if x]

# Delete all :  ' 
SDG1_1 = map(lambda SDG1_1:SDG1_1.strip(' '' '),SDG1_1)

# Join all string to form just one per PDF
SDG1_1 = " ".join(SDG1_1)

# Delete extra white space 
import re
SDG1_1 = re.sub(r' +', ' ', SDG1_1)

# Delete URL
SDG1_1 = re.sub(r"http\S+", "", SDG1_1)

In [None]:
# Similar way to do whihtout the need of Java
# Loop for extracting all pages text for each document : 

#import PyPDF2 as pdf

# Check if PDF is encrypted 
#read_pdf.getIsEncrypted()

#with open('Flagship reports for SDG analysis/SDG 1 - No poverty/2017 HLPF Thematic review SDG1.pdf','rb') as pdf_file, open('SDG1_1.txt', 'w') as text_file:
    #read_pdf = PyPDF2.PdfFileReader(pdf_file)
    #number_of_pages = read_pdf.getNumPages()
    #for page_number in range(number_of_pages):  
        #page = read_pdf.getPage(page_number)
        #page_content = page.extractText()
        #text_file.write(page_content)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 1 - No poverty/.Global multidimensional povery index 2019.pdf.icloud')
SDG1_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG1_2 = SDG1_2.splitlines()   

# Delete empty string
SDG1_2 = [x for x in SDG1_2 if x]

# Delete all :  ' 
SDG1_2 = map(lambda SDG1_2:SDG1_2.strip(' '' '),SDG1_2)

# Join all string to form just one per PDF
SDG1_2 = " ".join(SDG1_2)

# Delete extra white space 
import re
SDG1_2 = re.sub(r' +', ' ', SDG1_2)

# Delete URL
SDG1_2 = re.sub(r"http\S+", "", SDG1_2)

### SDG 2

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 2- No hunger/2017 HLPF thematic review SDG2.pdf')
SDG2_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG2_1 = SDG2_1.splitlines()   

# Delete empty string
SDG2_1 = [x for x in SDG2_1 if x]

# Delete all :  ' 
SDG2_1 = map(lambda SDG2_1:SDG2_1.strip(' '' '),SDG2_1)

# Join all string to form just one per PDF
SDG2_1 = " ".join(SDG2_1)

# Delete extra white space 
import re
SDG2_1 = re.sub(r' +', ' ', SDG2_1)

# Delete URL
SDG2_1 = re.sub(r"http\S+", "", SDG2_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 2- No hunger/.The State of Food Security and Nutrition in the World.pdf.icloud')
SDG2_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG2_2 = SDG2_2.splitlines()   

# Delete empty string
SDG2_2 = [x for x in SDG2_2 if x]

# Delete all :  ' 
SDG2_2 = map(lambda SDG2_2:SDG2_2.strip(' '' '),SDG2_2)

# Join all string to form just one per PDF
SDG2_2 = " ".join(SDG2_2)

# Delete extra white space 
import re
SDG2_2 = re.sub(r' +', ' ', SDG2_2)

# Delete URL
SDG2_2 = re.sub(r"http\S+", "", SDG2_2)

In [None]:
### SDG 3

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 3 - Health/2017 HLPF Thematic review SDG 3.pdf')
SDG3_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG3_1 = SDG3_1.splitlines()   

# Delete empty string
SDG3_1 = [x for x in SDG3_1 if x]

# Delete all :  ' 
SDG3_1 = map(lambda SDG3_1:SDG3_1.strip(' '' '),SDG3_1)

# Join all string to form just one per PDF
SDG3_1 = " ".join(SDG3_1)

# Delete extra white space 
import re
SDG3_1 = re.sub(r' +', ' ', SDG3_1)

# Delete URL
SDG3_1 = re.sub(r"http\S+", "", SDG3_1)

In [None]:
### SDG 4

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 4 - Education/.Global Education Monitoring report 2020 UNESCO.pdf.icloud')
SDG4_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG4_1 = SDG4_1.splitlines()   

# Delete empty string
SDG4_1 = [x for x in SDG4_1 if x]

# Delete all :  ' 
SDG4_1 = map(lambda SDG4_1:SDG4_1.strip(' '' '),SDG4_1)

# Join all string to form just one per PDF
SDG4_1 = " ".join(SDG4_1)

# Delete extra white space 
import re
SDG4_1 = re.sub(r' +', ' ', SDG4_1)

# Delete URL
SDG4_1 = re.sub(r"http\S+", "", SDG4_1)

In [None]:
### SDG 5

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 5 - Gender Equality/HLPF 2017 thematic review sdg 5.pdf')
SDG5_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG5_1 = SDG5_1.splitlines()   

# Delete empty string
SDG5_1 = [x for x in SDG5_1 if x]

# Delete all :  ' 
SDG5_1 = map(lambda SDG5_1:SDG5_1.strip(' '' '),SDG5_1)

# Join all string to form just one per PDF
SDG5_1 = " ".join(SDG5_1)

# Delete extra white space 
SDG5_1 = re.sub(r' +', ' ', SDG5_1)

# Delete URL
SDG5_1 = re.sub(r"http\S+", "", SDG5_1)

In [None]:
### SDG 6

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 5 - Gender Equality/HLPF 2017 thematic review sdg 5.pdf')
SDG6_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG6_1 = SDG6_1.splitlines()   

# Delete empty string
SDG6_1 = [x for x in SDG6_1 if x]

# Delete all :  ' 
SDG6_1 = map(lambda SDG6_1:SDG6_1.strip(' '' '),SDG6_1)

# Join all string to form just one per PDF
SDG6_1 = " ".join(SDG6_1)

# Delete extra white space 
SDG6_1 = re.sub(r' +', ' ', SDG6_1)

# Delete URL
SDG6_1 = re.sub(r"http\S+", "", SDG6_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 6 - Water/Water and climate change_UN Water.pdf')
SDG6_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG6_2 = SDG6_2.splitlines()   

# Delete empty string
SDG6_2 = [x for x in SDG6_2 if x]

# Delete all :  ' 
SDG6_2 = map(lambda SDG6_2:SDG6_2.strip(' '' '),SDG6_2)

# Join all string to form just one per PDF
SDG6_2 = " ".join(SDG6_2)

# Delete extra white space 
SDG6_2 = re.sub(r' +', ' ', SDG6_2)

# Delete URL
SDG6_2 = re.sub(r"http\S+", "", SDG6_2)

In [None]:
### SDG 7

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 7 - Energy/HLPF Thematic review SDG 7.pdf')
SDG7_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG7_1 = SDG7_1.splitlines()   

# Delete empty string
SDG7_1 = [x for x in SDG7_1 if x]

# Delete all :  ' 
SDG7_1 = map(lambda SDG7_1:SDG7_1.strip(' '' '),SDG7_1)

# Join all string to form just one per PDF
SDG7_1 = " ".join(SDG7_1)

# Delete extra white space 
SDG7_1 = re.sub(r' +', ' ', SDG7_1)

# Delete URL
SDG7_1 = re.sub(r"http\S+", "", SDG7_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 7 - Energy/.Policy Briefs in support of the first SDG7 review at hlpf.pdf.icloud')
SDG7_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG7_2 = SDG7_2.splitlines()   

# Delete empty string
SDG7_2 = [x for x in SDG7_2 if x]

# Delete all :  ' 
SDG7_2 = map(lambda SDG7_2:SDG7_2.strip(' '' '),SDG7_2)

# Join all string to form just one per PDF
SDG7_2 = " ".join(SDG7_2)

# Delete extra white space 
SDG7_2 = re.sub(r' +', ' ', SDG7_2)

# Delete URL
SDG7_2 = re.sub(r"http\S+", "", SDG7_2)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 7 - Energy/.Tracking SDG7_The Energy Progress Report 2020_IRENA.pdf.icloud')
SDG7_3 = raw['content']

In [None]:
# Split string when '/n' occur
SDG7_3 = SDG7_3.splitlines()   

# Delete empty string
SDG7_3 = [x for x in SDG7_3 if x]

# Delete all :  ' 
SDG7_3 = map(lambda SDG7_3:SDG7_3.strip(' '' '),SDG7_3)

# Join all string to form just one per PDF
SDG7_3 = " ".join(SDG7_3)

# Delete extra white space 
SDG7_3 = re.sub(r' +', ' ', SDG7_3)

# Delete URL
SDG7_3 = re.sub(r"http\S+", "", SDG7_3)

In [None]:
### SDG 8

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 8 - Decent work/Time to Act for SDG8_ILO.pdf')
SDG8_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG8_1 = SDG8_1.splitlines()   

# Delete empty string
SDG8_1 = [x for x in SDG8_1 if x]

# Delete all :  ' 
SDG8_1 = map(lambda SDG8_1:SDG8_1.strip(' '' '),SDG8_1)

# Join all string to form just one per PDF
SDG8_1 = " ".join(SDG8_1)

# Delete extra white space 
SDG8_1 = re.sub(r' +', ' ', SDG8_1)

# Delete URL
SDG8_1 = re.sub(r"http\S+", "", SDG8_1)

In [None]:
### SDG 9

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 9 - Infrastructure/HLPF 2019 thematic review SDG9.pdf')
SDG9_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG9_1 = SDG9_1.splitlines()   

# Delete empty string
SDG9_1 = [x for x in SDG9_1 if x]

# Delete all :  ' 
SDG9_1 = map(lambda SDG9_1:SDG9_1.strip(' '' '),SDG9_1)

# Join all string to form just one per PDF
SDG9_1 = " ".join(SDG9_1)

# Delete extra white space 
SDG9_1 = re.sub(r' +', ' ', SDG9_1)

# Delete URL
SDG9_1 = re.sub(r"http\S+", "", SDG9_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 9 - Infrastructure/.UNIDO_STATISTICAL INDICATORS OF INCLUSIVE and sustainable industrialization.pdf.icloud')
SDG9_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG9_2 = SDG9_2.splitlines()   

# Delete empty string
SDG9_2 = [x for x in SDG9_2 if x]

# Delete all :  ' 
SDG9_2 = map(lambda SDG9_2:SDG9_2.strip(' '' '),SDG9_2)

# Join all string to form just one per PDF
SDG9_2 = " ".join(SDG9_2)

# Delete extra white space 
SDG9_2 = re.sub(r' +', ' ', SDG9_2)

# Delete URL
SDG9_2 = re.sub(r"http\S+", "", SDG9_2)

In [None]:
### SDG 10

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 10- reduced inequality/.Human Development Report 2019 UNDP.pdf.icloud')
SDG10_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG10_1 = SDG10_1.splitlines()   

# Delete empty string
SDG10_1 = [x for x in SDG10_1 if x]

# Delete all :  ' 
SDG10_1 = map(lambda SDG10_1:SDG10_1.strip(' '' '),SDG10_1)

# Join all string to form just one per PDF
SDG10_1 = " ".join(SDG10_1)

# Delete extra white space 
SDG10_1 = re.sub(r' +', ' ', SDG10_1)

# Delete URL
SDG10_1 = re.sub(r"http\S+", "", SDG10_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 10- reduced inequality/.UNDESA_World Social Report 2020.pdf.icloud')
SDG10_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG10_2 = SDG10_2.splitlines()   

# Delete empty string
SDG10_2 = [x for x in SDG10_2 if x]

# Delete all :  ' 
SDG10_2 = map(lambda SDG10_2:SDG10_2.strip(' '' '),SDG10_2)

# Join all string to form just one per PDF
SDG10_2 = " ".join(SDG10_2)

# Delete extra white space 
SDG10_2 = re.sub(r' +', ' ', SDG10_2)

# Delete URL
SDG10_2 = re.sub(r"http\S+", "", SDG10_2)

In [None]:
### SDG 11

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 11 - Cities/HLPF 2018 thematic review SDG11.pdf')
SDG11_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG11_1 = SDG11_1.splitlines()   

# Delete empty string
SDG11_1 = [x for x in SDG11_1 if x]

# Delete all :  ' 
SDG11_1 = map(lambda SDG11_1:SDG11_1.strip(' '' '),SDG11_1)

# Join all string to form just one per PDF
SDG11_1 = " ".join(SDG11_1)

# Delete extra white space 
SDG11_1 = re.sub(r' +', ' ', SDG11_1)

# Delete URL
SDG11_1 = re.sub(r"http\S+", "", SDG11_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 11 - Cities/.SDG 11 Synthesis report 2018_UN HABITAT.pdf.icloud')
SDG11_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG11_2 = SDG11_2.splitlines()   

# Delete empty string
SDG11_2 = [x for x in SDG11_2 if x]

# Delete all :  ' 
SDG11_2 = map(lambda SDG11_2:SDG11_2.strip(' '' '),SDG11_2)

# Join all string to form just one per PDF
SDG11_2 = " ".join(SDG11_2)

# Delete extra white space 
SDG11_2 = re.sub(r' +', ' ', SDG11_2)

# Delete URL
SDG11_2 = re.sub(r"http\S+", "", SDG11_2)

In [None]:
### SDG 12

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 12 - SCP/HLPF 2018 thematic review SDG12.pdf')
SDG12_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG12_1 = SDG12_1.splitlines()   

# Delete empty string
SDG12_1 = [x for x in SDG12_1 if x]

# Delete all :  ' 
SDG12_1 = map(lambda SDG12_1:SDG12_1.strip(' '' '),SDG12_1)

# Join all string to form just one per PDF
SDG12_1 = " ".join(SDG12_1)

# Delete extra white space 
SDG12_1 = re.sub(r' +', ' ', SDG12_1)

# Delete URL
SDG12_1 = re.sub(r"http\S+", "", SDG12_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 12 - SCP/.IRP Global Resources Outlook 2019.pdf.icloud')
SDG12_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG12_2 = SDG12_2.splitlines()   

# Delete empty string
SDG12_2 = [x for x in SDG12_2 if x]

# Delete all :  ' 
SDG12_2 = map(lambda SDG12_2:SDG12_2.strip(' '' '),SDG12_2)

# Join all string to form just one per PDF
SDG12_2 = " ".join(SDG12_2)

# Delete extra white space 
SDG12_2 = re.sub(r' +', ' ', SDG12_2)

# Delete URL
SDG12_2 = re.sub(r"http\S+", "", SDG12_2)

In [None]:
### SDG 13

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 13 - Climate/.Emissions Gap repor 2019_UNEP.pdf.icloud')
SDG13_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG13_1 = SDG13_1.splitlines()   

# Delete empty string
SDG13_1 = [x for x in SDG13_1 if x]

# Delete all :  ' 
SDG13_1 = map(lambda SDG13_1:SDG13_1.strip(' '' '),SDG13_1)

# Join all string to form just one per PDF
SDG13_1 = " ".join(SDG13_1)

# Delete extra white space 
SDG13_1 = re.sub(r' +', ' ', SDG13_1)

# Delete URL
SDG13_1 = re.sub(r"http\S+", "", SDG13_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 13 - Climate/.IPCC 5th Assessment Report.pdf.icloud')
SDG13_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG13_2 = SDG13_2.splitlines()   

# Delete empty string
SDG13_2 = [x for x in SDG13_2 if x]

# Delete all :  ' 
SDG13_2 = map(lambda SDG13_2:SDG13_2.strip(' '' '),SDG13_2)

# Join all string to form just one per PDF
SDG13_2 = " ".join(SDG13_2)

# Delete extra white space 
SDG13_2 = re.sub(r' +', ' ', SDG13_2)

# Delete URL
SDG13_2 = re.sub(r"http\S+", "", SDG13_2)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 13 - Climate/.NDC_Outlook_Report_2019_UNDP.pdf.icloud')
SDG13_3 = raw['content']

In [None]:
# Split string when '/n' occur
SDG13_3 = SDG13_3.splitlines()   

# Delete empty string
SDG13_3 = [x for x in SDG13_3 if x]

# Delete all :  ' 
SDG13_3 = map(lambda SDG13_3:SDG13_3.strip(' '' '),SDG13_3)

# Join all string to form just one per PDF
SDG13_3 = " ".join(SDG13_3)

# Delete extra white space 
SDG13_3 = re.sub(r' +', ' ', SDG13_3)

# Delete URL
SDG13_3 = re.sub(r"http\S+", "", SDG13_3)

In [None]:
### SDG 14

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 14 - Life below water/HLPF 2017 Thematic review SDG 14.pdf')
SDG14_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG14_1 = SDG14_1.splitlines()   

# Delete empty string
SDG14_1 = [x for x in SDG14_1 if x]

# Delete all :  ' 
SDG14_1 = map(lambda SDG14_1:SDG14_1.strip(' '' '),SDG14_1)

# Join all string to form just one per PDF
SDG14_1 = " ".join(SDG14_1)

# Delete extra white space 
SDG14_1 = re.sub(r' +', ' ', SDG14_1)

# Delete URL
SDG14_1 = re.sub(r"http\S+", "", SDG14_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 14 - Life below water/.The_First_Global_Integrated_Marine_Assessment_World_Ocean_Assessment.pdf.icloud')
SDG14_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG14_2 = SDG14_2.splitlines()   

# Delete empty string
SDG14_2 = [x for x in SDG14_2 if x]

# Delete all :  ' 
SDG14_2 = map(lambda SDG14_2:SDG14_2.strip(' '' '),SDG14_2)

# Join all string to form just one per PDF
SDG14_2 = " ".join(SDG14_2)

# Delete extra white space 
SDG14_2 = re.sub(r' +', ' ', SDG14_2)

# Delete URL
SDG14_2 = re.sub(r"http\S+", "", SDG14_2)

In [None]:
### SDG 15

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 15 - Life on land/HLPF 2018 background note SDG 15.pdf')
SDG15_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG15_1 = SDG15_1.splitlines()   

# Delete empty string
SDG15_1 = [x for x in SDG15_1 if x]

# Delete all :  ' 
SDG15_1 = map(lambda SDG15_1:SDG15_1.strip(' '' '),SDG15_1)

# Join all string to form just one per PDF
SDG15_1 = " ".join(SDG15_1)

# Delete extra white space 
SDG15_1 = re.sub(r' +', ' ', SDG15_1)

# Delete URL
SDG15_1 = re.sub(r"http\S+", "", SDG15_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 15 - Life on land/HLPF 2018 thematic review SDG 15.pdf')
SDG15_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG15_2 = SDG15_2.splitlines()   

# Delete empty string
SDG15_2 = [x for x in SDG15_2 if x]

# Delete all :  ' 
SDG15_2 = map(lambda SDG15_2:SDG15_2.strip(' '' '),SDG15_2)

# Join all string to form just one per PDF
SDG15_2 = " ".join(SDG15_2)

# Delete extra white space 
SDG15_2 = re.sub(r' +', ' ', SDG15_2)

# Delete URL
SDG15_2 = re.sub(r"http\S+", "", SDG15_2)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 15 - Life on land/.The State of the Worlds Forest_FAO UNEP.pdf.icloud')
SDG15_3 = raw['content']

In [None]:
# Split string when '/n' occur
SDG15_3 = SDG15_3.splitlines()   

# Delete empty string
SDG15_3 = [x for x in SDG15_3 if x]

# Delete all :  ' 
SDG15_3 = map(lambda SDG15_3:SDG15_3.strip(' '' '),SDG15_3)

# Join all string to form just one per PDF
SDG15_3 = " ".join(SDG15_3)

# Delete extra white space 
SDG15_3 = re.sub(r' +', ' ', SDG15_3)

# Delete URL
SDG15_3 = re.sub(r"http\S+", "", SDG15_3)

In [None]:
### SDG 16

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 16 - Peace/HLPF 2017 Discussion on SDG 16.pdf')
SDG16_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG16_1 = SDG16_1.splitlines()   

# Delete empty string
SDG16_1 = [x for x in SDG16_1 if x]

# Delete all :  ' 
SDG16_1 = map(lambda SDG16_1:SDG16_1.strip(' '' '),SDG16_1)

# Join all string to form just one per PDF
SDG16_1 = " ".join(SDG16_1)

# Delete extra white space 
SDG16_1 = re.sub(r' +', ' ', SDG16_1)

# Delete URL
SDG16_1 = re.sub(r"http\S+", "", SDG16_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 16 - Peace/.SDG16Progress-Report-2019_Institute for Economics and Peace.pdf.icloud')
SDG16_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG16_2 = SDG16_2.splitlines()   

# Delete empty string
SDG16_2 = [x for x in SDG16_2 if x]

# Delete all :  ' 
SDG16_2 = map(lambda SDG16_2:SDG16_2.strip(' '' '),SDG16_2)

# Join all string to form just one per PDF
SDG16_2 = " ".join(SDG16_2)

# Delete extra white space 
SDG16_2 = re.sub(r' +', ' ', SDG16_2)

# Delete URL
SDG16_2 = re.sub(r"http\S+", "", SDG16_2)

In [None]:
### SDG 17

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 17 - Partnerships/.Financing for Sustainable Devleopment report 2020.pdf.icloud')
SDG17_1 = raw['content']

In [None]:
# Split string when '/n' occur
SDG17_1 = SDG17_1.splitlines()   

# Delete empty string
SDG17_1 = [x for x in SDG17_1 if x]

# Delete all :  ' 
SDG17_1 = map(lambda SDG17_1:SDG17_1.strip(' '' '),SDG17_1)

# Join all string to form just one per PDF
SDG17_1 = " ".join(SDG17_1)

# Delete extra white space 
SDG17_1 = re.sub(r' +', ' ', SDG17_1)

# Delete URL
SDG17_1 = re.sub(r"http\S+", "", SDG17_1)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 17 - Partnerships/Financing the SDGs Moving from words to Action_Background note_ UNDESA.pdf')
SDG17_2 = raw['content']

In [None]:
# Split string when '/n' occur
SDG17_2 = SDG17_2.splitlines()   

# Delete empty string
SDG17_2 = [x for x in SDG17_2 if x]

# Delete all :  ' 
SDG17_2 = map(lambda SDG17_2:SDG17_2.strip(' '' '),SDG17_2)

# Join all string to form just one per PDF
SDG17_2 = " ".join(SDG17_2)

# Delete extra white space 
SDG17_2 = re.sub(r' +', ' ', SDG17_2)

# Delete URL
SDG17_2 = re.sub(r"http\S+", "", SDG17_2)

In [None]:
# Extract PDF text 
raw = parser.from_file('/Users/jadeguisiano/Desktop/Nations Unies/One-Planet/Flagship reports for SDG analysis/SDG 17 - Partnerships/HLPF 2018 thematic review SDG 17.pdf')
SDG17_3 = raw['content']

In [None]:
# Split string when '/n' occur
SDG17_3 = SDG17_3.splitlines()   

# Delete empty string
SDG17_3 = [x for x in SDG17_3 if x]

# Delete all :  ' 
SDG17_3 = map(lambda SDG17_3:SDG17_3.strip(' '' '),SDG17_3)

# Join all string to form just one per PDF
SDG17_3 = " ".join(SDG17_3)

# Delete extra white space 
SDG17_3 = re.sub(r' +', ' ', SDG17_3)

# Delete URL
SDG17_3 = re.sub(r"http\S+", "", SDG17_3)