## Importing necessary libraries

In [None]:
# standard library
from typing import List

# data wrangling
import numpy as np
import pandas as pd

# visualisation
import plotly.express as px
import plotly.io as pio

# nlp
import spacy

# data modelling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score

# utils
from tqdm import tqdm

# local packages
from helpers import plot_confusion_matrix, get_top_features, fix_sdg_name

# Sentinel output: this line is only reached when every import above succeeded.
print('Loaded!')

#### Here we load spaCy, which is later used to keep only the meaningful words of each text (dropping words such as stop words, which do not carry any meaning on their own), and we disable its NER (named entity recognition) component because it is not needed.

In [None]:
# --- notebook-wide settings ---
# Ask spaCy to use a GPU when one is available; kept ahead of the model load.
spacy.prefer_gpu()
# Small English pipeline with the named-entity recogniser switched off.
nlp = spacy.load('en_core_web_sm', disable = ['ner'])
# Every plotly figure uses the dark template by default.
pio.templates.default = 'plotly_dark'


## Loading data and exploring

In [None]:
# OSDG community dataset CSV, read directly from its Zenodo record.
DATASET_URL = 'https://zenodo.org/record/5550238/files/osdg-community-dataset-v21-09-30.csv?download=1'
dataset = pd.read_csv(DATASET_URL)
print('Shape:', dataset.shape)

In [None]:
# Cumulative share of texts at or below each agreement score.
df_lambda = (
    dataset['agreement']
    .value_counts(normalize = True)
    .sort_index()
    .cumsum()
    .rename_axis('agreement')
    .reset_index(name = 'p_sum')
)

print('Shape:', df_lambda.shape)


In [None]:
# Retain only texts whose suggested SDG label was accepted (more positive than
# negative labels) and whose agreement score is at least .6.
print('Shape before:', dataset.shape)
is_reliable = dataset['agreement'].ge(.6) & dataset['labels_positive'].gt(dataset['labels_negative'])
dataset = dataset.loc[is_reliable].copy()
print('Shape after :', dataset.shape)


In [None]:
# Number of texts per SDG, printed in goal order.
# Assigning `.columns` on a Series only sets a stray attribute and labels nothing,
# so the 'sdg' / 'count' labels are applied by naming the index and the values.
df3 = dataset['sdg'].value_counts().rename_axis('sdg').rename('count')
print(df3.sort_index())

In [None]:
# Per-SDG number of texts plus the percentage of all (filtered) texts they represent.
df_lambda = (
    dataset
    .groupby('sdg', as_index = False)
    .agg(count = ('text_id', 'count'))
    .assign(share = lambda frame: frame['count'] / frame['count'].sum() * 100)
)
print('Shape:', df_lambda.shape)
display(df_lambda.head())

## The `preprocess_spacy` function takes texts from the dataframe and returns their meaningful words

In [None]:
def preprocess_spacy(alpha: List[str], batch_size: int = 128) -> List[str]:
    """
    Reduce each text to the lemmas of its nouns, verbs and adjectives.

    Parameters
    ----------
    alpha : List[str]
        Raw texts; they are streamed through the module-level spaCy pipeline ``nlp``.
    batch_size : int, default 128
        Number of texts handed to ``nlp.pipe`` per batch.

    Returns
    -------
    List[str]
        One space-joined string of lemmas per input text, in input order.
        A text without any noun, verb or adjective yields an empty string.
    """
    keep_pos = {'NOUN', 'VERB', 'ADJ'}
    # nlp.pipe is wrapped directly by tqdm, which cannot know how many documents
    # to expect; pass the length explicitly whenever the input is sized so the
    # progress bar shows real progress.
    total = len(alpha) if hasattr(alpha, '__len__') else None

    docs = list()
    for doc in tqdm(nlp.pipe(alpha, batch_size = batch_size), total = total):
        lemmas = [token.lemma_ for token in doc if token.pos_ in keep_pos]
        docs.append(' '.join(lemmas))

    return docs
  

### Calling the preprocess_spacy function 

In [None]:
# Store the preprocessed version of every text in a new 'docs' column.
raw_texts = dataset['text'].to_numpy()
dataset['docs'] = preprocess_spacy(raw_texts)
print('\nShape:', dataset.shape)
display(dataset.head())

## Splitting the dataset into training and testing data.

In [None]:
# Hold out 30% of the documents for evaluation; the fixed seed keeps the split repeatable.
features = dataset['docs'].to_numpy()
targets = dataset['sdg'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size = .3, random_state = 42
)

## Multi-class classification starts here!
### I am using ExtraTreesClassifier for multi-class classification


In [None]:
# TF-IDF uni-/bi-grams -> 5,000 best features by ANOVA F-score -> extra-trees classifier.
pipe = Pipeline([
    ('vectoriser', TfidfVectorizer(
        ngram_range = (1, 2),
        max_df = 0.75,
        min_df = 2,
        max_features = 100_000
    )),
    ('selector', SelectKBest(f_classif, k = 5_000)),
    # Seeded like the train/test split so the fitted forest, and every metric
    # derived from it, is reproducible across runs; n_jobs = -1 only
    # parallelises tree building and does not change the result.
    ('clf', ExtraTreesClassifier(random_state = 42, n_jobs = -1))
])

pipe.fit(X_train, y_train)

## Getting Model Accuracy

In [None]:
# Share of held-out documents whose predicted SDG matches the true label.
y_hat = pipe.predict(X_test)
model_accuracy = accuracy_score(y_test, y_hat)
# Format the float directly: int(model_accuracy * 100) truncates (0.899 -> 89)
# and can even lose a whole point to float error (0.29 * 100 -> 28).
print(f'Accuracy of the Model = {model_accuracy:.2%}')

## Confusion Matrix

In [None]:
# Compare true vs. predicted SDG labels on the test split with the project helper
# imported from `helpers` (how it renders the matrix is not visible in this notebook).
plot_confusion_matrix(y_test, y_hat)

## Predicting the goal and the respective probabilities for a sample from the test dataset.

In [None]:
# Predict the SDG goal number for the test document at index 3
# (the fourth entry of X_test, not a row of the original dataset).
sample_doc = X_test[3]
predicted_goal = pipe.predict([sample_doc])
print(predicted_goal)

In [None]:
# Probability of every goal for the test document at index 3 of X_test.
sample_batch = [X_test[3]]
predicted_probabilities = pipe.predict_proba(sample_batch)
print(predicted_probabilities)

### Classification report of the model

In [None]:
# Per-class precision, recall and F1 on the test split; zero_division = 0 makes
# undefined scores (a division by zero) count as 0 without raising a warning.
print(classification_report(y_test, y_hat, zero_division = 0))

In [None]:
!pip install pdfminer

## Uploading PDF and converting to text 

In [None]:
!pip install pdfminer

In [None]:
#Importing required libraries

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import nltk
import os
import re
import pickle
from datetime import datetime

def convert_pdf_to_txt(path):
    """
    Extract the text of every page of a PDF file.

    Parameters
    ----------
    path : str
        Location of the PDF file on disk.

    Returns
    -------
    str
        Everything pdfminer's TextConverter wrote while the pages were
        processed, in page order.

    Notes
    -----
    The file handle, the converter device and the text buffer are released
    even when a page fails to parse; the original exception is re-raised.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even if a page raises.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),        # empty set: do not restrict to specific page numbers
                maxpages=0,   # 0: no upper limit on the number of pages
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the finally block closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


## Uploading the PDF to Google Colab

In [None]:
# Colab-only upload step. The result is kept in `uploaded`; the next cell reads
# the uploaded file's name from it (presumably a mapping keyed by file name — confirm).
from google.colab import files
uploaded = files.upload()


In [None]:
# Name of the first uploaded file: iterating `uploaded` yields the file names.
var = tuple(uploaded)
str2 = var[0]
str2

In [None]:
# Full path of the uploaded file inside the /content/ directory.
str1 = "/content/"
Final_path = f"{str1}{str2}"
Final_path

In [None]:
# Extract the text of the uploaded PDF from its path.
text = convert_pdf_to_txt(Final_path)


In [None]:
# preprocess_spacy takes the values of a dataframe column as input, so the
# extracted PDF text is wrapped in a single-row dataframe under its "text" column.
Final_df = pd.DataFrame({'text': [text]})

In [None]:
# Run the PDF text through the same preprocess_spacy step that was applied to the dataset texts.

Final_df['docs'] = preprocess_spacy(Final_df['text'].values)

In [None]:
# Predict the SDG goal of the uploaded PDF from its preprocessed text.
pdf_docs = Final_df['docs']
Final_predicted_goal = pipe.predict(pdf_docs)
print("SDG Goal : ",Final_predicted_goal)

In [None]:
# Probability of every goal for the uploaded PDF, flattened to a 1-D array.
pdf_probabilities = pipe.predict_proba(Final_df['docs'])
Final_predicted_probabilities = pdf_probabilities.flatten()
print("Probabilities of each goal in order :",Final_predicted_probabilities)

## Printing the final dataframe with the "SDG Goals", "Goal_Number" and "Probability_Scores" columns

In [None]:
# Display name of each SDG, keyed by goal number.
sdg_names = {
    1: 'No Poverty', 2: 'Zero Hunger', 3: 'Good Health and Well-Being',
    4: 'Quality Education', 5: 'Gender Equality', 6: 'Clean Water And Sanitation',
    7: 'Affordable Clean Energy', 8: 'Decent Work And Economic Growth',
    9: 'Industry,Innovation And Infrastructure', 10: 'Reduced Inequalities',
    11: 'Sustainable Cities And Communities', 12: 'Responsible Consumption And Production',
    13: 'Climate Action', 14: 'Life Below Water', 15: 'Life on Land',
    16: 'Peace, Justice And Strong Institutions', 17: 'Partnerships For The Goals',
}

# predict_proba returns one column per entry of pipe.classes_, so the goal numbers
# (and their names) are read from the fitted model instead of assuming that the
# classes are exactly 1..15 in order — a hard-coded list would raise on a length
# mismatch or silently mislabel the scores if the training labels differ.
goal_numbers = [int(goal) for goal in pipe.classes_]
goals_data = {'SDG Goals': [sdg_names.get(goal, f'SDG {goal}') for goal in goal_numbers],
              'Goal_Number': goal_numbers}
df1 = pd.DataFrame(goals_data)
Prob = pipe.predict_proba(Final_df['docs']).flatten() # Probabilities of each goal
df1['Probability_Scores'] = Prob
Final_Table = df1.sort_values(["Probability_Scores"], ascending=False) # Highest probability first
print(Final_Table) # Printing the final table