## 1. Importación de librerias

In [155]:
# Instalación de librerias
# librería Natural Language Toolkit, usada para trabajar con textos 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import pandas as pd
import numpy as np
import sys
import seaborn as sns
from collections import Counter

import re, string, unicodedata
import contractions
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, confusion_matrix, \
multilabel_confusion_matrix, classification_report
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn import metrics
from skmultilearn.problem_transform import BinaryRelevance
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leane\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Cargar datos iniciales

In [156]:
def cargaDeFramework():
    frameworkData = pd.read_csv('app/data/verifiedArticles.csv', sep=',', encoding = 'ANSI')
    return frameworkData

In [157]:
def cargaDeScopus():    
    scopusData = pd.read_csv('app/data/scopusArticles.csv', sep=',', encoding = 'utf-8')
    return scopusData

## 3. Preparación de datos

In [158]:
stop_words = stopwords.words('english')
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

def delete_numbers(words):
    """Delete all interger occurrences in list of tokenized words"""
    new_words = []
    for word in words:
        if not word.isdigit():
            new_words.append(word)
    return new_words

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stop_words:
            new_words.append(word)
    return new_words
        
    
def preprocessing(words):
    words = to_lowercase(words)
    words = delete_numbers(words)
    words = remove_punctuation(words)
    words = remove_non_ascii(words)
    words = remove_stopwords(words)
    return words

In [159]:
lemmatizer = WordNetLemmatizer()
def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    new_words = []
    for word in words:
        new_word = lemmatizer.lemmatize(word)
        new_words.append(new_word)
    return new_words

In [160]:
def criteriosACategorias(frameworkData):
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Alineación con la estrategia y objetivos', 'Selección', 'Priorización', 'Optimización'], 'Alineamiento estatégico')
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Papel de la analitica?', 'Gobierno del portafolio'], 'Manejo de gobernanza')
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Identificación de riesgos', 'Medición o cuantificación de riesgos'], 'Manejo de riesgo')
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Criterios para el monitoreo y medición de valor.', 'Criterios para establecer objetivos de valor'], 'Manejo de valor')
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Métricas para el monitoreo del portafolio de proyectos', 'KPI'], 'Monitoreo y control de portafolio')
    frameworkData['Criterios'] = frameworkData['Criterios'].replace(['Gestión de intersados', 'Criterios para la medición de beneficios.', 'Gestión de capacidades', 'Criterios financieros para la selección de proyectos', 'Criterios financieros para la ejecución del portfolio '], 'Otros')
    return frameworkData

In [161]:
def tokenizacionLematizacion(dataframe):
    dataframe["Abstract"] = dataframe["Abstract"].apply(contractions.fix)
    dataframe['Words'] = dataframe['Abstract'].apply(word_tokenize).apply(preprocessing)
    dataframe['Words'] = dataframe['Words'].apply(lemmatize_verbs)
    dataframe['Words'] = dataframe['Words'].apply(lambda x: ' '.join(map(str, x)))
    return dataframe

In [162]:
def eliminacionDeStopwords(dataframe, dataframeStopwords):
    dataframe["Words"] = dataframe["Words"].replace('|'.join(dataframeStopwords), '', regex=True)
    dataframe["Words"] = dataframe["Words"].replace(value='', regex=r'\b(?!big)(?!dat)[a-z]{1,3}\b')
    return dataframe

In [163]:
def transformacionDeFramework(frameworkData):
    frameworkData = frameworkData[frameworkData["Criterios"].notna()]
    frameworkData = frameworkData[["Title", "Abstract", "Criterios"]]
    frameworkData['Abstract'] = frameworkData['Abstract'] + " " + frameworkData['Title']
    criteriosACategorias(frameworkData)
    tokenizacionLematizacion(frameworkData)
    frameworkStopwords = ["project", "portfolio", "help", "approach", "also", "new", "used", "management", "use", "time", "vrio", "aim", "could", "work", "order", "purpose", "making", "make", "related", "first", "towards", "role", "activity", "existing", "within", "one", "need", "organization", "current", "data", "paper"]
    eliminacionDeStopwords(frameworkData, frameworkStopwords)
    return frameworkData

In [164]:
def transformacionDeScopus(scopusData):
    scopusData = scopusData[["Title", "Abstract", "Author Keywords", "Index Keywords"]]
    scopusData['Author Keywords'] = scopusData['Author Keywords'].fillna("")
    scopusData['Index Keywords'] = scopusData['Index Keywords'].fillna("")
    scopusData['Abstract'] = scopusData['Abstract'] + " " + scopusData['Title'] + " " + scopusData['Author Keywords'] + " " +  scopusData['Index Keywords']
    scopusData = scopusData.drop(columns = ['Index Keywords', 'Author Keywords'])
    tokenizacionLematizacion(scopusData)
    scopusStopwords = ["project", "portfolio", "develop", "approach", "system", "tool", "used", "team", "current", "activity", "structure", "present", "data", "need", "within", "open", "right", "time", "paper", "proceeding", "new", "different", "towards", "case", "topic", "based"]
    eliminacionDeStopwords(scopusData, scopusStopwords)
    return scopusData

## 4. Preparación de conjunto de datos preentrenados

In [192]:
def crearModeloFramework(frameworkData):
    tfidfconverterF = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), encoding='latin-1', min_df=7, stop_words=('english'))
    featuresF = tfidfconverterF.fit_transform(frameworkData.Words).toarray()
    print(featuresF)
    labels = frameworkData['Criterios']
    smote = SMOTE(random_state=0, k_neighbors=1)
    featuresSmote, labelsSmote = smote.fit_resample(featuresF, labels)
    model = LinearSVC()
    print(featuresSmote.shape)
    model.fit(featuresSmote, labelsSmote);
    return model

## 5. Predicción de etiquetas y actualización de datos entrantes

In [203]:
def predecirCategorias(scopusData, model):
    tfidfconverterS = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), encoding='latin-1', min_df=380, stop_words=('english'))
    featuresS = tfidfconverterS.fit_transform(scopusData.Words).toarray()
    print(featuresS.shape)
    scopusData["Criterios"] = model.predict(featuresS)
    return scopusData

## 6. Carga de predicción de datos

In [167]:
def cargarPrediccion(scopusData):
    scopusData.to_csv('app/data/classifiedScopusData.csv')

## 7. Ejecución de pipeline

In [204]:
frameworkData = (
    cargaDeFramework()
    .pipe(criteriosACategorias)
    .pipe(transformacionDeFramework)
)
modeloLinearSVC = crearModeloFramework(frameworkData)
scopusData = (
    cargaDeScopus()
    .pipe(transformacionDeScopus)
    .pipe(predecirCategorias, model = modeloLinearSVC)
    .pipe(cargarPrediccion)
)

[[0.         0.24649292 0.         ... 0.         0.         0.28098211]
 [0.         0.         0.15290361 ... 0.         0.12723005 0.        ]
 [0.17022849 0.18575433 0.         ... 0.21960605 0.         0.        ]
 ...
 [0.28360093 0.         0.         ... 0.         0.         0.        ]
 [0.         0.23324442 0.         ... 0.         0.         0.        ]
 [0.         0.31206137 0.         ... 0.21789671 0.         0.        ]]
(90, 68)


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scopusData['Author Keywords'] = scopusData['Author Keywords'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scopusData['Index Keywords'] = scopusData['Index Keywords'].fillna("")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveat

(2382, 68)
