## Import Libraries

In [1]:
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from scipy.sparse import csr_matrix

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import lil_matrix

import pickle
import nltk
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from unicodedata import normalize

## Define functions

In [2]:
#concatenate
def concat(*args):
    """Join the string form of every argument with commas.

    Each argument is converted with ``str`` before joining. When called
    with no arguments at all, ``np.nan`` is returned instead of a string.
    """
    if not args:
        return np.nan
    return ','.join(map(str, args))

#remove portuguese characters
def remove_special(txt, codif='latin-1'):
    """Strip accents and any other non-ASCII characters from ``txt``.

    The text is decomposed with Unicode NFKD normalisation (so an accented
    letter becomes base letter + combining mark) and then encoded to ASCII
    with errors ignored, which drops the combining marks and every other
    character that has no ASCII form.

    Parameters
    ----------
    txt : bytes or text
        Byte strings are first decoded with ``codif``. Text (unicode)
        strings are used as they are; ``codif`` is ignored for them.
    codif : str
        Encoding used to decode byte-string input (default ``'latin-1'``).

    Returns
    -------
    bytes for byte-string input (same as the original implementation),
    native ``str`` for text input.
    """
    if isinstance(txt, bytes):
        # Original code path: decode -> normalise -> ASCII bytes.
        return normalize('NFKD', txt.decode(codif)).encode('ASCII', 'ignore')
    # Text has no (useful) .decode(); normalise it directly. The result is
    # pure ASCII, so converting it back to the native str type is lossless.
    stripped = normalize('NFKD', txt).encode('ASCII', 'ignore')
    return str(stripped.decode('ASCII'))


# Run any doctests found in this module's docstrings when it is executed as
# __main__ (testmod() with no arguments inspects the __main__ module).
if __name__ == '__main__':
    from doctest import testmod
    testmod()
    
#create vocablist from category    
def get_token(text):
    """Return the distinct tokens of ``text``, most frequent first.

    ``text`` is split with ``nltk.word_tokenize``; tokens that are
    Portuguese stop words or have two characters or fewer are discarded.
    The survivors are counted with ``nltk.FreqDist`` and returned once
    each, in ``most_common()`` order (descending frequency).

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        The vocabulary of ``text``; empty when nothing survives filtering.
    """
    # A set gives O(1) membership tests; the corpus list would be scanned
    # once per token.
    # NOTE(review): if `text` has already had its accents stripped, accented
    # entries of the stop-word list can no longer match - confirm intended.
    stopwords = set(nltk.corpus.stopwords.words('portuguese'))
    kept = [w for w in nltk.word_tokenize(text)
            if len(w) > 2 and w not in stopwords]
    # Every word counted by FreqDist has a frequency of at least 1, so no
    # further frequency filter is needed.
    return [w for w, _ in nltk.FreqDist(kept).most_common()]

#return the respective vector when comparing to vocablist
def get_vector(text, tokens):
    """Build a binary document-term matrix.

    Parameters
    ----------
    text : sequence of str
        Documents, one per output row (e.g. a pandas Series or a list).
    tokens : sequence of str
        Vocabulary; the token at position ``i`` maps to column ``i``.

    Returns
    -------
    scipy.sparse.lil_matrix
        Shape ``(len(text), len(tokens))``. Entry ``[j, i]`` is 1 when
        ``tokens[i]`` is among ``get_token`` of the j-th document and is
        left unset (0) otherwise.
    """
    vec = lil_matrix((len(text), len(tokens)))
    # Walk the documents by position: indexing a pandas Series with text[j]
    # is label-based and fails (or picks another row) when the index is not
    # exactly 0..n-1, e.g. after rows were dropped.
    for j, document in enumerate(text):
        # Set lookup keeps the inner loop linear in the vocabulary size.
        present = set(get_token(document))
        for i, token in enumerate(tokens):
            if token in present:
                vec[j, i] = 1
    return vec

def get_models(category,y,vocab_known,app):
    """Fit one LogisticRegression per column of ``category``.

    For every column, the entries of ``vocab_known[column]`` selected by
    ``app`` are joined into one text, turned into a vocabulary with
    ``get_token``, and that vocabulary is used to vectorise the whole
    ``category[column]`` with ``get_vector``. A classifier is then fitted
    on those features against ``y``.

    Returns the fitted classifiers as a list, in column order.
    """
    fitted = []
    for col in category:
        selected = vocab_known[col][app]
        corpus = ' '.join(''.join(sentence) for sentence in selected)
        vocabulary = get_token(corpus)
        features = get_vector(category[col], vocabulary)
        classifier = LogisticRegression()
        classifier.fit(features, y)
        fitted.append(classifier)
    return fitted

def get_vocabs(data, ind):
    """Build one vocabulary per column of ``data``.

    For every column, the entries selected by ``ind`` are joined with
    spaces into a single text, which is passed to ``get_token``.

    Returns the vocabularies as a list, in column order.
    """
    return [
        get_token(' '.join(''.join(sentence) for sentence in data[col][ind]))
        for col in data
    ]

def extra_remove(text):
    """Delete a fixed set of stray high-byte characters from ``text``.

    Every occurrence of each listed character is removed; all other
    characters are left untouched. Returns the cleaned string.
    """
    unwanted = (
        '\xc3', '\x89', '\xa7', '\xa1', '\xa3', '\xad',
        '\xaa', '\xa9', '\xb5', '\xb3', '\xa0', '\xb4',
    )
    for junk in unwanted:
        text = text.replace(junk, '')
    return text

## Load general data

In [3]:
# Data directory. The trailing slash is required because file names are
# appended by plain string concatenation here and in later cells.
# NOTE(review): user-specific absolute path; adjust when running elsewhere.
path  = '/Users/andremendes/OneDrive/Estudar/Estudar Projects/JFS/Data-Analysis-JFS-Matlab-Python/Data/'

# Text answers plus the Result column.
general = pd.read_csv(path + 'text_data_csv.csv')

# Values of this file are flattened into one list: the row labels of the
# records whose result is unknown.
teste = pd.read_csv(path + 'index_unknown.csv')
index = list(teste.values.ravel())

labels = general['Result']

## Concatenate all data per category

In [4]:
category = pd.DataFrame(columns = ['extra activities','honors','life project', 'all text'])
np_concat = np.vectorize(concat)

# Positional column ranges of `general` that feed each concatenated category.
column_spans = [
    ('extra activities', range(1, 6)),
    ('honors', range(7, 9)),
    ('all text', range(1, 10)),
]

# Concatenate, row by row, all text columns belonging to each category.
for target, positions in column_spans:
    # Seed with genuinely empty strings: np.empty leaves the buffer
    # uninitialised, so its contents are arbitrary. Because the seed is joined
    # too, every resulting string starts with a comma (as before).
    aux = np.full(len(general), '', dtype=str)
    for i in positions:
        aux = np_concat(aux, general.iloc[:, i])
    category[target] = aux
category['life project'] = np.array(general.iloc[:, 9], dtype=str)

# Remove special characters from every category column.
for column in category:
    category[column] = category[column].apply(remove_special)

## Create Vocabularies and Models

In [5]:
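#One-off training step, kept commented out: builds the vocabularies from
#approved candidates with known results, fits the models, and pickles both.
#NOTE: get_models needs y (= labels), which is only assigned in the next cell.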
#vocab_app_known = category.drop(index)
#vocab_labels = labels.drop(index)
#app = vocab_labels[vocab_labels==1].index
#vocabs = get_vocabs(vocab_app_known,app)
#with open(path+'vocab_app.pkl', 'wb') as fid: pickle.dump(vocabs, fid)
    
#models = get_models(category,y,vocab_app_known,app)
#with open(path+'models.pkl', 'wb') as fid: pickle.dump(models, fid)    

## Load models and vocabularies

In [6]:
y = labels
scores = pd.DataFrame(columns = ['extra activities','honors','life project', 'all text'])

# SECURITY: pickle.load can execute arbitrary code - only load files that
# were produced by a trusted run of this notebook.
with open(path + 'vocab_app.pkl', 'rb') as fid:
    vocabs = pickle.load(fid)
with open(path + 'models.pkl', 'rb') as fid:
    models = pickle.load(fid)

# Reset the column position counter.
i = 0

## Get Scores

In [7]:
# vocabs and models are positional lists aligned with the columns of
# `category`. enumerate() restarts the position at 0 on every run, so this
# cell can be re-executed on its own without running past the end of the lists.
for i, column in enumerate(category):
    vocab = vocabs[i]
    clf = models[i]
    X = get_vector(category[column], vocab)
    predicted = clf.predict_proba(X)
    # Keep the second predict_proba column as the score.
    # NOTE(review): presumably the probability of Result == 1 - confirm
    # against clf.classes_.
    scores[column] = predicted[:, 1]

## Save scores

In [8]:
# Write the per-category scores next to the input data.
scores_file = path + "text_scores.csv"
scores.to_csv(scores_file)