In [80]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [202]:
import pandas as pd
import nltk 
from nltk.corpus import stopwords
import collections
import sys
import codecs
n_palabras_comunes = 10
candidates = ['francisco','pablo','calixto','rodolfo',]
pd.set_option("max_rows", None)

In [181]:
stop_words = set(stopwords.words('spanish'))

In [182]:
def write_file(route, transcription, name):
    file = codecs.open(f'{route}/Transcripcion/{name}.txt',"w","utf-8")#write mode 
    file.write(transcription) 
    file.close() 

In [183]:
def load_data(route):
    entities = pd.read_json(route)
    data = pd.DataFrame([[i["Text"], i["Type"], i["Score"]] for i in entities["Entities"]]
                            , columns=['Entitie', 'Type', 'Score'])
    return data.sort_values('Score', ascending=False).reset_index().drop(columns=['index'])

In [184]:
def load_transcripcion(facultad):
    files =[pd.read_json(f'{facultad}/Subtitulos/{name}.json', encoding='utf-8') for name in candidates]
    transcription = [sub.loc['transcripts']['results'][0]['transcript'] for sub in files]
    for i in range(len(transcription)):
        write_file(facultad, transcription[i],candidates[i])
    return pd.DataFrame(transcription, index= candidates, columns= ['transcription'])

In [185]:
def load_discurs(data):
    text = pd.DataFrame(data.lower().replace('.', "").replace(',',"").split(' '))
    return text

In [186]:
def process_most_common(df, name, n):
    return collections.Counter(df.loc[name, 'valid_words']).most_common(n)

In [187]:
def valid_words(df):
    tokens = tokenize_column(df, 'transcription')
    valid_words = pd.DataFrame([len(tokens[i]) for i in range(len(candidates))], index=candidates, columns=['valid_words'])
    return valid_words

In [188]:
def tabulate_common(df,n):
    df['valid_words'] = tokenize_column(df, 'transcription')
    most_common = [process_most_common(df, name, n) for name in candidates]
    most_common =  pd.DataFrame([i for i in most_common ])
    most_common= most_common.transpose().rename(columns={ i:candidates[i] for i in range(4)})
    return most_common

In [189]:
def tokenize_column(df, column_name):
    return(df
            .dropna()
            .apply(lambda row: nltk.word_tokenize(row[column_name]), axis=1)
            .apply(lambda tokens: list(filter(lambda token: token.isalpha(),tokens)))
            .apply(lambda tokens: list(map(lambda token: token.lower(), tokens)))
            .apply(lambda word_list: list(filter(lambda word: word not in stop_words, word_list)))       
    )


In [413]:
def load_sentiment(route):
    all_sentiment = [pd.read_json(f'{route}/Analisis_de_Sentimiento/{name}.json') for name in candidates]
    columns=['Mixed','Negative','Neutral','Positive']
    sentiment = pd.DataFrame([], columns=columns, index=candidates)
    for i in range(len(all_sentiment)):
        temp = pd.DataFrame([])
        for g in range(all_sentiment[i]['Result'].size):
            temp = temp.append([all_sentiment[i]['Result'][g]['SentimentScore']],  ignore_index=True)
        sentiment.loc[candidates[i]] = temp.mean()
    return sentiment

# Facultad de Ingeniería

In [190]:
transcription = load_transcripcion('Ingenieria')

In [203]:
palabras_utiles = valid_words(transcription)
palabras_comunes = tabulate_common(transcription, n_palabras_comunes)

In [204]:
palabras_utiles

Unnamed: 0,valid_words
francisco,1851
pablo,2577
calixto,2041
rodolfo,2416


In [205]:
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(investigación, 27)","(universidad, 40)","(universidad, 83)","(universidad, 61)"
1,"(programa, 27)","(derecho, 28)","(rector, 21)","(tener, 23)"
2,"(estudiantes, 25)","(estudiantes, 27)","(propuesta, 17)","(estudiantes, 22)"
3,"(supuesto, 25)","(hoy, 23)","(entonces, 16)","(hacer, 21)"
4,"(universidad, 24)","(hacer, 17)","(facultad, 15)","(magdalena, 19)"
5,"(tema, 20)","(ser, 15)","(va, 15)","(vamos, 19)"
6,"(posibilidad, 19)","(dos, 15)","(estudiantes, 14)","(importante, 17)"
7,"(puedan, 17)","(profesores, 14)","(hacer, 13)","(ser, 17)"
8,"(hacer, 14)","(cuatro, 14)","(ser, 13)","(nivel, 17)"
9,"(tener, 14)","(cine, 14)","(aquí, 12)","(van, 15)"


# Facultad de Ciencias de la Salud

In [194]:
transcription = load_transcripcion('Salud')

In [206]:
palabras_utiles = valid_words(transcription)
palabras_comunes = tabulate_common(transcription, n_palabras_comunes)

In [207]:
palabras_utiles

Unnamed: 0,valid_words
francisco,1851
pablo,2577
calixto,2041
rodolfo,2416


In [208]:
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(investigación, 27)","(universidad, 40)","(universidad, 83)","(universidad, 61)"
1,"(programa, 27)","(derecho, 28)","(rector, 21)","(tener, 23)"
2,"(estudiantes, 25)","(estudiantes, 27)","(propuesta, 17)","(estudiantes, 22)"
3,"(supuesto, 25)","(hoy, 23)","(entonces, 16)","(hacer, 21)"
4,"(universidad, 24)","(hacer, 17)","(facultad, 15)","(magdalena, 19)"
5,"(tema, 20)","(ser, 15)","(va, 15)","(vamos, 19)"
6,"(posibilidad, 19)","(dos, 15)","(estudiantes, 14)","(importante, 17)"
7,"(puedan, 17)","(profesores, 14)","(hacer, 13)","(ser, 17)"
8,"(hacer, 14)","(cuatro, 14)","(ser, 13)","(nivel, 17)"
9,"(tener, 14)","(cine, 14)","(aquí, 12)","(van, 15)"


# Facultad de Humanidades

In [198]:
transcription = load_transcripcion('Humanidades')

In [414]:
palabras_comunes = tabulate_common(transcription, n_palabras_comunes )
palabras_utiles = valid_words(transcription)
sentiment = load_sentiment('Ingenieria')

In [415]:
palabras_utiles

Unnamed: 0,valid_words
francisco,1851
pablo,2577
calixto,2041
rodolfo,2416


In [416]:
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(investigación, 27)","(universidad, 40)","(universidad, 83)","(universidad, 61)"
1,"(programa, 27)","(derecho, 28)","(rector, 21)","(tener, 23)"
2,"(estudiantes, 25)","(estudiantes, 27)","(propuesta, 17)","(estudiantes, 22)"
3,"(supuesto, 25)","(hoy, 23)","(entonces, 16)","(hacer, 21)"
4,"(universidad, 24)","(hacer, 17)","(facultad, 15)","(magdalena, 19)"
5,"(tema, 20)","(ser, 15)","(va, 15)","(vamos, 19)"
6,"(posibilidad, 19)","(dos, 15)","(estudiantes, 14)","(importante, 17)"
7,"(puedan, 17)","(profesores, 14)","(hacer, 13)","(ser, 17)"
8,"(hacer, 14)","(cuatro, 14)","(ser, 13)","(nivel, 17)"
9,"(tener, 14)","(cine, 14)","(aquí, 12)","(van, 15)"


In [417]:
sentiment

Unnamed: 0,Mixed,Negative,Neutral,Positive
francisco,0.125788,0.0170564,0.614211,0.242945
pablo,0.0716548,0.0559598,0.531929,0.340457
calixto,0.0140588,0.0196071,0.652026,0.314308
rodolfo,0.0145522,0.0852771,0.586234,0.313937
