In [80]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [180]:
import pandas as pd
import nltk 
from nltk.corpus import stopwords
import collections
import sys
import codecs
# Candidate names: used as file stems when reading/writing transcripts and as
# the row/column labels of the result tables.
candidates = ['francisco', 'pablo', 'calixto', 'rodolfo']
# Show every row when a DataFrame is displayed. The fully qualified key is
# required: the abbreviated "max_rows" pattern also matches
# "styler.render.max_rows" in recent pandas releases and raises OptionError.
pd.set_option("display.max_rows", None)

In [181]:
# Spanish stop words; tokenize_column drops any token found in this set.
# The corpus is not shipped with the nltk package itself, so a fresh
# environment raises LookupError here: download it once and retry.
try:
    stop_words = set(stopwords.words('spanish'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('spanish'))

In [182]:
def write_file(route, transcription, name):
    """Write a transcript to ``{route}/Transcripcion/{name}.txt`` encoded as UTF-8.

    Parameters
    ----------
    route : str
        Base folder; the file is placed in its ``Transcripcion`` sub-folder.
    transcription : str
        Text to write. Any existing file with the same name is overwritten.
    name : str
        File stem (without the ``.txt`` extension).

    Returns
    -------
    None

    Notes
    -----
    The output folder is created when it does not exist, and the file handle
    is released even if the write raises.
    """
    import os  # local import so this cell does not depend on the import cell changing

    out_dir = f'{route}/Transcripcion'
    os.makedirs(out_dir, exist_ok=True)
    # codecs.open is kept (instead of built-in open) because it performs no
    # newline translation, so the bytes on disk stay the same on every platform.
    with codecs.open(f'{out_dir}/{name}.txt', "w", "utf-8") as file:
        file.write(transcription)

In [183]:
def load_data(route):
    entities = pd.read_json(route)
    data = pd.DataFrame([[i["Text"], i["Type"], i["Score"]] for i in entities["Entities"]]
                            , columns=['Entitie', 'Type', 'Score'])
    return data.sort_values('Score', ascending=False).reset_index().drop(columns=['index'])

In [184]:
def load_transcripcion(facultad):
    """Load every candidate's transcript for one faculty folder.

    For each name in the module-level ``candidates`` list this reads
    ``{facultad}/Subtitulos/{name}.json``, takes the ``'transcript'`` field of
    the first element stored at row ``'transcripts'`` / column ``'results'``,
    and saves that text through ``write_file(facultad, text, name)``.

    Returns a DataFrame indexed by candidate name with a single
    ``transcription`` column holding the raw text.
    """
    # Read all subtitle files first, then extract, then write: same order as before.
    subtitle_frames = []
    for candidate in candidates:
        subtitle_frames.append(
            pd.read_json(f'{facultad}/Subtitulos/{candidate}.json', encoding='utf-8')
        )

    texts = [frame.loc['transcripts']['results'][0]['transcript'] for frame in subtitle_frames]

    for candidate, text in zip(candidates, texts):
        write_file(facultad, text, candidate)

    return pd.DataFrame(texts, index=candidates, columns=['transcription'])

In [185]:
def load_discurs(data):
    """Split a speech into lower-cased words, one word per row.

    ``data`` is lower-cased, every ``.`` and ``,`` is removed, and the result
    is split on single spaces (consecutive spaces therefore yield empty
    strings). Returns a one-column DataFrame whose column label is ``0``.
    """
    strip_punctuation = str.maketrans('', '', '.,')
    cleaned = data.lower().translate(strip_punctuation)
    words = cleaned.split(' ')
    return pd.DataFrame(words)

In [186]:
def process_most_common(df, name, n):
    """Return the ``n`` most frequent tokens for one row of ``df``.

    Reads the token list stored at ``df.loc[name, 'valid_words']`` and returns
    ``[(token, count), ...]`` as produced by ``Counter.most_common(n)``.
    """
    row_tokens = df.loc[name, 'valid_words']
    frequencies = collections.Counter(row_tokens)
    return frequencies.most_common(n)

In [187]:
def valid_words(df):
    """Count the tokens that survive ``tokenize_column`` for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``transcription`` column of raw text.

    Returns
    -------
    pandas.DataFrame
        One ``valid_words`` column holding, per row, the number of tokens
        returned by ``tokenize_column(df, 'transcription')``. Rows keep the
        labels of ``df`` (the candidate names for frames built by
        ``load_transcripcion``).
    """
    tokens = tokenize_column(df, 'transcription')
    # Count per label rather than via tokens[i] with an integer i: integer
    # lookups on a string-labelled Series rely on a deprecated positional
    # fallback, and would raise if dropna() inside tokenize_column removed a row.
    return tokens.apply(len).to_frame(name='valid_words')

In [188]:
def tabulate_common(df, n):
    """Build a table with the ``n`` most frequent tokens of every candidate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by candidate name with a ``transcription`` column.
        A ``valid_words`` column with the filtered tokens is added to (or
        overwritten on) this frame as a side effect.
    n : int
        How many of the most frequent tokens to keep per candidate.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``candidates`` and one row per rank
        (0 = most frequent); each cell is a ``(token, count)`` tuple.
    """
    # process_most_common reads this column, so it has to live on df itself.
    df['valid_words'] = tokenize_column(df, 'transcription')
    per_candidate = [process_most_common(df, name, n) for name in candidates]
    # Label the rows with the candidate names before transposing, instead of
    # renaming positions 0..3 afterwards: that only worked for exactly four
    # candidates.
    return pd.DataFrame(per_candidate, index=candidates).transpose()

In [189]:
def tokenize_column(df, column_name):
    """Tokenize one text column and keep only the meaningful words.

    Rows of ``df`` containing any missing value are dropped first. Each
    remaining row's ``column_name`` text is split with ``nltk.word_tokenize``;
    then, in this order, non-alphabetic tokens are discarded, the rest are
    lower-cased, and tokens present in the module-level ``stop_words`` set are
    removed.

    Returns a Series (indexed like the surviving rows) of token lists.
    """
    def _keep_meaningful(tokens):
        # Same three passes as separate steps: alphabetic -> lower -> not a stop word.
        alphabetic = [token for token in tokens if token.isalpha()]
        lowered = [token.lower() for token in alphabetic]
        return [word for word in lowered if word not in stop_words]

    complete_rows = df.dropna()
    raw_tokens = complete_rows.apply(
        lambda row: nltk.word_tokenize(row[column_name]), axis=1
    )
    return raw_tokens.apply(_keep_meaningful)


# Facultad de Ingeniería

In [190]:
# Read each candidate's transcript from Ingenieria/Subtitulos and save a .txt
# copy under Ingenieria/Transcripcion.
transcription = load_transcripcion('Ingenieria')

In [191]:
# Per candidate: how many tokens remain after filtering, and the 5 most frequent ones.
# Note: tabulate_common also stores the token lists in transcription['valid_words'].
palabras_utiles = valid_words(transcription)
palabras_comunes = tabulate_common(transcription, 5 )


In [192]:
# Token count per candidate (Ingenieria).
palabras_utiles

Unnamed: 0,valid_words
francisco,1827
pablo,2913
calixto,1847
rodolfo,2171


In [193]:
# Top-5 (token, count) pairs per candidate (Ingenieria).
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(universidad, 60)","(universidad, 47)","(universidad, 90)","(universidad, 60)"
1,"(ingeniería, 25)","(facultad, 28)","(estudiantes, 26)","(ser, 22)"
2,"(hoy, 17)","(ingeniería, 24)","(calidad, 21)","(hacer, 21)"
3,"(estudiantes, 16)","(estudiantes, 23)","(rector, 19)","(tener, 21)"
4,"(supuesto, 16)","(profesores, 20)","(hacer, 19)","(magdalena, 18)"


# Facultad de Ciencias de la Salud

In [194]:
# Rebinds `transcription` to the Salud transcripts (read from Salud/Subtitulos,
# .txt copies saved under Salud/Transcripcion).
transcription = load_transcripcion('Salud')

In [195]:
# Per candidate: how many tokens remain after filtering, and the 5 most frequent ones.
# Note: tabulate_common also stores the token lists in transcription['valid_words'].
palabras_utiles = valid_words(transcription)
palabras_comunes = tabulate_common(transcription, 5 )

In [196]:
# Token count per candidate (Salud).
palabras_utiles

Unnamed: 0,valid_words
francisco,1752
pablo,2838
calixto,2110
rodolfo,2117


In [197]:
# Top-5 (token, count) pairs per candidate (Salud).
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(universidad, 37)","(universidad, 51)","(universidad, 95)","(universidad, 43)"
1,"(supuesto, 32)","(estudiantes, 36)","(estudiantes, 31)","(ser, 21)"
2,"(estudiantes, 28)","(salud, 32)","(hacer, 24)","(hacer, 18)"
3,"(programa, 21)","(si, 25)","(calidad, 19)","(magdalena, 17)"
4,"(ejemplo, 21)","(mil, 22)","(voy, 14)","(vamos, 15)"


# Facultad de Humanidades

In [198]:
# Rebinds `transcription` to the Humanidades transcripts (read from
# Humanidades/Subtitulos, .txt copies saved under Humanidades/Transcripcion).
transcription = load_transcripcion('Humanidades')

In [199]:
# Same two summaries as for the other faculties, computed in the opposite order:
# tabulate_common adds transcription['valid_words'] first, then valid_words
# tokenizes the 'transcription' column again to count the tokens.
palabras_comunes = tabulate_common(transcription, 5 )
palabras_utiles = valid_words(transcription)

In [200]:
# Token count per candidate (Humanidades).
palabras_utiles

Unnamed: 0,valid_words
francisco,1851
pablo,2577
calixto,2041
rodolfo,2416


In [201]:
# Top-5 (token, count) pairs per candidate (Humanidades).
palabras_comunes

Unnamed: 0,francisco,pablo,calixto,rodolfo
0,"(investigación, 27)","(universidad, 40)","(universidad, 83)","(universidad, 61)"
1,"(programa, 27)","(derecho, 28)","(rector, 21)","(tener, 23)"
2,"(estudiantes, 25)","(estudiantes, 27)","(propuesta, 17)","(estudiantes, 22)"
3,"(supuesto, 25)","(hoy, 23)","(entonces, 16)","(hacer, 21)"
4,"(universidad, 24)","(hacer, 17)","(facultad, 15)","(magdalena, 19)"
