# Obtenga la matriz de correlación para saber idiomas distintos considerando que un usuario sabe un idioma si indicó un nivel de 1 o superior (⭐⭐)

In [398]:
import pandas as pd
lang = pd.read_csv('../data/languages.csv')
lang

Unnamed: 0,babel_user,babel_lang,babel_level
0,4502458,FR,2
1,5928200,aa,0
2,46918,ab,0
3,2050449,ab,1
4,4715583,ace,0
...,...,...,...
35661,2089885,zh-Hant-HK,1
35662,2089885,zh-Hant-TW,3
35663,5036533,zu,0
35664,5928200,zu,0


In [399]:
lang.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35666 entries, 0 to 35665
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   babel_user   35666 non-null  int64 
 1   babel_lang   35662 non-null  object
 2   babel_level  35666 non-null  object
dtypes: int64(1), object(2)
memory usage: 836.1+ KB


In [400]:
lang.babel_level.unique()

array(['2', '0', '1', '3', '4', 'N', '5'], dtype=object)

In [401]:
# Cambio el dtype de babeL_level a enteros.
# Asigno a los nativos a nivel 5 y los xx-5 a nivel 6.
def traducir_nivel(string):
    traduccion = {
        '0':0,
        '1':1,
        '2':2,
        '3':3,
        '4':4,
        'N':5,
        '5':6,
    }

    return traduccion[string]

lang.babel_level = lang.babel_level.map(traducir_nivel)
# Considero las provenencias de los idiomas como el mismo idioma.
lang.babel_lang = lang.babel_lang.map(lambda x: str(x).split('-')[0].lower())
lang.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35666 entries, 0 to 35665
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   babel_user   35666 non-null  int64 
 1   babel_lang   35666 non-null  object
 2   babel_level  35666 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 836.1+ KB


In [402]:
# Filtro los entries que tienen un nivel de babel menor a 1.
idiomas = lang[lang.babel_level >= 1]
idiomas

Unnamed: 0,babel_user,babel_lang,babel_level
0,4502458,fr,2
3,2050449,ab,1
6,4494742,acf,1
8,70090,af,1
9,407454,af,1
...,...,...,...
35659,5915818,zh,5
35660,2089885,zh,4
35661,2089885,zh,1
35662,2089885,zh,3


In [403]:
# Organizo un poco el df, quito la columna babel_level porque ya no la necesito.
idiomas = idiomas.drop('babel_level', axis=1)
idiomas = idiomas.rename(columns={'babel_user':'user', 'babel_lang':'lang'})
idiomas

Unnamed: 0,user,lang
0,4502458,fr
3,2050449,ab
6,4494742,acf
8,70090,af
9,407454,af
...,...,...
35659,5915818,zh
35660,2089885,zh
35661,2089885,zh
35662,2089885,zh


In [404]:
# Genero una pivot table que muestra que idiomas sabe cada usuario.
idiomas = idiomas.pivot_table(index='user', columns='lang', aggfunc='size', fill_value=0)
idiomas

lang,ab,acf,af,agr,akk,aln,am,an,ang,ar,...,war,wuu,xcw,yi,yua,yue,zap,zea,zh,zza
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
38,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
262,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6318249,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6327193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6327824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6331490,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [405]:
# Consigo la matriz de correlacion de los idiomas.
idiomas.corr()

lang,ab,acf,af,agr,akk,aln,am,an,ang,ar,...,war,wuu,xcw,yi,yua,yue,zap,zea,zh,zza
lang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ab,1.000000,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,...,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,-0.001218,-0.000093
acf,-0.000093,1.000000,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,...,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,-0.001218,-0.000093
af,-0.000293,-0.000293,1.000000,-0.000293,-0.000293,-0.000414,-0.000414,-0.002216,-0.000717,0.021380,...,-0.000293,-0.000293,-0.000293,-0.000828,-0.000414,-0.000878,-0.000293,-0.000293,-0.003852,-0.000293
agr,-0.000093,-0.000093,-0.000293,1.000000,1.000000,-0.000131,-0.000131,-0.000700,-0.000227,0.078226,...,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,0.065250,-0.000093
akk,-0.000093,-0.000093,-0.000293,1.000000,1.000000,-0.000131,-0.000131,-0.000700,-0.000227,0.078226,...,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,0.065250,-0.000093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yue,-0.000278,-0.000278,-0.000878,-0.000278,-0.000278,-0.000393,-0.000393,-0.002102,-0.000680,-0.003550,...,-0.000278,0.333210,-0.000278,-0.000786,-0.000393,1.000000,-0.000278,-0.000278,0.284477,-0.000278
zap,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,...,-0.000093,-0.000093,-0.000093,-0.000262,0.707074,-0.000278,1.000000,-0.000093,-0.001218,-0.000093
zea,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,...,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,1.000000,-0.001218,-0.000093
zh,-0.001218,-0.001218,-0.003852,0.065250,0.065250,-0.001722,-0.001722,0.017264,0.024158,0.058321,...,-0.001218,0.065250,-0.001218,-0.003445,-0.001722,0.284477,-0.001218,-0.001218,1.000000,-0.001218
