In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Global seaborn appearance for the plots in this notebook:
# 'darkgrid' axes style and the 'notebook' scaling context.
sns.set_style('darkgrid')
sns.set_context('notebook')

In [None]:
# Load the CSV, using its 'index' column as the DataFrame index, and display it.
df = pd.read_csv("./data2/ana2_BTS_geoOnly_210213_morning_CDT.csv",index_col='index')
df

In [None]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Descriptive statistics (pandas summarises numeric columns by default).
df.describe()

In [None]:
# Share of rows whose 'place' is missing (True) versus present (False).
df['place'].isnull().value_counts(normalize=True)

In [None]:
# Share of rows whose 'country' is missing (True) versus present (False).
df['country'].isnull().value_counts(normalize=True)

In [None]:
# Drop, in place, every row that has a missing value in any column,
# then re-print the summary to confirm the remaining row count.
df.dropna(axis=0, how='any', inplace=True)
df.info()

In [None]:
# Strip the first and last character of each 'place' string and split the
# remainder on commas, so every cell becomes a list of parts.
# NOTE(review): presumably the raw value is a bracketed list like "[a, b]" — confirm.
df['place'] = df['place'].str[1:-1].str.split(',')
df['place'].head()

In [None]:
# For every parsed 'place' list: everything except the last element goes to
# city_list, the last element goes to country_list.
city_list = [parts[:-1] for parts in df.place]
country_list = [parts[-1] for parts in df.place]
print('cities:\n', city_list, '\n')
print('countries:\n', country_list)

In [None]:
# Store city_list as column 'place_1' at position 6.
# The original passed allow_duplicates=True, so re-running the cell silently
# appended a second 'place_1' column; now a re-run refreshes the existing one.
if "place_1" in df.columns:
    df["place_1"] = city_list
else:
    df.insert(6, "place_1", city_list)
df.head()

In [None]:
# Store country_list as column 'place_2' at position 7.
# The original passed allow_duplicates=True, so re-running the cell silently
# appended a second 'place_2' column; now a re-run refreshes the existing one.
if "place_2" in df.columns:
    df["place_2"] = country_list
else:
    df.insert(7, "place_2", country_list)
df.head()

#### For debugging purposes: you can uncomment the lines below

In [None]:
#df.drop(['place_1','place_2'],axis=1,inplace=True)
#df.head()

In [None]:
# Remove the 'place' and 'time' columns from the working frame.
df = df.drop(columns=['place', 'time'])
df.head()

In [None]:
def top_count_bar_chart(xlist, ylist, col, title,
                        xlabel='Countries', ylabel='Twitter User Counts'):
    """Draw a vertical bar chart with a logarithmic y-axis.

    Parameters
    ----------
    xlist : sequence
        Bar positions/categories passed to ``ax.bar``.
    ylist : sequence
        Bar heights, one per entry of ``xlist``.
    col : colour spec
        Passed to ``ax.bar`` as ``color``.
    title : object
        Figure title (converted with ``str``).
    xlabel, ylabel : str, optional
        Axis labels; the defaults reproduce the previously hard-coded text.

    Returns
    -------
    None
        The figure is created on a new 12x6 matplotlib figure.
    """
    sns.set_context('notebook')
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12, 6))

    ax.bar(xlist, ylist, color=col)
    ax.set_title(str(title), size=24)

    # Style the tick labels matplotlib already placed instead of overwriting
    # them with set_xticklabels(), which is only safe after set_xticks() and
    # can otherwise attach labels to the wrong positions.
    ax.tick_params(axis='x', labelsize=14)
    plt.setp(ax.get_xticklabels(), rotation=30, ha='right')

    ax.set_xlabel(xlabel, size=24)
    ax.set_ylabel(ylabel, size=24)
    ax.set_yscale('log')

In [None]:
# Plot the 15 most frequent countries; the counts are computed once and reused
# for both the labels and the bar heights.
top_countries = df["country"].value_counts().head(15)
top_count_bar_chart(list(top_countries.index), list(top_countries.values), 'red',
                    "21/02/13 morning Twitter Filtered Streams")

In [None]:
# Rows per country (counted through the 'language' column), smallest first,
# flattened back into a two-column frame.
dfg = (
    df.groupby(['country'])['language']
    .count()
    .sort_values()
    .reset_index()
)
dfg

In [None]:
# Horizontal bar chart of the per-country counts held in dfg.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10,8))
ax.barh(list(dfg.country), list(dfg.language), color='blue')
ax.set_title('21/02/13 morning Twitter Filtered Streams', size=24)
ax.set_xlabel('Twitter User Counts', size=24)
ax.set_ylabel('Countries', size=24)
# Only restyle the tick labels. The previous set_xticklabels(ax.get_xticks())
# froze the raw float tick values as text and detached them from the locator;
# tick_params/setp keep matplotlib's own formatting and placement.
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)
plt.setp(ax.get_yticklabels(), rotation=30, ha='right')
plt.show()

In [None]:
# Number of rows for each (country, language) combination.
df.groupby(['country'])['language'].value_counts()

In [None]:
# Index of the grouped counts, i.e. the (country, language) pairs themselves.
df.groupby(['country'])['language'].value_counts().keys()

In [None]:
# Show the first (country, language) key and each of its two components.
first_pair = df.groupby(['country'])['language'].value_counts().keys()[0]
print(first_pair)
print(first_pair[0])
print(first_pair[1])

In [None]:
# Map every country to the list of languages observed for it, in the order
# the (country, language) keys appear in the grouped counts.
count_lan = defaultdict(list)
pair_counts = df.groupby(['country'])['language'].value_counts()
for country, lang in pair_counts.keys():
    count_lan[country].append(lang)
count_lan

#### For debugging purposes: you can uncomment/comment the lines below

In [None]:
#print(count_lan['Argentina'])
#print(count_lan['Belgium'])
#print(count_lan['Brazil'])

In [None]:
# Countries (keys) and their language lists (values) collected in count_lan.
print(count_lan.keys())
print(count_lan.values())

In [None]:
# Number of languages recorded per country.
count_lantype = {country: len(langs) for country, langs in count_lan.items()}
count_lantype

#### For debugging purposes: you can uncomment/comment the lines below

In [None]:
#count_lantype.pop(0)
#count_lantype.pop(1)
#count_lantype

In [None]:
# Same mapping as count_lantype, re-inserted in ascending order of the counts
# (dicts keep insertion order, so the result can be plotted directly).
sorted_count_lantype = dict(sorted(count_lantype.items(), key=lambda item: item[1]))
sorted_count_lantype

In [None]:
# Countries and their language counts in the sorted (ascending) order.
print(sorted_count_lantype.keys())
print(sorted_count_lantype.values())

In [None]:
# Horizontal bar chart of the number of languages per country.
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10,8))
ax.barh(list(sorted_count_lantype.keys()), list(sorted_count_lantype.values()), color='blue')
ax.set_title('21/02/13 morning Twitter Filtered Streams', size=24)
ax.set_xlabel('Language Type Counts', size=24)
ax.set_ylabel('Countries', size=24)
# Only restyle the tick labels. The previous set_xticklabels(ax.get_xticks())
# froze the raw float tick values as text and detached them from the locator;
# tick_params/setp keep matplotlib's own formatting and placement.
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)
plt.setp(ax.get_yticklabels(), rotation=30, ha='right')
plt.show()

In [None]:
# Re-display the country -> languages mapping before reshaping it below.
count_lan

In [None]:
# Three views of count_lan:
#   dict_geolang - plain dict country -> list of languages (empty lists skipped)
#   dict_geoList - the country repeated once per language
#   dict_lanList - the languages, aligned element-wise with dict_geoList
dict_geolang = {country: list(langs) for country, langs in count_lan.items() if langs}
dict_geoList = [country for country, langs in count_lan.items() for _ in langs]
dict_lanList = [lang for langs in count_lan.values() for lang in langs]
print(dict_geolang)
print(dict_geoList)
print(dict_lanList)

In [None]:
# Long format: one (Country, Language) entry per pair.
data = {'Country': dict_geoList,'Language': dict_lanList}

In [None]:
# Wide format: one entry per country, holding its whole list of languages.
data2 = {'Country': list(dict_geolang.keys()),'Language': list(dict_geolang.values())}

In [None]:
# Frame of (Country, Language) pairs, indexed by country.
df2 = pd.DataFrame(data, columns = ['Country','Language']).set_index('Country')
df2

In [None]:
# One row per country; the 'Language' cell holds that country's list of languages.
df5 = pd.DataFrame(data2, columns = ['Country','Language'])
df5

In [None]:
# Flatten the per-country language lists and show the distinct languages.
langtypes_list = [lang for langs in df5['Language'] for lang in langs]
print(set(langtypes_list))

In [None]:
# Copy df5 and add one float column of zeros per distinct language.
# The languages are sorted so the column order (and therefore the heatmap
# layout) is the same on every kernel run; iterating a bare set of strings
# gives an order that can change between Python processes.
df6 = df5.copy()
for lang in sorted(set(langtypes_list)):
    df6[lang] = 0.0
df6.columns

In [None]:
# Display the frame with its all-zero language columns.
df6

In [None]:
# Set the indicator to 1 for every language listed in a country's 'Language' cell.
for row_label, langs in df6['Language'].items():
    if not langs:
        continue
    for lang in langs:
        df6.loc[row_label, lang] = 1
df6

In [None]:
# Keep only the indicator columns and index the matrix by country.
df6 = df6.drop(columns=['Language']).set_index('Country')
df6

https://indianaiproduction.com/seaborn-heatmap/

https://stackoverflow.com/questions/34706845/change-xticklabels-fontsize-of-seaborn-heatmap

In [None]:
# Heatmap of the country x language indicator matrix (df6).
sns.set(font_scale=1.6)
plt.figure(figsize=(10, 10))
# Options forwarded to the colorbar through seaborn's cbar_kws argument.
cbar_kws = dict(
    orientation="vertical",
    shrink=1.,
    extend='min',
    extendfrac=0.05,
    ticks=np.arange(0, 2),
    drawedges=True,
)
sns.heatmap(
    df6,
    cmap='coolwarm',
    center=0.5,
    linewidths=2,
    square=True,
    annot=True,
    cbar_kws=cbar_kws,
)

In [None]:
# Load the same CSV a second time into df3 for the per-row analysis that follows.
df3 = pd.read_csv("./data2/ana2_BTS_geoOnly_210213_morning_CDT.csv",index_col='index')
df3.head()

In [None]:
# Drop the listed columns from df3, then discard rows with any missing value.
unused_columns = ['date', 'hour', 'retweets', 'min', 'place', 'replies',
                  'likes', 'quotes', 'sec', 'time', 'tweet']
df3p = df3.drop(columns=unused_columns).dropna(axis=0, how='any')
df3p

In [None]:
# All language values (one per row) and the distinct languages among them.
langs_list = list(df3p['language'])
print(set(langs_list))

In [None]:
# Copy df3p and add one float column of zeros per distinct language.
# The languages are sorted so the column order is the same on every kernel
# run; iterating a bare set of strings gives an order that can change
# between Python processes.
df4 = df3p.copy()
for lang in sorted(set(langs_list)):
    df4[lang] = 0.0
df4.columns

In [None]:
# One-hot encode 'language': each language column becomes 1.0 on the rows
# carrying that language and 0.0 elsewhere.
# This is done column-wise instead of with iterrows() + .loc per row, which
# was slow and, because .loc selects by label, would mark every row sharing
# an index label if the CSV's 'index' values were not unique.
for lang in df4['language'].unique():
    if lang:  # same truthiness filter as before (skips empty values)
        df4[lang] = (df4['language'] == lang).astype(float)
df4

In [None]:
# The indicator columns now carry the information, so drop 'language'.
df4 = df4.drop(columns=['language'])
df4

In [None]:
# Use 'country' as the row index (in place) and display the result.
df4.set_index('country', inplace=True)
df4

In [None]:
#dfgp = df.groupby(['country'])
#for name, group in dfgp:
#    print(name, ' ', group['language'])
#    print('--------------------------------')

In [None]:
#loc_lan = defaultdict(list)
#for name, group in dfgp:
#    for i in group['language']:
#        loc_lan[name].append(group['language'])
#loc_lan