In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from tools.record_tools import complete_records
import seaborn as sns
from scipy.stats import entropy
sns.set()

# Add missing information / Complete records

## Politics

In [6]:

# Load the exported records; the first CSV column is used as the index.
path = Path('output_data/MPD_export_1846_1920.csv')
df = pd.read_csv(path, index_col=0)

# Shape of the rows that carry a political label, before completion ...
print(df.loc[df['S-POL'].notnull()].shape)

# ... and after the project helper `complete_records` has processed 'S-POL'.
df = complete_records(df, 'S-POL')
print(df.loc[df['S-POL'].notnull()].shape)


(36548, 19)
(38050, 20)


## Price

In [7]:
# Lookup table from price strings as they occur in 'S-PRICE' to a numeric
# value in pence (the entries use 12 pence per shilling and 240 pence per
# pound, e.g. '1s' -> 12, '20s' -> 240, '£2 ' -> 480).
# Keys are matched verbatim, so spelling variants and stray spaces are kept
# exactly as they appear in the data.
convert_price = {
    # --- pence, written with a trailing 'd' (including garbled variants) ---
    '1d': 1, '2d': 2, '3d': 3, '4d': 4, '5d': 5, '6d': 6,
    '7d': 7, '8d': 8, '9d': 9,
    '½ d': .5, ' ½ d': .5, '½<CON>d': .5,
    '1 ½ d': 1.5, '1 ½ 4': 1.5,
    '2 ½ d': 2.5, '3 ½ d': 3.5, '4 ½ d': 4.5, '5 ½ d': 5.5,
    # --- bare numbers, taken as pence ---
    '1': 1, '2': 2, '6': 6,
    '1 ½': 1.5, '3 ½': 3.5, '4 ½': 4.5,
    # --- prices written as words ---
    'Gratis': .0, 'gratis': 0, 'Gra': 0,
    'One Halfpenny': .5, 'Halfpenny': .5, 'One Penny': 1,
    'Sixpence': 6,      # was 72 (= six shillings); sixpence is 6 pence
    # --- shillings (12 pence each) ---
    '1s': 12, '2s': 24, '3s': 36, '4s': 48, '5s': 60, '6s': 72,
    '7s': 84, '8s': 96, '10s': 120, '12s': 144, '13s': 156,
    '15s': 180,         # was 189; 15 * 12 = 180
    '20s': 240, '21s': 252, '24s': 288, '26s': 312,
    '40s': 480, '42s': 504,
    # --- pounds and shillings (240 pence per pound) ---
    '£2 ': 480,
    '£ 1 5s': 300,      # was 252; 240 + 5 * 12 = 300
    '£ 1 10s': 360, '£ 1 12s': 384, '£ 1 19s': 468,
    '£ 2 2s': 504, '£ 2 16s': 672, '£ 3 3s': 756,
}


def _first_listed_price(raw):
    """Return the text before the first '<SEP>' marker; non-strings pass through unchanged."""
    return raw.split('<SEP>')[0] if isinstance(raw, str) else raw


# Take the first listed price of each record and look it up in convert_price;
# anything without an entry in the table (including missing values) becomes null.
df['first_price'] = (
    df['S-PRICE']
    .apply(_first_listed_price)
    .apply(lambda token: convert_price.get(token, None))
)

# Shape of the rows with a usable price, before and after `complete_records`.
print(df.loc[df['first_price'].notnull()].shape)
df = complete_records(df, 'first_price')
print(df.loc[df['first_price'].notnull()].shape)

(40609, 21)
(42200, 22)


In [8]:
# Show the column labels of df at this point in the notebook.
df.columns

Index(['index', 'id', 'S-TITLE', 'S-POL', 'CATEGORY', 'DISTRICT',
       'DISTRICT_PUB', 'COUNTY', 'S-PRICE', 'D-EST', 'D-PUB', 'E-LOC', 'E-ORG',
       'E-PER', 'S-TITLE-ALT', 'TEXT', 'DISTRICT_DESCRIPTION', 'year',
       'chain_id', 'value_S-POL_source_idx', 'first_price',
       'value_first_price_source_idx'],
      dtype='object')

In [9]:
# Flag the records that have a non-null value in the 'JISC' column.
# The flag is built in a single assignment: if 'JISC' is not present in df the
# lookup raises KeyError *before* anything is written, so no half-initialised,
# all-False 'IN_JISC' column is left behind for later cells to pick up.
df['IN_JISC'] = ~df['JISC'].isnull()
df['IN_JISC'].value_counts()

KeyError: 'JISC'

In [None]:
# Keep only the records whose CATEGORY is one of the three listed values.
PROVINCIAL_CATEGORIES = ['provincial','welsh','scottish']
in_scope = df['CATEGORY'].isin(PROVINCIAL_CATEGORIES)
df_prov = df[in_scope]
df_prov.shape

In [None]:
# Drop every record dated after 1900.
df_prov = df_prov.loc[df_prov['year'] <= 1900]

In [None]:
def has_label(x, labels):
    """Return 1 if any ';'-separated entry of ``x`` is one of ``labels``, else 0.

    ``x`` is converted with ``str`` first; each entry is stripped of
    surrounding whitespace and then compared exactly (case-sensitive).
    """
    entries = {part.strip() for part in str(x).split(';')}
    return 1 if entries & set(labels) else 0


# Indicator column name -> the label strings that count towards it.
cat2labels = {
    'lib': ['liberal', 'whig', 'li'],
    'con': ['conservative'],
    'neut': ['neutral', 'non-party', 'independent'],
}
# Add one 0/1 indicator column per political category ('lib', 'con', 'neut').
# has_label already tests a value against the whole label list of a category,
# so each column is computed exactly once (no loop over individual labels).
# `assign` returns a new frame, so nothing is written into a filtered
# selection of df and pandas has no reason to emit chained-assignment warnings.
df_prov = df_prov.assign(**{
    category: df_prov['S-POL'].apply(has_label, labels=category_labels)
    for category, category_labels in cat2labels.items()
})


In [None]:
# Records per year: the whole of df_prov versus only the rows flagged IN_JISC.
jisc_rows = df_prov['IN_JISC'] == True
by_year = df_prov.groupby('year')['id'].count()
by_year_jisc = df_prov[jisc_rows].groupby('year')['id'].count()


In [None]:
# Yearly total of the 'lib' indicator among the rows flagged IN_JISC.
jisc_subset = df_prov.loc[df_prov['IN_JISC'] == True]
jisc_subset.groupby('year')['lib'].sum()

In [None]:
# Yearly mean of each 0/1 indicator, i.e. the share of records carrying that
# label: thick opaque line = all of df_prov, thin translucent line = rows
# flagged IN_JISC, same colour per label.
# Line width and alpha are passed straight to Series.plot; chaining a second
# `.plot(...)` onto its return value would call Axes.plot with no data and
# draw nothing.
ax = None
for col, lab in [('b', 'lib'), ('g', 'neut'), ('r', 'con')]:
    ax = df_prov.groupby('year')[lab].mean().plot(
        ax=ax, color=col, style='-', linewidth=2, alpha=1,
        label=f'{lab} (all)')
    df_prov[df_prov['IN_JISC'] == True].groupby('year')[lab].mean().plot(
        ax=ax, color=col, style='-', linewidth=1, alpha=.5,
        label=f'{lab} (IN_JISC)')
ax.set(ylabel='share of records')
ax.legend();

In [None]:
# Per-year mean of each indicator column, assembled into one frame with the
# columns 'lib', 'con', 'neut': once for the rows flagged IN_JISC and once for
# all of df_prov.
indicator_cols = ['lib','con','neut']
jisc_only = df_prov[df_prov.IN_JISC==True]

dfs_jisc = pd.concat(
    [jisc_only.groupby('year')[name].mean() for name in indicator_cols],
    axis=1,
)
dfs_all = pd.concat(
    [df_prov.groupby('year')[name].mean() for name in indicator_cols],
    axis=1,
)

In [None]:
# For every year, compare the (lib, con, neut) shares of all records with those
# of the IN_JISC rows. Called with two arguments, scipy.stats.entropy(pk, qk)
# normalises both inputs to sum to 1 and returns the relative entropy
# (Kullback-Leibler divergence) of pk with respect to qk.
# Only years present in both frames are compared: a year without any IN_JISC
# row is missing from dfs_jisc and `dfs_jisc.loc[year]` would raise KeyError.
common_years = dfs_all.index.intersection(dfs_jisc.index)
entropies = {
    year: entropy(dfs_all.loc[year], dfs_jisc.loc[year])
    for year in common_years
}

In [None]:
# One row per year (the dict keys become the index), drawn as a line plot.
entropy_by_year = pd.DataFrame.from_dict(entropies, orient='index')
entropy_by_year.plot()
