## Prepare for Binomial model

In [6]:
import pandas as pd
import numpy as np

In [8]:
# DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 and parse_dates=True reproduces its defaults (the first CSV
# column becomes the index and is parsed as dates when possible).
votes = pd.read_csv('VotesRaw.csv', index_col=0, parse_dates=True)

In [10]:
# How many distinct QuestionId values appear in the raw vote table.
question_ids = votes['QuestionId'].values
np.unique(question_ids).size

2516

In [5]:
# Number of rows for each Ans_count value.
ans_count_col = votes.Ans_count
ans_count_col.value_counts()

2     7182
3     5820
1     4121
4     3764
5     2255
6     1212
8      680
10     660
7      616
9      504
11     143
12      84
Name: Ans_count, dtype: int64

In [5]:
# DataFrame.from_csv no longer exists in current pandas; read_csv with
# index_col=0 and parse_dates=True reproduces its defaults.
votesnnet = pd.read_csv('QtnnetAll.csv', index_col=0, parse_dates=True)
# Total votes per (QuestionId, Age) pair, then how often each total occurs.
# Selecting 'Votes' before summing avoids aggregating every other column.
votesnnet.groupby(['QuestionId', 'Age'])['Votes'].sum().value_counts()

1     6978
2     2273
3     1022
4      498
5      290
6      187
7      114
8       66
9       36
11      26
10      23
12      15
13      10
14       7
15       4
18       4
16       3
20       2
19       2
21       1
Name: Votes, dtype: int64

In [None]:
## Check uniqueness of the (QuestionId, Age, AnsRank) triple for the plm and mlogit formats used from R
# True when some triple occurs on more than one row.
rows_per_slot = votes.groupby(['QuestionId', 'Age', 'AnsRank']).size()
(rows_per_slot > 1).any()

In [None]:
# No (QuestionId, Age) group should consist only of zero-vote rows;
# this evaluates to True when at least one such group exists.
is_zero_vote = votes['Votes'] == 0
is_zero_vote.groupby([votes['QuestionId'], votes['Age']]).all().any()

In [None]:
from functools import partial

# Columns copied into every replicated binary-choice row.
nn_feats = ['QuestionId', 'Age', 'AnsRank', 'Votes', 'ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank']


def replicate(df, pos=1):
    """Expand one group into 0/1 rows for the answer ranked at `pos`.

    The row(s) with ``AnsRank == pos`` are repeated once per vote: first with
    ``Votes = 0`` for every vote cast on any other rank ("failures"), then
    with ``Votes = 1`` for every vote read from the first row at `pos`
    ("successes").

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to expand; must contain the ``nn_feats`` columns. The caller in
        this notebook passes one (QuestionId, Age) group at a time.
    pos : int, default 1
        Target value of ``AnsRank``.

    Returns
    -------
    pandas.DataFrame
        The replicated rows restricted to ``nn_feats``, with a fresh 0..n-1
        index. An empty frame is returned when `df` has no row at `pos` or
        when there are neither failures nor successes.
    """
    at_pos = df[df.AnsRank == pos]
    if at_pos.empty:
        # Previously a bare `except` printed this and fell through, which then
        # failed on the unbound `successes`. Report and skip the group instead.
        print("No pos%d in df" % pos)
        print(df)
        return pd.DataFrame()

    failures = int(df[df.AnsRank != pos].Votes.sum())
    successes = int(at_pos.Votes.iloc[0])

    template = at_pos[nn_feats]
    pieces = []
    # Failures (Votes=0) come before successes (Votes=1), as in the output order
    # this function has always produced.
    for label, copies in ((0, failures), (1, successes)):
        if copies > 0:
            # Work on an explicit copy so the caller's frame is never written to.
            labelled = template.copy()
            labelled.loc[:, 'Votes'] = label
            pieces.extend([labelled] * copies)

    if not pieces:
        return pd.DataFrame()
    # A single concat replaces the removed DataFrame.append calls.
    return pd.concat(pieces, ignore_index=True)

tgt_pos = 3
# Gather the per-group frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copied the accumulated frame on every call.
replicated_parts = []
for k, g in votes.groupby(['QuestionId', 'Age']):
    assert (int(g.Ans_count.iloc[0]) == g.Ans_count).all(), 'Ans count must be unique per Qid-age pair'
    # Only groups with at least tgt_pos answers are replicated.
    if int(g.Ans_count.iloc[0]) >= tgt_pos:
        replicated_parts.append(replicate(g, pos=tgt_pos))
# pd.concat rejects an empty list, so fall back to an empty frame.
replicated_bin = pd.concat(replicated_parts) if replicated_parts else pd.DataFrame()

replicated_bin.to_csv('BinCh%d.csv' % (tgt_pos))
replicated_bin.head(10)

In [None]:
# Rows whose Votes value exceeds 1.
replicated_bin.loc[replicated_bin['Votes'] > 1]

In [None]:
# (rows, columns) of the replicated binary-choice table.
replicated_bin.shape

## Prepare for nnet, replicate rows

In [None]:
# One row per distinct (QuestionId, Age) pair with a fresh 0..n-1 row index.
pair_sizes = votes.groupby(['QuestionId', 'Age']).size()
qt_idx = pair_sizes.reset_index()[['QuestionId', 'Age']]
#qt_idx.to_csv('Qt_idx.csv')

In [None]:
# nnet to predict for a fixed nbr of choices
nb_choices_nnet = 2
nnet_parts = []
# Group on the scalar key 'Votes' rather than ['Votes']: pandas 2 yields
# 1-tuples as keys for a list of length one, which would break `k > 1`.
for k, g in votes[votes.Ans_count == nb_choices_nnet].groupby('Votes'):
    # NOTE(review): groups with k > 1 are repeated k-1 times, so k == 2 yields
    # one copy, the same as k == 1 -- confirm this is intended rather than k copies.
    copies = int(k - 1) if k > 1 else 1
    nnet_parts.extend([g] * copies)
# A single concat replaces the removed DataFrame.append calls.
for_nnet = pd.concat(nnet_parts, ignore_index=True)
# After replication every remaining vote count above 1 is collapsed to 1.
for_nnet.loc[for_nnet.Votes > 1, 'Votes'] = 1
for_nnet = for_nnet[['QuestionId', 'AnsRank', 'Ans_count', 'AnsId', 'Age', 'Votes',
                     'ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank']]
# Drop the zero-vote rows.
for_nnet = for_nnet[for_nnet.Votes > 0]
# Resetting qt_idx's index adds an 'index' column holding each pair's row number.
for_nnet = pd.merge(for_nnet, qt_idx.reset_index(drop=False), how='left', on=['QuestionId', 'Age'])
for_nnet.to_csv('Qtnnet%d.csv' % nb_choices_nnet)
# print() with one parenthesised argument prints the same under Python 2 and 3.
print(for_nnet.shape)
for_nnet.head()

## Prepare for mlogit, fixed nb of choices

In [None]:
# Attach to every vote row the row number of its (QuestionId, Age) pair in
# qt_idx (the 'index' column), then keep only the listed columns.
qt_lookup = qt_idx.reset_index(drop=False)
mlogit_cols = ['AnsId', 'QuestionId', 'Ans_count', 'index', 'Votes', 'AnsRank',
               'Age', 'ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank', 'EPbias']
votes_qt_idx = votes.merge(qt_lookup, how='left', on=['QuestionId', 'Age'])[mlogit_cols]

In [None]:
nb_choices = 2
features = ['AnsId', 'QuestionId', 'index', 'Votes', 'AnsRank', 'Age',
            'ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank', 'EPbias']
# Keep only rows whose Ans_count equals nb_choices, restricted to `features`.
has_nb_choices = votes_qt_idx['Ans_count'] == nb_choices
fixed_ch = votes_qt_idx.loc[has_nb_choices, features]
fixed_ch.to_csv('Qt%d.csv' % (nb_choices))

## Prepare for plm package, as a panel 

In [None]:
# test this part later
features = ['ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank', 'EPbias']


def prepare_logit(df):
    """Express each row's features as differences from the last-ranked row.

    Rows are sorted by ``AnsRank``; the row with the highest rank is the
    reference. Its ``features`` values are subtracted from every other row and
    the reference row itself is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Non-empty frame containing the ``features`` columns plus ``AnsId``,
        ``AnsRank`` and ``Votes``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per non-reference input row, keeping the input index labels,
        with the differenced ``features`` columns followed by ``AnsId``,
        ``AnsRank`` and ``Votes``. Empty when `df` has a single row.
    """
    s_df = df.sort_values(by='AnsRank')
    feat_vals = s_df[features]
    # One vectorised subtraction (the reference row is aligned on column names)
    # replaces the previous row-by-row apply. The copy makes the result an
    # independent frame, so the column assignments below are not writes to a slice.
    feats_df = (feat_vals - feat_vals.iloc[-1]).iloc[:-1].copy()
    kept = s_df.iloc[:-1]
    feats_df['AnsId'] = kept['AnsId']
    feats_df['AnsRank'] = kept['AnsRank']
    feats_df['Votes'] = kept['Votes']
    return feats_df

In [None]:
# Run prepare_logit on every (QuestionId, Age) group, then move the two group
# keys out of the result's index levels and back into ordinary columns.
per_pair = votes.groupby(['QuestionId', 'Age'])
votes_logit = per_pair.apply(prepare_logit).reset_index(level=[0, 1], drop=False)

In [None]:
def get_comp(df):
    """Return True when the frame holds more than one row."""
    return int(df.shape[0]) > 1


# One flag per (QuestionId, Age, AnsRank) group; the sum counts the groups
# that contain more than one row.
rank_groups = votes_logit.groupby(['QuestionId', 'Age', 'AnsRank'])
tt = rank_groups.apply(get_comp).reset_index(drop=True)
sum(tt)

In [None]:
# print() with a single parenthesised argument prints the same text under
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.
print(votes_logit.shape)
# Shape of the subset of rows with zero votes.
print(votes_logit[votes_logit.Votes == 0].shape)

In [None]:
# One row per distinct (QuestionId, Age) pair present in votes_logit, with a
# fresh 0..n-1 row index.
logit_pairs = votes_logit.groupby(['QuestionId', 'Age']).size()
map_qt_idx = logit_pairs.reset_index()[['QuestionId', 'Age']]
map_qt_idx.head()

In [None]:
# True if any row of map_qt_idx repeats an earlier row.
map_qt_idx.duplicated().any()

In [None]:
# Write the (QuestionId, Age) pair table to disk.
map_qt_idx.to_csv('Qt_idx.csv')

In [None]:
## Keep in mind that the features are now differences from the last choice of each question-time pair
# Resetting map_qt_idx's index adds an 'index' column with each pair's row number.
panel_lookup = map_qt_idx.reset_index(drop=False)
idxed_X = votes_logit.merge(panel_lookup, how='left', on=['QuestionId', 'Age'])

r_columns = ['index', 'QuestionId', 'AnsId', 'AnsRank', 'Votes', 'Age',
             'ReScore', 'Norm_Pos', 'Norm_Pos_2', 'Norm_DRank', 'EPbias']
for_r = idxed_X[r_columns]

In [None]:
# Write the panel-format table to disk.
for_r.to_csv('QtPanelFull.csv')

In [None]:
# (rows, columns) of the panel-format table.
for_r.shape

In [None]:
# (rows, columns) of the subset with zero votes.
for_r.loc[for_r['Votes'] == 0].shape

In [None]:
# Rows that contain at least one missing value.
has_missing = for_r.isnull().any(axis=1)
for_r[has_missing]

In [None]:
# Number of 'index' groups that have more than one non-null Votes entry.
votes_per_index = for_r.groupby('index')['Votes'].count().reset_index(drop=True)
sum(votes_per_index > 1)

In [None]:
# Non-null counts of every remaining column for each ('index', 'AnsRank') pair.
for_r.groupby(['index','AnsRank']).count()