In [22]:
%matplotlib inline
import pandas as pd
import numpy as np

In [47]:
# Load the vote records for answers (one row per vote).
# `pd.DataFrame.from_csv` is deprecated and no longer exists in pandas >= 1.0;
# `pd.read_csv` with its defaults reads the file the same way as
# `from_csv(..., index_col=None)` did here.
a_votes = pd.read_csv('votes-answers.csv')

# Age of the answer (in whole days) at the moment each vote was cast.
# `.dt.days` extracts the day component for the whole column at once, instead of
# converting every element through a Python-level lambda.
# The trailing `+ 1` shifts the age so that a vote cast on the day the answer
# was created gets age 1 rather than 0.
vote_created = pd.to_datetime(a_votes.VoteCreation, format='%Y-%m-%d %H:%M:%S.%f')
vote_ans_created = pd.to_datetime(a_votes.AnsCreation, format='%Y-%m-%d %H:%M:%S.%f')
a_votes['age'] = (vote_created - vote_ans_created).dt.days + 1

# Drop the first two columns by position (presumably the raw timestamp columns,
# which are no longer needed once `age` exists -- confirm against the CSV header).
# Re-binding instead of `inplace=True` keeps the cell free of in-place mutation.
a_votes = a_votes.drop(a_votes.columns[[0, 1]], axis=1)
a_votes.head(10)

Unnamed: 0,VoteType,AnsId,QuestionId,age
0,2,4,1,0
1,2,4,1,0
2,3,4,1,0
3,2,4,1,1436
4,2,5,1,0
5,3,5,1,108
6,2,6,2,0
7,2,8,2,0
8,2,9,3,0
9,2,9,3,0


In [48]:
# Load the comment records for answers (one row per comment).
# `pd.DataFrame.from_csv` is deprecated and no longer exists in pandas >= 1.0;
# `pd.read_csv` with its defaults reads the file the same way as
# `from_csv(..., index_col=None)` did here.
a_comments = pd.read_csv('comment-ans.csv')

# Age of the answer (in whole days) at the moment each comment was posted.
# `.dt.days` extracts the day component for the whole column at once, instead of
# converting every element through a Python-level lambda.
# The trailing `+ 1` shifts the age so that a comment posted on the day the
# answer was created gets age 1 rather than 0.
comment_created = pd.to_datetime(a_comments.CommentCreation, format='%Y-%m-%d %H:%M:%S.%f')
comment_ans_created = pd.to_datetime(a_comments.AnsCreation, format='%Y-%m-%d %H:%M:%S.%f')
a_comments['age'] = (comment_created - comment_ans_created).dt.days + 1

# Drop the first two columns by position (presumably the raw timestamp columns,
# which are no longer needed once `age` exists -- confirm against the CSV header).
# Re-binding instead of `inplace=True` keeps the cell free of in-place mutation.
a_comments = a_comments.drop(a_comments.columns[[0, 1]], axis=1)
a_comments.head(10)

Unnamed: 0,AnsId,age
0,4,1
1,4,1
2,5,1
3,9,1
4,9,1
5,9,1
6,10,1
7,10,1
8,10,5
9,11,1


In [49]:
def get_score(x):
    """Return (# rows with VoteType == 2) minus (# rows with VoteType == 3).

    `x` is any object exposing a `VoteType` column/attribute, e.g. a DataFrame
    of votes. NOTE(review): 2 and 3 are presumably the up-vote and down-vote
    type ids -- confirm against the data source.
    """
    type_2_count = sum(x.VoteType == 2)
    type_3_count = sum(x.VoteType == 3)
    return type_2_count - type_3_count


def get_votes(x):
    """Return the number of rows whose VoteType is either 2 or 3."""
    type_2_count = sum(x.VoteType == 2)
    type_3_count = sum(x.VoteType == 3)
    return type_2_count + type_3_count


# Name -> aggregation function lookup for the two per-group statistics.
f = dict(Score=get_score, Votes=get_votes)
# Collapse the individual votes into one row per (question, answer, age):
# `Score` is the net score of the votes cast at that age and `Votes` is how
# many of them there were. The group keys are moved back into ordinary columns.
a_groups = (
    a_votes
    .sort_values(by='age')
    .groupby(['QuestionId', 'AnsId', 'age'])
    .apply(lambda grp: pd.Series({'Score': get_score(grp), 'Votes': get_votes(grp)}))
    .reset_index(level=[0, 1, 2], drop=False)
)
a_groups.head(15)

Unnamed: 0,QuestionId,AnsId,age,Score,Votes
0,1,4,0,1,3
1,1,4,1436,1,1
2,1,5,0,1,1
3,1,5,108,-1,1
4,1,56,0,2,2
5,1,56,1,3,3
6,1,56,4,1,1
7,1,56,7,1,1
8,1,56,157,1,1
9,1,56,905,1,1


In [50]:
def _with_running_score(answer_rows):
    """Return the rows of one answer with `Score` replaced by its running total."""
    untouched = [answer_rows[name] for name in ('AnsId', 'QuestionId', 'age', 'Votes')]
    running_score = answer_rows['Score'].cumsum()
    return pd.concat(untouched + [running_score], axis=1)


# Per answer, turn the per-age score deltas into a cumulative score over time.
a_groups_c = a_groups.groupby(['AnsId']).apply(_with_running_score)
a_groups_c.head(15)

Unnamed: 0,AnsId,QuestionId,age,Votes,Score
0,4,1,0,3,1
1,4,1,1436,1,2
2,5,1,0,1,1
3,5,1,108,1,0
4,56,1,0,2,2
5,56,1,1,3,5
6,56,1,4,1,6
7,56,1,7,1,7
8,56,1,157,1,8
9,56,1,905,1,9


In [51]:
# Pair every (answer, age) vote row with every comment on the same answer.
# The left join keeps answers that have no comments at all; their `age_com`
# is NaN, so the comparison below evaluates to False for them.
votes_com = pd.merge(a_groups_c, a_comments, how='left', on=['AnsId'], suffixes=['_vot', '_com'])

# For each pair, flag whether the comment already existed at the vote's age.
votes_com = votes_com.groupby(['AnsId']).apply(
    lambda df: pd.concat(
        [
            df['AnsId'],
            df['QuestionId'],
            df['age_vot'],
            df['Votes'],
            df['Score'],
            df['age_vot'] >= df['age_com'],
        ],
        axis=1,
    )
)

# The comparison Series has no name, so the sixth column needs a label.
# Assign the labels through `.columns` instead of writing into
# `columns.values`: that mutates the Index's backing array in place, which
# pandas does not support and which can leave label lookups out of sync.
votes_com.columns = ['AnsId', 'QuestionId', 'age_vot', 'Votes', 'Score', 'gt']
votes_com.head(10)

Unnamed: 0,AnsId,QuestionId,age_vot,Votes,Score,gt
0,4,1,0,3,1,False
1,4,1,0,3,1,False
2,4,1,1436,1,2,True
3,4,1,1436,1,2,True
4,5,1,0,1,1,False
5,5,1,108,1,0,True
6,56,1,0,2,2,False
7,56,1,0,2,2,False
8,56,1,0,2,2,False
9,56,1,0,2,2,False


In [52]:
# Number of comments that already existed for each (answer, vote age):
# count the True flags in `gt` within every group.
tmp_sum_com = (
    votes_com
    .groupby(['AnsId', 'age_vot'])
    .apply(lambda grp: sum(grp['gt']))
)
tmp_sum_com.reset_index(drop=False).head()

Unnamed: 0,AnsId,age_vot,0
0,4,0,0
1,4,1436,2
2,5,0,0
3,5,108,1
4,6,0,0


## First cleaned time series (model for the number of votes)

In [152]:
# Attach the per-(answer, age) comment count to every vote/comment pair.
votes_com_f = pd.merge(votes_com, tmp_sum_com.reset_index(drop=False), on=['AnsId', 'age_vot'], how='inner')

# Keep the five descriptive columns (positions 0-4) and the merged-in comment
# count (position 6); position 5 is the per-pair flag, which is superseded by
# the count. Selecting by position mirrors the original positional drop without
# mutating the frame in place.
votes_com_f = votes_com_f.iloc[:, [0, 1, 2, 3, 4, 6]]

# Label the count column through `.columns` instead of writing into
# `columns.values`: that mutates the Index's backing array in place, which
# pandas does not support and which can leave label lookups out of sync.
votes_com_f.columns = ['AnsId', 'QuestionId', 'age_vot', 'Votes', 'Score', 'Comments']

# Every vote row was repeated once per comment by the earlier merge; collapse
# the now-identical rows.
votes_com_f = votes_com_f.drop_duplicates()
votes_com_f.head(10)

Unnamed: 0,AnsId,QuestionId,age_vot,Votes,Score,Comments
0,4,1,0,3,1,0
2,4,1,1436,1,2,2
4,5,1,0,1,1,0
5,5,1,108,1,0,1
6,56,1,0,2,2,0
10,56,1,1,3,5,4
14,56,1,4,1,6,4
18,56,1,7,1,7,4
22,56,1,157,1,8,4
26,56,1,905,1,9,4


In [173]:
# Sanity check: show every row that has a missing value in any model column
# (an empty result means there are none).
votes_com_f[
    votes_com_f[['AnsId', 'age_vot', 'Votes', 'Score', 'Comments']].isnull().any(axis=1)
]

Unnamed: 0,AnsId,QuestionId,age_vot,Votes,Score,Comments


In [174]:
# A few auxiliary functions

def rank_iter(df):
    """Walk `df` row by row and yield the current rank of each row's answer.

    The latest `Score` seen for every `AnsId` is remembered. For each row a
    tuple `(rank, answer_count, index_label)` is yielded, where

    * `rank` is the 1-based position of the row's answer when all answers seen
      so far are ordered by their latest score, highest first (answers with
      equal scores keep the dictionary's iteration order),
    * `answer_count` is the number of distinct answers seen so far,
    * `index_label` is the row's index label in `df`.
    """
    latest_score = {}
    for record in df.itertuples():
        latest_score[record.AnsId] = record.Score
        best_first = sorted(latest_score, key=latest_score.get, reverse=True)
        position = best_first.index(record.AnsId) + 1
        yield (position, len(latest_score), record.Index)
            
def rank_ans(df):
    """Return `[Ans_rank, Ans_count]` Series for the rows of `df`.

    Both Series are built from the tuples produced by `rank_iter(df)` and are
    indexed by the row labels it reports, so they line up with `df` when
    concatenated column-wise.
    """
    rank_values, count_values, row_labels = zip(*rank_iter(df))
    rank_series = pd.Series(rank_values, name="Ans_rank", index=row_labels)
    count_series = pd.Series(count_values, name="Ans_count", index=row_labels)
    return [rank_series, count_series]

def get_ranks(df):
    """Return the model columns of `df` with `Ans_rank` and `Ans_count` appended.

    The two extra columns come from `rank_ans(df)`; everything is concatenated
    column-wise, so rows are matched on their index labels.
    """
    model_columns = [df[name] for name in ('AnsId', 'age_vot', 'Votes', 'Score', 'Comments')]
    return pd.concat(model_columns + rank_ans(df), axis=1)
def sort_age_score(df):
    """Return `df` ordered by ascending `age_vot`, then by descending `Score`."""
    return df.sort_values(by=['age_vot', 'Score'], ascending=[True, False])

In [163]:
# Preview: within each question, rows ordered by vote age and then by score.
(
    votes_com_f
    .groupby(['QuestionId'])
    .apply(sort_age_score)
    .head(15)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,AnsId,QuestionId,age_vot,Votes,Score,Comments
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,6,56,1,0,2,2,0
1,0,4,1,0,3,1,0
1,4,5,1,0,1,1,0
1,30,210,1,0,1,1,0
1,10,56,1,1,3,5,4
1,33,7030,1,1,1,1,1
1,45,15104,1,1,3,-3,1
1,31,210,1,3,1,2,0
1,14,56,1,4,1,6,4
1,18,56,1,7,1,7,4


In [166]:
# Within each question: order the rows chronologically (best score first on
# ties in age), then attach the running answer rank and answer count.
(
    votes_com_f
    .groupby(['QuestionId'])
    .apply(lambda question_rows: get_ranks(sort_age_score(question_rows)))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,AnsId,age_vot,Votes,Score,Ans_rank,Ans_count
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,4,0,3,1,1,1
1,1,,,,,2,2
1,2,4,1436,1,2,3,3
1,3,,,,,2,4
1,4,5,0,1,1,1,4
1,5,5,108,1,0,5,5
1,6,56,0,2,2,6,6
1,7,,,,,2,6
1,8,,,,,1,6
1,9,,,,,1,6


In [105]:
# Scratch check of the two building blocks used by rank_iter:
# list.index() for the position lookup ...
a = list('sa5')
a.index('a')
# ... and sorting a dict's keys by their values, largest first.
dd = dict(b=55, ff=1, gg=6)
sorted(dd, key=dd.get, reverse=True)

['b', 'gg', 'ff']

In [18]:
# Scratch check: unzipping a list of pairs into one tuple per position.
# `itertools.izip` only exists on Python 2; on Python 3 the builtin `zip` is
# already lazy, so fall back to it and keep the name `izip` usable either way.
try:
    from itertools import izip
except ImportError:
    izip = zip
list(izip(*[(1, 2), (1, 2), (1, 2), (1, 2)]))

[(1, 1, 1, 1), (2, 2, 2, 2)]

In [24]:
def ful():
    """Toy generator: yield the pair (1, 2) four times."""
    pair = (1, 2)
    for _ in range(4):
        yield pair

In [161]:
# Scratch check: unzip the generator's pairs into two tuples and wrap each one
# in a named Series.
a1, a2 = zip(*ful())
# Single-argument print with parentheses prints the same text on Python 2
# (statement) and Python 3 (function); the bare statement form is a
# SyntaxError on Python 3.
print(a1)
print(a2)
c = pd.Series(a1, name="bla"), pd.Series(a2, name="blo")
c[0]

(1, 1, 1, 1)
(2, 2, 2, 2)


0    1
1    1
2    1
3    1
Name: bla, dtype: int32

In [175]:
# Scratch check: column-wise concat aligns Series on their index labels and
# fills labels missing from one side with NaN.
pd.concat(
    [
        pd.Series([1, 2, 3], index=list("ABC")),
        pd.Series([6, 7, 8], index=list("BCE")),
    ],
    axis=1,
)

Unnamed: 0,0,1
A,1.0,
B,2.0,6.0
C,3.0,7.0
E,,8.0


In [176]:
# Scratch check of what DataFrame.itertuples() yields: each row is a named
# tuple exposing the index label as `.Index` and every column as an attribute.
df = pd.DataFrame({'AnsId': range(300), 'Score': [e-300 for e in range(300)]})

for row in df.itertuples():
    # The extra pair of parentheses makes this print one tuple on both
    # Python 2 and Python 3 (a single pair would print three space-separated
    # values on Python 3).
    print((row.AnsId, row.Score, row.Index))
    # The bare `print row` statement is a SyntaxError on Python 3.
    print(row)

(0, -300, 0)
Pandas(Index=0, AnsId=0, Score=-300)
(1, -299, 1)
Pandas(Index=1, AnsId=1, Score=-299)
(2, -298, 2)
Pandas(Index=2, AnsId=2, Score=-298)
(3, -297, 3)
Pandas(Index=3, AnsId=3, Score=-297)
(4, -296, 4)
Pandas(Index=4, AnsId=4, Score=-296)
(5, -295, 5)
Pandas(Index=5, AnsId=5, Score=-295)
(6, -294, 6)
Pandas(Index=6, AnsId=6, Score=-294)
(7, -293, 7)
Pandas(Index=7, AnsId=7, Score=-293)
(8, -292, 8)
Pandas(Index=8, AnsId=8, Score=-292)
(9, -291, 9)
Pandas(Index=9, AnsId=9, Score=-291)
(10, -290, 10)
Pandas(Index=10, AnsId=10, Score=-290)
(11, -289, 11)
Pandas(Index=11, AnsId=11, Score=-289)
(12, -288, 12)
Pandas(Index=12, AnsId=12, Score=-288)
(13, -287, 13)
Pandas(Index=13, AnsId=13, Score=-287)
(14, -286, 14)
Pandas(Index=14, AnsId=14, Score=-286)
(15, -285, 15)
Pandas(Index=15, AnsId=15, Score=-285)
(16, -284, 16)
Pandas(Index=16, AnsId=16, Score=-284)
(17, -283, 17)
Pandas(Index=17, AnsId=17, Score=-283)
(18, -282, 18)
Pandas(Index=18, AnsId=18, Score=-282)
(19, -281, 19)