ValueError: cannot reindex from a duplicate axis  
https://stackoverflow.com/questions/27236275/what-does-valueerror-cannot-reindex-from-a-duplicate-axis-mean  

公式参考は https://pandas.pydata.org/docs/user_guide/indexing.html?highlight=valueerror  
このバグか? https://github.com/pandas-dev/pandas/issues/30667 


In [1]:
import pandas as pd
import numpy as np
pd.options.display.notebook_repr_html = False  # Controls the output format in Jupyter Notebook; optional, the code runs without it.

In [2]:
# Check the runtime environment (pandas and NumPy versions).
print(pd.__version__)
print(np.__version__)

1.0.1
1.18.1


## reindexで起きる

In [3]:
# Baseline: a Series with the default integer index (0, 1, 2), all labels unique.
s = pd.Series([1, 2, 3])
s

0    1
1    2
2    3
dtype: int64

In [4]:
# Unique index, so reindex succeeds: label 3 is missing and is filled with NaN (result is float64).
s.reindex([1, 2, 3])

1    2.0
2    3.0
3    NaN
dtype: float64

In [5]:
# Rebind s to a Series whose index carries the label 'a' twice.
s = pd.Series(np.arange(4), index=['a', 'a', 'b', 'c'])
s

a    0
a    1
b    2
c    3
dtype: int64

In [6]:
# Target labels: 'c' is present in s.index, 'd' is not.
labels = ['c', 'd']

In [7]:
# Raises ValueError: s.index contains a duplicate ('a'), even though 'a' is not among the requested labels.
s.reindex(labels)

ValueError: cannot reindex from a duplicate axis

In [46]:
# Workaround: keep only the requested labels that exist in s (this leaves out the duplicated 'a'),
# then reindex so the missing label 'd' comes back as NaN.
s.loc[s.index.intersection(labels)].reindex(labels)

c    3.0
d    NaN
dtype: float64

## mergeで起きる  （いや、起きない?）

In [8]:
# Example frames modelled on the DataFrame.merge reference:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html
# Both key columns (lkey / rkey) contain 'foo' twice.

df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [1, 2, 3, 5]})
df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [5, 6, 7, 8]})

In [9]:
# No key given: the output shows the merge happening on the shared column 'value',
# so only the row with value == 5 on both sides survives.
df1.merge(df2)

  lkey  value rkey
0  foo      5  foo

In [10]:
df1

  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5

In [11]:
df2

  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

In [12]:
# Explicit key columns with duplicated keys: no error, each 'foo' on the left pairs with
# each 'foo' on the right (2 x 2 = 4 rows).
df1.merge(df2, left_on='lkey', right_on='rkey')

  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

## joinで起きる?

### 普通のjoin


In [13]:
# Left frame with a unique index (K0, K1, K2).
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
left

     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2

In [14]:
# Right frame with a unique index; it has no K1, and its K3 has no partner in left.
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
right

     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3

In [15]:
# Index-on-index join keeping left's rows: K1 has no match and gets NaN, K3 does not appear.
left.join(right)

     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2

### rightのindexを重複

In [16]:
# Right frame whose index repeats K2.
right_dup_1 = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K2'])
right_dup_1

     C   D
K0  C0  D0
K2  C2  D2
K2  C3  D3

In [17]:
# A duplicate index on the right does not raise; the left K2 row is repeated once per matching right row.
left.join(right_dup_1)

     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2
K2  A2  B2   C3   D3

In [18]:
# Same duplicated index as right_dup_1, but column D holds integers instead of strings.
right_dup_2 = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': [111, 222, 333]},
                     index=['K0', 'K2', 'K2'])
right_dup_2

     C    D
K0  C0  111
K2  C2  222
K2  C3  333

In [19]:
left.join(right_dup_1)

     A   B    C    D
K0  A0  B0   C0   D0
K1  A1  B1  NaN  NaN
K2  A2  B2   C2   D2
K2  A2  B2   C3   D3

### leftのindexを重複

In [20]:
# Left frame whose index repeats K0.
left_dup_1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K0', 'K2'])
left_dup_1

     A   B
K0  A0  B0
K0  A1  B1
K2  A2  B2

In [21]:
# A duplicate index on the left does not raise; both K0 rows receive the same right-hand values.
left_dup_1.join(right)

     A   B   C   D
K0  A0  B0  C0  D0
K0  A1  B1  C0  D0
K2  A2  B2  C2  D2

### leftとrightのindexを重複

In [22]:
# Duplicates on both sides (K0 on the left, K2 on the right) still join without error.
left_dup_1.join(right_dup_1)

     A   B   C   D
K0  A0  B0  C0  D0
K0  A1  B1  C0  D0
K2  A2  B2  C2  D2
K2  A2  B2  C3  D3

In [23]:
# Inner join with unique indexes: only K0 and K2 appear in both frames.
left.join(right, how='inner')

     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2

In [24]:
# Inner join with the duplicated left index: also no error.
left_dup_1.join(right, how='inner')

     A   B   C   D
K0  A0  B0  C0  D0
K0  A1  B1  C0  D0
K2  A2  B2  C2  D2

## 新規列の割り当てで起きる_1

https://qiita.com/waterada/items/c239a6d0424537cfcfb9

In [25]:
import pandas as pd
# A has a unique index 'a'; C's index repeats the key 2.
A = pd.DataFrame(data={'a':[1, 2, 3], 'b': ['b1','b2','b3'], 'c': ['c1','c2','c3']}).set_index('a')
C = pd.DataFrame(data={'a':[2, 2, 3]}).set_index('a')

# Assigning a column from A into C: duplicates on the receiving side are fine.
C['b'] = A.b

# Expected contents of C:
#    b
# a
# 2  b2
# 2  b2
# 3  b3

# As expected, it raises when the right-hand side (the assigned Series) has duplicate keys.
D = pd.DataFrame(data={'a':[2, 2, 3], 'g':['g1','g2','g3']}).set_index('a')
A['g'] = D.g

ValueError: cannot reindex from a duplicate axis

## 新規行の割り当てで起きる_2 →起きない……　あと変なバグが起きてる……

In [26]:
# 5 x 7 integer matrix holding 0..34.
a = np.arange(35).reshape(5,7)

In [27]:
# Row labels x, y, u, z, w; column labels 10..14 followed by the duplicate label 'p' twice.
df = pd.DataFrame(a, ['x', 'y', 'u', 'z', 'w'], list(range(10, 15)) + ['p', 'p'])


In [28]:
df

   10  11  12  13  14   p   p
x   0   1   2   3   4   5   6
y   7   8   9  10  11  12  13
u  14  15  16  17  18  19  20
z  21  22  23  24  25  26  27
w  28  29  30  31  32  33  34

In [29]:
df.values.dtype

dtype('int64')

In [30]:
# Add a new row 'sums' holding the per-column totals; no error despite the duplicate 'p' columns.
df.loc['sums'] = df.sum(axis=0)

In [31]:
df

      10  11  12  13  14   p    p
x      0   1   2   3   4   5    6
y      7   8   9  10  11  12   13
u     14  15  16  17  18  19   20
z     21  22  23  24  25  26   27
w     28  29  30  31  32  33   34
sums  70  75  80  85  90  95  100

In [32]:
# Selecting the duplicated label returns both 'p' columns.
df.loc[:, 'p']

       p    p
x      5    6
y     12   13
u     19   20
z     26   27
w     33   34
sums  95  100

In [33]:
df

      10  11  12  13  14   p    p
x      0   1   2   3   4   5    6
y      7   8   9  10  11  12   13
u     14  15  16  17  18  19   20
z     21  22  23  24  25  26   27
w     28  29  30  31  32  33   34
sums  70  75  80  85  90  95  100

In [34]:
# Positional write to a single cell (row position 2 = 'u', column position 5 = first 'p').
# In the recorded output BOTH 'p' columns show 1.23 in row 'u' and both turn float.
# NOTE(review): looks unintended for a single-cell iloc write; confirm against the pandas issue tracker.
df.iloc[2, 5] = 1.23
df

      10  11  12  13  14      p       p
x      0   1   2   3   4   5.00    6.00
y      7   8   9  10  11  12.00   13.00
u     14  15  16  17  18   1.23    1.23
z     21  22  23  24  25  26.00   27.00
w     28  29  30  31  32  33.00   34.00
sums  70  75  80  85  90  95.00  100.00

In [35]:
# Enabling the lines below (dropping the 'sums' row first) avoids the puzzling result in the next cell.
# df.drop(index='sums', inplace=True)
# df

In [36]:
# The 'sums' row added in In [30] still exists, so df.sum(axis=0) counts it as well;
# that is why the integer columns double (70 -> 140) in the recorded output.
# Both 'p' columns show 181.23, which is the total of the second 'p' column only
# (the first would be 172.23). NOTE(review): presumably the duplicate label makes the
# assignment reuse one value for both columns; not confirmed.
df.loc['sums'] = df.sum(axis=0)
df

         10     11     12     13     14       p       p
x       0.0    1.0    2.0    3.0    4.0    5.00    6.00
y       7.0    8.0    9.0   10.0   11.0   12.00   13.00
u      14.0   15.0   16.0   17.0   18.0    1.23    1.23
z      21.0   22.0   23.0   24.0   25.0   26.00   27.00
w      28.0   29.0   30.0   31.0   32.0   33.00   34.00
sums  140.0  150.0  160.0  170.0  180.0  181.23  181.23

## GitHubに報告されているバグ挙動?

In [37]:
import pandas 
import numpy as np

a = np.array([[1,2],[3,4]]) 

# Does NOT work: with a float value in b, the masking at the end raises ValueError.
b = np.array([[0.5,6],[7,8]])  
# b = np.array([[.5,6],[7,8]])  # The same problem

# This one works fine:
# b = np.array([[5,6],[7,8]]) 

dfA = pandas.DataFrame(a)
# This works fine EVEN using .5, because the column names are different
# dfA = pandas.DataFrame(a, columns=['a','b'])
dfB = pandas.DataFrame(b)

# Side-by-side concat keeps each frame's default column labels, so df_new has columns 0, 1, 0, 1.
df_new = pandas.concat([dfA, dfB], axis = 1)

# Boolean-mask indexing on the duplicate-column frame raises ValueError (recorded with pandas 1.0.1).
print(df_new[df_new > 5])

ValueError: cannot reindex from a duplicate axis

In [38]:
dfA

   0  1
0  1  2
1  3  4

In [39]:
dfB

     0    1
0  0.5  6.0
1  7.0  8.0

In [40]:
df_new > 5

       0      1      0     1
0  False  False  False  True
1  False  False   True  True

## 列名が重複したDataFrame

https://stackoverflow.com/questions/30788061/valueerror-cannot-reindex-from-a-duplicate-axis-using-isin-with-pandas

In [41]:
# Frame with unique column names ('lkey', 'value').
df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
                    'value': [1, 2, 3, 5]})

In [42]:
df1

  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5

In [43]:
# Selecting 'value' twice produces a frame with duplicate column names.
df5 = df1[['lkey', 'value', 'value']]

In [44]:
df5

  lkey  value  value
0  foo      1      1
1  bar      2      2
2  baz      3      3
3  foo      5      5

In [45]:
# Raises ValueError. Presumably df5['value'] yields both 'value' columns (duplicate label),
# so the comparison gives a boolean DataFrame rather than a Series; verify.
df5[df5['value'] == 3]

ValueError: cannot reindex from a duplicate axis