# pandas



In [6]:
# 10 minutes to pandas
import numpy as np
import pandas as pd

In [7]:
# Creating a Series
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [8]:
# Creating a DataFrame by passing a NumPy array
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.840824,-0.660727,-1.274568,0.53762
2013-01-02,1.450069,0.323901,0.553063,-2.103812
2013-01-03,0.282565,-1.481304,2.096098,0.080748
2013-01-04,-0.88664,1.323688,0.924795,0.308255
2013-01-05,0.66245,-0.697337,0.185034,1.748388
2013-01-06,-0.769688,0.002201,-0.097323,-0.550833


In [10]:
# Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [11]:
# The columns of the resulting DataFrame have different dtypes
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [15]:
# If you're using IPython, tab completion for column names (as well as public attributes) is automatically enabled. Here's a subset of the attributes that will be completed:
df2.<TAB> # noqa: E225, E999

SyntaxError: invalid syntax (3094613661.py, line 2)

In [16]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.840824,-0.660727,-1.274568,0.53762
2013-01-02,1.450069,0.323901,0.553063,-2.103812
2013-01-03,0.282565,-1.481304,2.096098,0.080748
2013-01-04,-0.88664,1.323688,0.924795,0.308255
2013-01-05,0.66245,-0.697337,0.185034,1.748388


In [17]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.88664,1.323688,0.924795,0.308255
2013-01-05,0.66245,-0.697337,0.185034,1.748388
2013-01-06,-0.769688,0.002201,-0.097323,-0.550833


In [18]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [19]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [20]:
df.to_numpy()

array([[-0.84082361, -0.66072732, -1.27456819,  0.53761975],
       [ 1.450069  ,  0.32390149,  0.5530632 , -2.10381221],
       [ 0.28256465, -1.48130363,  2.09609772,  0.08074771],
       [-0.8866404 ,  1.32368759,  0.92479517,  0.30825533],
       [ 0.66245021, -0.69733704,  0.18503411,  1.74838801],
       [-0.7696882 ,  0.00220065, -0.09732309, -0.55083266]])

In [21]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [22]:
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [27]:
# shows a quick statistic summary of your data
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.017011,-0.198263,0.39785,0.003394
std,0.97007,0.97348,1.120114,1.278813
min,-0.88664,-1.481304,-1.274568,-2.103812
25%,-0.82304,-0.688185,-0.026734,-0.392938
50%,-0.243562,-0.329263,0.369049,0.194502
75%,0.567479,0.243476,0.831862,0.480279
max,1.450069,1.323688,2.096098,1.748388


In [26]:
# Transposing, 行列转置
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.840824,1.450069,0.282565,-0.88664,0.66245,-0.769688
B,-0.660727,0.323901,-1.481304,1.323688,-0.697337,0.002201
C,-1.274568,0.553063,2.096098,0.924795,0.185034,-0.097323
D,0.53762,-2.103812,0.080748,0.308255,1.748388,-0.550833


In [28]:
# sorts by an axis
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.53762,-1.274568,-0.660727,-0.840824
2013-01-02,-2.103812,0.553063,0.323901,1.450069
2013-01-03,0.080748,2.096098,-1.481304,0.282565
2013-01-04,0.308255,0.924795,1.323688,-0.88664
2013-01-05,1.748388,0.185034,-0.697337,0.66245
2013-01-06,-0.550833,-0.097323,0.002201,-0.769688


In [29]:
# sort by values
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-03,0.282565,-1.481304,2.096098,0.080748
2013-01-05,0.66245,-0.697337,0.185034,1.748388
2013-01-01,-0.840824,-0.660727,-1.274568,0.53762
2013-01-06,-0.769688,0.002201,-0.097323,-0.550833
2013-01-02,1.450069,0.323901,0.553063,-2.103812
2013-01-04,-0.88664,1.323688,0.924795,0.308255


In [30]:
# Getitem([])

# For a DataFrame, passing a single label selects a columns and yields a Series equivalent to df.A
df["A"]

2013-01-01   -0.840824
2013-01-02    1.450069
2013-01-03    0.282565
2013-01-04   -0.886640
2013-01-05    0.662450
2013-01-06   -0.769688
Freq: D, Name: A, dtype: float64

In [31]:
# For a DataFrame, passing a slice : selects matching rows:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.840824,-0.660727,-1.274568,0.53762
2013-01-02,1.450069,0.323901,0.553063,-2.103812
2013-01-03,0.282565,-1.481304,2.096098,0.080748


In [32]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,1.450069,0.323901,0.553063,-2.103812
2013-01-03,0.282565,-1.481304,2.096098,0.080748
2013-01-04,-0.88664,1.323688,0.924795,0.308255


In [33]:
# Selection by label

# Selecting a row matching a label
df.loc[dates[0]]

A   -0.840824
B   -0.660727
C   -1.274568
D    0.537620
Name: 2013-01-01 00:00:00, dtype: float64

In [34]:
# Selecting all rows (:) with a select column labels
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,-0.840824,-0.660727
2013-01-02,1.450069,0.323901
2013-01-03,0.282565,-1.481304
2013-01-04,-0.88664,1.323688
2013-01-05,0.66245,-0.697337
2013-01-06,-0.769688,0.002201
