In [2]:


# 5.1 Introduction to pandas Data Structures
# 5.1.1 Series
# Series is a one-dimensional labeled array capable of holding any data type
# (integers, strings, floating point numbers, Python objects, etc.).
# The axis labels are collectively referred to as the index.
# The basic method to create a Series is to call:
# Series(data, index=index)
# where data can be a list, numpy array, or a dictionary.
# If data is a dictionary, the index follows the dictionary's key (insertion) order
# unless an explicit index is passed; the keys are not sorted.
# If data is an ndarray, the index will be range(n) by default.
# The index can be explicitly passed as a parameter.
# The data type of the Series is automatically inferred from the data.
# The Series object has many attributes and methods that allow for easy manipulation of the data.
# The most important ones are:
# - index: the index of the Series
# - values: the data of the Series as a numpy array
# - dtype: the data type of the Series
# - size: the number of elements in the Series
# - shape: the shape of the Series (number of elements,)
# - empty: a boolean indicating whether the Series is empty
# - T: the transpose of the Series (for 1D data this is the Series itself)
# - axes: a list holding the row axis labels, i.e. [index]
# - head(n): returns the first n elements of the Series
# - tail(n): returns the last n elements of the Series
# - sample(n): returns a random sample of n elements from the Series
# - describe(): returns a summary of the Series (count, mean, std, min, 25%, 50%, 75%, max)
# - info(): prints a summary of the Series (index, dtype, non-null count, memory usage)
# - to_frame(): converts the Series to a DataFrame
# - to_csv(): writes the Series to a CSV file
# - to_json(): writes the Series to a JSON file
# - to_dict(): converts the Series to a dictionary
# - to_numpy(): converts the Series to a numpy array
# - to_list(): converts the Series to a list
# - to_string(): converts the Series to a string
# - to_frame().to_html(): renders the Series as HTML (to_html() is a DataFrame method, not a Series method)
# - to_latex(): converts the Series to LaTeX
# - to_markdown(): converts the Series to Markdown
# - to_sql(): writes the Series to a SQL database
# - to_pickle(): writes the Series to a pickle file
# - to_clipboard(): copies the Series to the clipboard
# - to_excel(): writes the Series to an Excel file
# - to_hdf(): writes the Series to an HDF5 file
# Note: to_feather(), to_parquet(), to_orc(), to_stata() and to_html() are DataFrame
# methods, not Series methods; call to_frame() first to use them on a Series.
# pandas has no to_sas() or to_spss() writers (only the pd.read_sas / pd.read_spss readers).


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame

# Build a Series from a plain list; no index is given, so the values are
# labelled with the default integer index 0..n-1 (see the printed output).
obj = pd.Series(data=[4, 7, -5, 3])
print(obj)


0    4
1    7
2   -5
3    3
dtype: int64


In [4]:
# Same values as before, this time with explicit string labels, one per value.
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
print(obj2)


d    4
b    7
a   -5
c    3
dtype: int64


In [5]:
# The index keeps the labels in the order they were passed to the constructor.
print(obj2.index)  # Index(['d', 'b', 'a', 'c'], dtype='object')

Index(['d', 'b', 'a', 'c'], dtype='object')


In [6]:
# Select a single value by its index label.
print(obj2['a'])  # -5

-5


In [7]:
# Assigning to an existing label overwrites that value; obj2 itself is modified.
obj2["d"]=6
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [8]:
# A list of labels selects several entries at once, in the requested order.
obj2[['c', 'a', 'd']]  # c    3, a   -5, d    6



c    3
a   -5
d    6
dtype: int64

In [9]:
obj2[obj2 > 0]  # d    6, b    7, c    3
# Boolean indexing: keeps only the entries where the mask (obj2 > 0) is True;
# the labels stay attached to their values.


d    6
b    7
c    3
dtype: int64

In [10]:
obj2 * 2  # d    12, b    14, a   -10, c     6
# Vectorized operation: the scalar is applied to every element and the index is preserved


d    12
b    14
a   -10
c     6
dtype: int64

In [11]:
np.exp(obj2)  # d    403.428793, b    1096.633158, a    0.006738, c    20.085537
# NumPy's exponential applied element-wise; the result is a float64 Series with the same labels


d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [12]:
"b" in obj2 , "e" in obj2 
# Membership test: `in` checks the index labels (like dict keys), not the values
# -> (True, False)

(True, False)

In [13]:
# Build a Series straight from a dict: keys become the index labels,
# values become the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=sdata)
print(obj3)
# Ohio    35000, Texas    71000, Oregon    16000, Utah     5000
# The index follows the dict's insertion order (it is NOT sorted alphabetically)


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [14]:
# With an explicit index, only the listed labels are kept, in the given order:
# 'California' has no entry in sdata so it becomes NaN (making the dtype float64),
# and 'Utah' is left out because it is not in the requested index.
obj4 = pd.Series(sdata, index=['California', 'Ohio', 'Oregon', 'Texas'])
print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [15]:
obj3.to_dict()  # {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
# Convert the Series back to a plain dict (index label -> value)

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [16]:
pd.isna(obj4)  # element-wise boolean Series: True only for California (the NaN entry)
# Check for missing values

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [17]:
pd.notna(obj4)  # element-wise boolean Series: False only for California (the NaN entry)
# Check for non-missing values

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [18]:
# The same checks are available as Series methods: obj4.isna() / obj4.notna().
# isnull() and notnull() are aliases of isna() and notna(), respectively.

obj4.isna(), obj4.isnull()  # both give the same boolean mask (True only for California)
# Check for missing values

(California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool,
 California     True
 Ohio          False
 Oregon        False
 Texas         False
 dtype: bool)

In [19]:
# Redisplay obj3 (index: Ohio, Texas, Oregon, Utah) to compare with obj4 below.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [20]:
# Redisplay obj4 (index: California, Ohio, Oregon, Texas) to compare with obj3 above.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [21]:
obj3 + obj4  # California NaN, Ohio 70000.0, Oregon 32000.0, Texas 142000.0, Utah NaN
# Arithmetic aligns on index labels: the result uses the union of both indexes, and a label
# that lacks a value in either operand (California is NaN in obj4, Utah is absent from obj4) gives NaN

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [22]:
obj4.name = 'population'    # name of the Series itself (printed as "Name: population")
obj4.index.name = 'state'   # name of the index (printed above the labels)
print(obj4)  # first output line is the index name: state

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [23]:
obj  # the first Series (from In [3]): default integer index, no name set

0    4
1    7
2   -5
3    3
dtype: int64

In [24]:
# Assigning to .index relabels the Series in place; the list supplies one label per element.
obj.index = ['Ei','Ning','Candace','Navia']
print(obj)  # Ei       4, Ning     7, Candace -5, Navia    3

Ei         4
Ning       7
Candace   -5
Navia      3
dtype: int64


In [25]:
# Data Frame
# A DataFrame is a two-dimensional labeled data structure with columns of potentially different types.
# It is similar to a spreadsheet or SQL table, or a dictionary of Series objects.
# The DataFrame has an index and columns, which can be thought of as the axes of the data.
# The DataFrame can be created from a variety of data sources, including:
# - A dictionary of Series or DataFrames
# - A two-dimensional numpy array
# - A list of dictionaries
# - A list of lists
# - A CSV file
# - An Excel file
# - A SQL database
# - A JSON file
# - An HTML file (tables, via pd.read_html)
# - A Feather file
# - A Parquet file
# - An ORC file
# - A SAS file
# - An SPSS file
# - A Stata file
# - An HDF5 file
# - A pickle file
# - The clipboard
# (LaTeX is output-only: DataFrame.to_latex() exists, but pandas has no LaTeX reader.)



In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
# A DataFrame is a two-dimensional labelled table whose columns may have
# different types. Here it is built from a dict of equal-length lists:
# each key becomes a column name, each list the values of that column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)
print(frame)
# Columns appear in dict order (state, year, pop); rows get the default index 0..5.

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [27]:
frame.head()  # Display the first 5 rows of the DataFrame (rows 0-4)
# 0    Ohio  2000  1.5
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# (frame was created from a dictionary of lists: keys are the column names, values are the data.)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [28]:
frame.tail()  # Display the last 5 rows of the DataFrame (rows 1-5)
# 1    Ohio  2001  1.7
# 2    Ohio  2002  3.6
# 3  Nevada  2001  2.4
# 4  Nevada  2002  2.9
# 5  Nevada  2003  3.2
# (frame was created from a dictionary of lists: keys are the column names, values are the data.)


Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [29]:
pd.DataFrame(data, columns=['year', 'state', 'pop'])  # columns= fixes the column order of the result
#    year   state  pop
# 0  2000    Ohio  1.5
# 1  2001    Ohio  1.7
# 2  2002    Ohio  3.6
# 3  2001  Nevada  2.4
# 4  2002  Nevada  2.9
# 5  2003  Nevada  3.2



Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [30]:
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
# index= replaces the default 0..5 row labels with 'a'..'f'.
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop','debt'], 
                      index=['a', 'b', 'c', 'd', 'e', 'f'])
print(frame2)  # Display the DataFrame with the requested columns and row labels
# columns: year  state  pop  debt

   year   state  pop debt
a  2000    Ohio  1.5  NaN
b  2001    Ohio  1.7  NaN
c  2002    Ohio  3.6  NaN
d  2001  Nevada  2.4  NaN
e  2002  Nevada  2.9  NaN
f  2003  Nevada  3.2  NaN


In [31]:
frame2.columns  # The column labels, themselves stored as an Index object
# Index(['year', 'state', 'pop', 'debt'], dtype='object')

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [32]:
# Dict-style access returns the column as a Series that keeps the row labels
# and carries the column name ("Name: state").
frame2['state'] # Display the 'state' column of the DataFrame

a      Ohio
b      Ohio
c      Ohio
d    Nevada
e    Nevada
f    Nevada
Name: state, dtype: object

In [33]:
frame2.year  # Attribute-style access to the 'year' column (same result as frame2['year'])
# Caveat: this only works when the column name is a valid Python identifier that does not
# clash with a DataFrame attribute; e.g. frame2.pop is the DataFrame.pop method, not the
# 'pop' column, so bracket access (frame2['pop']) is the safe general form.

a    2000
b    2001
c    2002
d    2001
e    2002
f    2003
Name: year, dtype: int64

In [34]:
frame2.loc['a'], frame2.loc['b'] # .loc selects rows by index label: here rows 'a' and 'b'
# Each row comes back as a Series indexed by the column names (year, state, pop, debt)

(year     2000
 state    Ohio
 pop       1.5
 debt      NaN
 Name: a, dtype: object,
 year     2001
 state    Ohio
 pop       1.7
 debt      NaN
 Name: b, dtype: object)

In [35]:
frame2.iloc[1],frame2.iloc[2] # .iloc selects rows by integer position: positions 1 and 2 are rows 'b' and 'c'
# Each row comes back as a Series indexed by the column names (year, state, pop, debt)

(year     2001
 state    Ohio
 pop       1.7
 debt      NaN
 Name: b, dtype: object,
 year     2002
 state    Ohio
 pop       3.6
 debt      NaN
 Name: c, dtype: object)

In [36]:
frame2["debt"] = 16.5  # Assigning a scalar fills every row of the 'debt' column with that value
frame2  # Display the DataFrame with the updated 'debt' column

Unnamed: 0,year,state,pop,debt
a,2000,Ohio,1.5,16.5
b,2001,Ohio,1.7,16.5
c,2002,Ohio,3.6,16.5
d,2001,Nevada,2.4,16.5
e,2002,Nevada,2.9,16.5
f,2003,Nevada,3.2,16.5


In [37]:
frame2['debt'] = np.arange(6.0)  # Assign an array: one value per row (0.0 .. 5.0 for the 6 rows)
frame2  # Display the DataFrame with the updated 'debt' column

Unnamed: 0,year,state,pop,debt
a,2000,Ohio,1.5,0.0
b,2001,Ohio,1.7,1.0
c,2002,Ohio,3.6,2.0
d,2001,Nevada,2.4,3.0
e,2002,Nevada,2.9,4.0
f,2003,Nevada,3.2,5.0


In [38]:
val = pd.Series([-1.2, -1.5, -1.7], index=['c', 'd', 'e'])  
# A Series labelled with only some of frame2's row labels
frame2['debt'] = val  # Assignment aligns on labels: rows 'c', 'd', 'e' get values, 'a', 'b', 'f' become NaN
frame2  # Display the DataFrame with the updated 'debt' column

Unnamed: 0,year,state,pop,debt
a,2000,Ohio,1.5,
b,2001,Ohio,1.7,
c,2002,Ohio,3.6,-1.2
d,2001,Nevada,2.4,-1.5
e,2002,Nevada,2.9,-1.7
f,2003,Nevada,3.2,


In [39]:
frame2['eastern'] = frame2.state == 'Ohio'  # New boolean column: True where state is 'Ohio'
frame2  # Display the DataFrame with the new 'eastern' column
# A column created by assignment is appended last: year  state  pop  debt  eastern

Unnamed: 0,year,state,pop,debt,eastern
a,2000,Ohio,1.5,,True
b,2001,Ohio,1.7,,True
c,2002,Ohio,3.6,-1.2,True
d,2001,Nevada,2.4,-1.5,False
e,2002,Nevada,2.9,-1.7,False
f,2003,Nevada,3.2,,False


In [40]:
del frame2['eastern']  # Delete the 'eastern' column in place (re-running this cell alone raises KeyError)
frame2.columns  # Display the remaining columns of the DataFrame
# Index(['year', 'state', 'pop', 'debt'], dtype='object')

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [41]:
frame2  # Display the DataFrame after 'eastern' was removed (year, state, pop, debt)


Unnamed: 0,year,state,pop,debt
a,2000,Ohio,1.5,
b,2001,Ohio,1.7,
c,2002,Ohio,3.6,-1.2
d,2001,Nevada,2.4,-1.5
e,2002,Nevada,2.9,-1.7
f,2003,Nevada,3.2,


In [42]:
# Nested dict -> DataFrame: the outer keys become the columns, the inner keys
# become the row labels. A year missing for one state shows up as NaN.
population = {
    'Nevada': {2001: 2.4, 2002: 2.9, 2003: 3.2},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(data=population)
print(frame3)

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2003     3.2   NaN
2000     NaN   1.5


In [43]:
frame3.T  # Transpose: the states become the rows and the years become the columns

Unnamed: 0,2001,2002,2003,2000
Nevada,2.4,2.9,3.2,
Ohio,1.7,3.6,,1.5


In [44]:
pd.DataFrame(population, index=[2001, 2002, 2003])  # Create a DataFrame with an explicit row index
# Only the listed years are kept, in the given order; 2000 is dropped.
#       Nevada  Ohio
# 2001     2.4   1.7
# 2002     2.9   3.6
# 2003     3.2   NaN

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,3.2,


In [45]:
# Build a DataFrame from a dict of Series: keys are the column names, and the
# Series are aligned on the union of their indexes.
# The slices below are POSITIONAL, so they follow frame3's current row order
# (2001, 2002, 2003, 2000), not the numeric value of the year labels. Plain []
# slicing on an integer-labelled Series is easy to misread as label-based, so
# .iloc is used to make the positional intent explicit (same result).
pdata = {"Ohio": frame3['Ohio'].iloc[:-1],      # every row except the last one (drops 2000)
         "Nevada": frame3['Nevada'].iloc[:2]}   # the first two rows (2001, 2002)
frame4 = pd.DataFrame(pdata)  # Create a DataFrame from the dictionary of Series
print(frame4)  # Display the DataFrame
#       Ohio  Nevada
# 2001   1.7     2.4
# 2002   3.6     2.9
# 2003   NaN     NaN
# Row 2003 is all-NaN: Ohio has no 2003 value and the Nevada slice stops at 2002.

      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9
2003   NaN     NaN


In [46]:
frame3.index.name = 'year'  # Name the row index
frame3.columns.name = 'state'  # Name the column index
print(frame3)  # Display the DataFrame with index and column names
# The names show up in the printed header: 'state' labels the columns (Nevada, Ohio), 'year' labels the rows

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2003      3.2   NaN
2000      NaN   1.5


In [47]:
frame3.to_numpy()  # The values as a 2-D NumPy array (labels are dropped; missing entries stay nan)

array([[2.4, 1.7],
       [2.9, 3.6],
       [3.2, nan],
       [nan, 1.5]])

In [48]:
frame2.to_numpy()  # Columns of mixed types (int, str, float) give a single array with dtype=object

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, -1.2],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [49]:
# Index Objects
# An Index holds the axis labels (plus metadata such as their dtype) of a
# Series or DataFrame. Index objects are immutable.
# Note: this rebinds `obj`, replacing the Series used in the earlier cells.

obj = pd.Series(
    data=np.arange(3.0),
    index=['a', 'b', 'c'],
)
print(obj)  # a 0.0, b 1.0, c 2.0 (float64)

a    0.0
b    1.0
c    2.0
dtype: float64


In [50]:
index = obj.index  # Keep a reference to the Series' Index object (no copy is made here)
print(index)  # Index(['a', 'b', 'c'], dtype='object')

Index(['a', 'b', 'c'], dtype='object')


In [51]:
index[:1], index[:2], index[1:]   
# Slicing an Index returns another Index: the first label, the first two labels, all but the first
# Index(['a'], dtype='object'), Index(['a', 'b'], dtype='object'), Index(['b', 'c'], dtype='object')
# A pandas Index is immutable: its labels cannot be
# modified in place.


(Index(['a'], dtype='object'),
 Index(['a', 'b'], dtype='object'),
 Index(['b', 'c'], dtype='object'))

In [53]:
# An Index cannot be edited in place, so to "modify" one, build a new Index object
new_index = index.tolist()   # copy the labels into a plain, mutable Python list
new_index[1] = 'd'           # replace the label at position 1
index = pd.Index(new_index)  # rebind the name `index` to a new Index; obj.index is not touched
print(index)  # Index(['a', 'd', 'c'], dtype='object')

Index(['a', 'd', 'c'], dtype='object')


In [None]:
# Build a standalone Index from an integer range so it can be reused as the
# index of other objects.
labels = pd.Index(data=np.arange(3))
print(labels)
# Index([0, 1, 2], dtype='int64')


Index([0, 1, 2], dtype='int64')


In [None]:
# Note: this rebinds obj2, replacing the 'd'/'b'/'a'/'c' Series built in In [4].
obj2 = pd.Series([1.5, -2.5, 0], index=labels)  
# Create a Series that uses the existing `labels` Index as its index
print(obj2)  # Display the Series (the integer 0 is stored as 0.0 because the dtype is float64)
# 0    1.5
# 1   -2.5
# 2    0.0
# dtype: float64

0    1.5
1   -2.5
2    0.0
dtype: float64


In [None]:
obj2.index is labels  # Identity check: the Series holds the very same Index object, not a copy
# True


True

In [None]:
# Redisplay frame3 (columns named 'state', rows named 'year') before inspecting its indexes.
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2003,3.2,
2000,,1.5


In [None]:
frame3.columns  # Display the columns of the DataFrame
# Index(['Nevada', 'Ohio'], dtype='object', name='state')

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [None]:
2003 in frame3.columns, 2003 in frame3.index 
# Check whether 2003 is a column label / a row label
# (False, True): 2003 is a row (year) label, not a column

(False, True)

In [None]:
pd.Index(["yelan", "yelan", "keqing","keqing"])
# An Index may contain duplicate labels; the repeated entries are kept as given
# Index(['yelan', 'yelan', 'keqing', 'keqing'], dtype='object')

Index(['yelan', 'yelan', 'keqing', 'keqing'], dtype='object')

Index(['yelan', 'yelan', 'keqing', 'keqing'], dtype='object')