# <b>Chapter 13: Introducing Pandas Objects</b>

In [2]:
import numpy as np
import pandas as pd
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import seaborn as sns
# from datetime import datetime
# from dateutil import parser
# from pandas_datareader import data

In [3]:
# Report the installed pandas version, using the `pd` alias that the
# import cell at the top of the notebook already provides (no second import).
pd.__version__

'2.2.3'

## <b>The Pandas Series Object</b>

**A `Pandas Series` is a one-dimensional array of indexed data. It can be created from a
`list` or `array` as follows:**

In [4]:
# Build a Pandas Series on top of a NumPy array.
# np.arange(10, 50, 10) yields the four integers 10, 20, 30, 40.
arr = np.arange(10, 50, 10)

# Wrapping the array in a Series attaches the default integer index 0..3.
series = pd.Series(data=arr)

print(series)

0    10
1    20
2    30
3    40
dtype: int32


In [5]:
# Create a Pandas Series object from a plain Python list.
data = pd.Series([0.25, 0.5, 0.75, 1.0])

# 'data' is an object (instance) of the 'pandas.Series' class.
# That class represents one-dimensional labeled data
# (like a column in a table or a time series).
print(type(data))  # <class 'pandas.core.series.Series'>
print(data)

<class 'pandas.core.series.Series'>
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64


In [6]:
'''
'data.values' and 'data.index' are attributes (or properties) of the 'pandas.Series' object.
'''

"\n'data.values' and 'data.index' are attributes (or properties) of the 'pandas.Series' object.\n"

In [7]:
data.values # This is a 'NumPy array' containing the actual data values in the Series.

array([0.25, 0.5 , 0.75, 1.  ])

In [8]:
data.index #This is a 'Pandas Index' object that contains the labels (by default: integers starting from 0).

RangeIndex(start=0, stop=4, step=1)

In [9]:
'''
🔁 Analogy:
If you think of a Series like a column in Excel:

values = the actual numbers in the cells

index = the row labels (e.g., 0, 1, 2…)
'''

'\n🔁 Analogy:\nIf you think of a Series like a column in Excel:\n\nvalues = the actual numbers in the cells\n\nindex = the row labels (e.g., 0, 1, 2…)\n'

<b>Let's dig into the most useful attributes and methods of a `pandas.Series` object such as `data`.</b>

| Attribute / Method | What it gives you                                         |
|------------------|-----------------------------------------------------------|
| `data.values`     | The underlying NumPy array (the data itself)              |
| `data.index`      | The index (labels for each value)                         |
| `data.dtype`      | Data type of the elements (e.g., `float64`, `int64`)      |
| `data.shape`      | Shape (tuple) — like `(4,)` for a 1D array                |
| `data.size`       | Number of elements in the Series                          |
| `data.name`       | Name of the Series (default: `None`, but can be set)      |
| `data.ndim`       | Number of dimensions (always `1` for Series)              |
| `data.isnull()`   | Boolean Series showing if each value is null              |
| `data.notnull()`  | Opposite of `isnull()`                                    |


In [10]:
data[1]

0.5

In [11]:
data[1:3]

1    0.50
2    0.75
dtype: float64

In [12]:
data=pd.Series([1,2,3,4,5], index=['a','b','c','d','e'])
data

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [13]:
# data[0]  #1
data['a'] #1

1

In [14]:
data['a']

1

In [15]:
data=pd.Series([1,2,3,4,5], index=['2','5','3','4','1'])
data

2    1
5    2
3    3
4    4
1    5
dtype: int64

In [16]:
# The index labels here are strings, so an element is selected by its string label.
# data[2]   # NOTE(review): originally marked "#error"; with string labels pandas 2.x
#           # presumably falls back to positional lookup (deprecated) instead of
#           # raising, as `data[0]  #1` did for the 'a'..'e' index — confirm.
data['1']

5

In [17]:
data=pd.Series([0.25,0.5,0.75,1.0], index = [0, 1, 6, 3])
data

0    0.25
1    0.50
6    0.75
3    1.00
dtype: float64

In [18]:
# data[2]   #error
data[6]

0.75

In [19]:
# Build a Series from a dict: the dict keys become the index labels.
# 'New York' is given as a string while every other value is an int;
# with these mixed value types the resulting Series has dtype 'object'.
population_dict = {'California': 38332521,
'Texas': 26448193,
'New York': "19651127",
'Florida': 19552860,
'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: object

In [20]:
print(population.dtype)

object


In [21]:
# State name -> population, with every population stored as an int,
# so the Series built from it gets an integer dtype.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}

# The dict keys become the index labels of the Series.
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [22]:
population['California']

38332521

In [23]:
population['California':'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

In [24]:
pd.Series([2,4,6])

0    2
1    4
2    6
dtype: int64

In [25]:
pd.Series(5, index=[1,3,5])

1    5
3    5
5    5
dtype: int64

In [26]:
pd.Series({2:'a', 4:'b',6:'c'})

2    a
4    b
6    c
dtype: object

In [27]:
pd.Series({2: 'a', 4: 'b', 6: 'c'}, dtype='string')

2    a
4    b
6    c
dtype: string

In [28]:
pd.Series({2:'a', 4:'b',6:'c'}, index=[2,6])

2    a
6    c
dtype: object

## <b>The Pandas DataFrame Object</b>

**Let's explore the most useful attributes and methods of a `pandas.DataFrame` object.**

| Attribute / Method     | What it gives you                                                                  |
|------------------------|-------------------------------------------------------------------------------------|
| `df.values`            | The underlying NumPy array (2D) with the data                                      |
| `df.columns`           | Labels of the columns (a `pd.Index` object)                                        |
| `df.index`             | Row labels (can be integers or custom labels)                                      |
| `df.dtypes`            | Data types of each column                                                           |
| `df.shape`             | Shape of the DataFrame as a tuple `(rows, columns)`                                |
| `df.size`              | Total number of elements (`rows × columns`)                                        |
| `df.ndim`              | Number of dimensions (always `2` for DataFrame)                                     |
| `df.head(n)`           | First `n` rows of the DataFrame (default: 5)                                       |
| `df.tail(n)`           | Last `n` rows of the DataFrame                                                     |
| `df.describe()`        | Summary statistics for numeric columns                                              |
| `df.info()`            | Summary of the DataFrame structure (columns, non-null counts, types)               |
| `df.T`                 | Transposed DataFrame (rows become columns and vice versa)                          |
| `df.isnull()`          | DataFrame of same shape with `True` for null/missing values                        |
| `df.notnull()`         | Opposite of `isnull()`                                                              |
| `df.copy()`            | Returns a deep copy of the DataFrame                                               |
| `df.columns.tolist()`  | List of column names (useful for iteration or selection)                           |


In [29]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
'Florida': 170312, 'Illinois': 149995}

In [30]:
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [31]:
states = pd.DataFrame({'population' : population, 'area' : area})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [32]:
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [33]:
states.values

array([[38332521,   423967],
       [26448193,   695662],
       [19651127,   141297],
       [19552860,   170312],
       [12882135,   149995]], dtype=int64)

In [34]:
states.columns

Index(['population', 'area'], dtype='object')

In [35]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [36]:
# A 2D array with 3 rows and 4 columns holding the integers 1..12.
array_2d = np.arange(1, 13).reshape(3, 4)

# Integer indexing drops a dimension: the first row comes back as a
# 1D array, array([1, 2, 3, 4]), with shape (4,).
array_2d[0]

# Slicing keeps both dimensions: the first row comes back as a 2D array,
# array([[1, 2, 3, 4]]), with shape (1, 4) (1 row, 4 columns).
array_2d[:1]

array([[1, 2, 3, 4]])

In [37]:
print(array_2d.size, array_2d.shape, array_2d.ndim, array_2d.dtype)

12 (3, 4) 2 int32


In [38]:
# 'population' is the Series created earlier from 'population_dict'
# (state name -> population). A DataFrame can be built from that single
# Series; 'columns' supplies the label of the resulting column.
pd.DataFrame(population, columns = ['populations'])

Unnamed: 0,populations
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [39]:
population['California']

38332521

In [40]:
# A list of dicts: each dict becomes one row of the DataFrame and the
# dict keys become the column labels.
data = [{'a': i, 'b': 2 * i} for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [41]:
# Scalar column values are broadcast to every row of the given index.
# The row labels are passed explicitly through 'index' rather than as a
# second positional dict: that dict was silently interpreted as the index,
# so only its keys ('b', 'c') were used and its values (4, 7) were ignored.
pd.DataFrame({'a': 1, 'b': 4}, index=['b', 'c'])

Unnamed: 0,a,b
b,1,4
c,1,4


In [42]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [43]:
z = np.zeros(4)
z

array([0., 0., 0., 0.])

In [44]:
z.dtype

dtype('float64')

In [45]:
# 3x2 DataFrame of random floats with explicit column labels and row labels.
# NOTE(review): no random seed is set in the visible cells, so the values
# shown in the output will presumably differ on every run.
pd.DataFrame(np.random.rand(3,2), columns=['first','second'], index=['a','b','c'])

Unnamed: 0,first,second
a,0.448476,0.996975
b,0.060232,0.256986
c,0.176817,0.18477


In [46]:
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [47]:
A.dtype

dtype([('A', '<i8'), ('B', '<f8')])

In [48]:
v = np.zeros(3, dtype= {'names':('A', 'B'), 'formats':('<i8','<f8')})
v

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [49]:
w = pd.DataFrame(v)
w

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [50]:
w.index

RangeIndex(start=0, stop=3, step=1)

In [51]:
u = w.index[2]
u

2

## <b>The Pandas Index Object</b>

In [52]:
# Build an Index from an explicit list of integers.
# list(range(4)) replaces the needless comprehension; the list is kept on
# purpose, because passing the bare range object would create a RangeIndex
# instead of a plain Index.
ind = pd.Index(list(range(4)))
ind

Index([0, 1, 2, 3], dtype='int64')

In [53]:
ind=pd.Index([2,4,7,9])
ind

Index([2, 4, 7, 9], dtype='int64')

In [54]:
ind[::3]

Index([2, 9], dtype='int64')

In [55]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

4 (4,) 1 int64


In [56]:
# ind[1] = 12     #Index does not support mutable operations

In [57]:
# Common set operations on Python's built-in set type, with examples.
# Only the value of the final expression is displayed as the cell output;
# the results of the other expressions are recorded in the comments.
A = {1, 2, 3, 4}
B = {3, 4, 5, 6}

# Union: elements that are in A, in B, or in both.
A | B
# Output: {1, 2, 3, 4, 5, 6}

A.union(B)  # Method form of A | B

# Intersection: elements that are in both A and B.
A & B
# Output: {3, 4}

A.intersection(B)  # Method form of A & B

# Difference: elements of A that are not in B.
A - B
# Output: {1, 2}

A.difference(B)  # Method form of A - B

# Symmetric difference: elements that are in exactly one of A and B.
A ^ B
# Output: {1, 2, 5, 6}

A.symmetric_difference(B)  # Method form of A ^ B

# Subset and superset checks
A.issubset(B)      # False
A.issuperset(B)    # False
{3, 4}.issubset(A) # True


True

In [58]:
# Now the same operators and methods are applied to Pandas 'Index' objects.
# As the recorded outputs show, the operators |, &, - and ^ work element by
# element here; only the named methods perform real set operations.
A = pd.Index([1, 3, 5, 7, 9,8])
B = pd.Index([1, 7, 5, 3, 9,11])


# Union
A | B
# Output: Index([1, 7, 5, 7, 9, 11], dtype='int64')
# Note: this is NOT a set union. It is the element-wise bitwise OR of the
# two indexes (e.g. 3 | 7 == 7 and 8 | 11 == 11).

A.union(B)
# output: Index([1, 3, 5, 7, 8, 9, 11], dtype='int64')
# Note: .union() is the actual set union; its output here is sorted.


# Intersection
A & B
# Output: Index([1, 3, 5, 3, 9, 8], dtype='int64')
# Element-wise bitwise AND (e.g. 3 & 7 == 3 and 8 & 11 == 8), not a set intersection.

A.intersection(B)  
# Output: Index([1, 3, 5, 7, 9], dtype='int64')

# Difference
A - B
# Output: Index([0, -4, 0, 4, 0, -3], dtype='int64')
# Element-wise subtraction (e.g. 3 - 7 == -4), not a set difference.

A.difference(B)  
# Output: Index([8], dtype='int64')

# Symmetric difference
A ^ B
# Output: Index([0, 4, 0, 4, 0, 3], dtype='int64')
# Element-wise bitwise XOR (e.g. 3 ^ 7 == 4 and 8 ^ 11 == 3).

A.symmetric_difference(B)  
# Output: Index([8, 11], dtype='int64')

# Subset and superset checks
# A.issubset(B)      # 'Index' object has no attribute 'issubset'
# A.issuperset(B)    # 'Index' object has no attribute 'issuperset'
# The built-in set methods accept the Index as the iterable to compare against.
{3, 4}.issubset(A) # False
{3, 8}.issubset(A) # True

True

In [61]:
A | B

Index([1, 7, 5, 7, 9, 11], dtype='int64')

In [60]:
# '&' between two Index objects is applied element by element (bitwise AND),
# not as a set intersection: e.g. 109 & 9 == 9 and 14 & 11 == 10. That is why
# the result contains 10, which appears in neither index, and 3 twice.
indA = pd.Index([1, 3, 5, 7, 109, 14])
indB = pd.Index([1, 7, 5, 3, 9, 11])

result = indA & indB
print(result)
print(type(result))
# The string literal below is the last expression of the cell, so its repr
# is echoed as the cell's output in addition to the two printed lines.
'''
#output:
Index([1, 3, 5, 3, 9, 10], dtype='int64')
<class 'pandas.core.indexes.base.Index'>
'''


Index([1, 3, 5, 3, 9, 10], dtype='int64')
<class 'pandas.core.indexes.base.Index'>


"\n#output:\nIndex([1, 3, 5, 3, 9, 10], dtype='int64')\n<class 'pandas.core.indexes.base.Index'>\n"