In [17]:
# Data Visualization
# BTech Computer Science Stream , January 2025
# Week 4 - Pandas Demonstration Notebook
# Sujithra,  Reg Number , Date: 06/01/2025

In [18]:
import numpy as np
import pandas as pd

# Why Pandas?
1. Pandas contains **data structures and data manipulation tools** designed to make data cleaning and analysis fast and convenient in Python.
2. Pandas is often used with numerical computing tools like NumPy and SciPy, analytical libraries like statsmodels and scikit-learn, and data visualization libraries like matplotlib.
3. Pandas adopts significant parts of NumPy's idiomatic style of array-based computing, especially **array-based functions** and a preference for **data processing without for loops**.
4. Pandas is designed for working with **tabular or heterogeneous data**. NumPy, by contrast, is best suited for working with homogeneously typed numerical array data.
5. The two most important pandas data structures are **Series** and **DataFrame**.

# **Series**

In [19]:
# A Series is a one-dimensional, array-like object: a sequence of values of a
# single type paired with an array of data labels, called its index.
# When no index is passed, the default integer labels 0..n-1 are used.
ser_x = pd.Series(data=[4, 7, -5, -3])
ser_x

0    4
1    7
2   -5
3   -3
dtype: int64

In [20]:
# A Series exposes its values through .array and its labels through .index.
# A notebook cell echoes only its final expression, so the .array line is kept
# as a commented alternative (make it the last line to see the values).
# ser_x.array
ser_x.index

RangeIndex(start=0, stop=4, step=1)

In [21]:
# A Series can be created with an explicit index so that every data point is
# identified by a label.
ser_y = pd.Series(data=[4, 7, -5, 3], index=list("dbac"))
ser_y.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [22]:
# Labels in the index can be used to select (or assign to) a single value or a
# list of values; a list of labels returns the values in the order listed.
# NOTE: the assignment modifies ser_y in place, so later cells see d == 6.
ser_y["d"] = 6
ser_y[["c", "a", "d"]]

c    3
a   -5
d    6
dtype: int64

In [23]:
# NumPy-like operations work on a Series: filtering with a Boolean array,
# scalar multiplication, or applying math functions.
# The index-value link is preserved in the result.
# Only the last expression of a cell is echoed, so the alternatives are kept
# commented out; make one of them the final line to see its result.
# ser_y
ser_y[ser_y > 0]
# ser_y * 2
# np.exp(ser_y)
# "b" in ser_y

d    6
b    7
c    3
dtype: int64

In [24]:
# A Series can be thought of as a fixed-length, ordered dictionary: a mapping
# of index values to data values. A dictionary can therefore be converted
# directly to a Series, here a dictionary of state -> population.
# (The name ser_cities is kept because a later cell uses it, although the keys
# are states.)
dict_x = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
# dict_x
ser_cities = pd.Series(dict_x)
ser_cities

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [25]:
# The key order of the dictionary can be overridden by passing an index that
# lists the labels in the order we want in the resulting Series.
states = ["California", "Ohio", "Oregon", "Texas"]
ser_z = pd.Series(dict_x, index=states)
ser_z

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [26]:
# Values found in dict_x were placed at the matching labels of ser_z; since no
# value for "California" was found, it appears as NaN (Not a Number), which
# pandas uses to mark missing or NA values.
# Since "Utah" was not included in states, it is excluded from the resulting object.
# "missing", "NA", and "null" are used interchangeably to refer to missing data.
# The isna and notna functions in pandas should be used to detect missing data:
pd.isna(ser_z)
# pd.notna(ser_z)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [27]:
# A useful Series feature is that arithmetic operations automatically align the
# operands by index label. A label that is missing from one operand, or whose
# value is NaN, gives NaN in the result (here "Utah" is absent from ser_z and
# "California" is NaN in ser_z and absent from ser_cities).
ser_cities + ser_z

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

# **DataFrame**

A **DataFrame** represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type (numeric, string, Boolean, etc.).
The DataFrame has both a row and column index; it can be thought of as a dictionary of Series all sharing the same index.

In [28]:
# Build a DataFrame from a dictionary of equal-length lists: each key becomes a
# column name and each list supplies that column's values.
data = dict(
    state=["Ohio"] * 3 + ["Nevada"] * 3,
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
# data
data_x = pd.DataFrame(data)
data_x

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [29]:
# In a Jupyter notebook, pandas DataFrame objects are displayed as a more
# browser-friendly HTML table.
# head() and tail() display the first and last five rows.
# If you specify a sequence of columns, the DataFrame's columns will be arranged
# in that order.
# NOTE: `data` must still be the dict that data_x was built from; a later cell
# rebinds `data` to a DataFrame, so this cell is order-dependent.
# data_x
# data_x.head()
# data_x.tail()
pd.DataFrame(data, columns=["year", "state", "pop"])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [30]:
# A column of a DataFrame can be retrieved as a Series with dict-like notation:
data_x["pop"]
# Attribute access also works when the column name is a valid Python identifier
# that does not clash with an existing DataFrame attribute or method, e.g.:
#data_x.year
# (data_x.pop would return the DataFrame.pop method, not the "pop" column.)

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [31]:
# Columns can be added by assignment (a scalar is repeated for every row) and
# deleted with del.
# NOTE: the assignment modifies data_x in place.
data_x["debt"] = 16.5
# data_x["eastern"] = data_x["state"] == "Ohio"
# del data_x["eastern"]
data_x


Unnamed: 0,state,year,pop,debt
0,Ohio,2000,1.5,16.5
1,Ohio,2001,1.7,16.5
2,Ohio,2002,3.6,16.5
3,Nevada,2001,2.4,16.5
4,Nevada,2002,2.9,16.5
5,Nevada,2003,3.2,16.5


In [32]:
# Build a DataFrame from a NumPy array: the integers 0..15 reshaped to 4 x 4,
# with explicit row and column labels.
# NOTE: this rebinds `data`, which earlier held the dict used to build data_x.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Ohio", "Colorado", "Utah", "New York"],
    columns=["one", "two", "three", "four"],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
# The drop method returns a new object with the indicated labels deleted from
# an axis; `data` itself is left unchanged (later cells still show all four rows).
data.drop(index=["Colorado", "Ohio"])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [34]:
# To access specific columns, index with a column name (returns a Series) or a
# list of names (returns a DataFrame).
# Only the last expression of a cell is echoed, so the selections are kept as
# commented alternatives; make one of them the final line to see its result.
# data["two"]
# data[["three", "one"]]
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [35]:
# Rows can be selected with a slice or with a Boolean array; here the mask
# data["three"] > 7 keeps the rows whose "three" value is greater than 7.
# data
# data[2:3]
data[data["three"] > 7]

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [36]:
# Comparing a DataFrame with a scalar produces a DataFrame of Boolean values,
# which can in turn be used to assign values.
# Only the last expression of a cell is echoed, so both lines are kept as
# commented alternatives; note that the assignment would modify `data` in
# place for every later cell.
# data < 5
# data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
### DataFrame has the special attributes loc and iloc for label-based and integer-based indexing, respectively.
### Since a DataFrame is two-dimensional, you can select a subset of the rows and columns with NumPy-like
### notation using either axis labels (loc) or integers (iloc).
### .loc (label-based indexing) accesses data using labels (index names or column names).
### It can handle Boolean conditions, and label slices include both the start and the end label.
### Passing a list of labels returns the rows in the order listed.
data.loc[["Colorado", "Ohio"]]
# data.iloc[2]
# data.iloc[[2, 1]]
# data.iloc[2, [3, 0, 1]]


Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Ohio,0,1,2,3


In [38]:
# Select the rows up to and including "Utah" (label slices are end-inclusive)
# and the single column "two"; the result is a Series.
data.loc[:"Utah", "two"]

Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [39]:
# A Boolean Series (here data.three >= 2, which is True for every row) can be
# passed to loc to filter rows.
# iloc does not accept a Boolean Series; it needs a plain Boolean array or list
# instead (e.g. mask.to_numpy()) -- see the pandas indexing documentation.
data.loc[data.three >= 2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [40]:
# Arithmetic and data alignment
# pandas can add objects whose indexes differ: where the index pairs are not
# the same, the index of the result is the union of the two indexes.
s1 = pd.Series(data=[7.3, -2.5, 3.4, 1.5], index=list("acde"))
s2 = pd.Series(data=[-2.1, 3.6, -1.5, 4, 3.1], index=list("acefg"))
s1
# s2

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [41]:
# Labels present in only one operand ("d" in s1; "f" and "g" in s2) have no
# partner value, so they come out as NaN in the sum.
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [42]:
# For DataFrames, alignment is performed on both the rows and the columns.
df1 = pd.DataFrame(
    data=np.arange(9, dtype=float).reshape(3, 3),
    index=["Ohio", "Texas", "Colorado"],
    columns=["b", "c", "d"],
)
df2 = pd.DataFrame(
    data=np.arange(12, dtype=float).reshape(4, 3),
    index=["Utah", "Ohio", "Texas", "Oregon"],
    columns=["b", "d", "e"],
)
df1
# df2

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [43]:
# The result is labelled by the union of both row indexes and both column sets.
# Only cells present in both frames (rows Ohio/Texas, columns b/d) hold a
# value; every other cell is NaN.
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [44]:
# sort_index sorts lexicographically by row (or column) label and returns a
# new, sorted object.
obj = pd.Series(data=np.arange(4), index=list("dabc"))
# obj
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [45]:
# With a DataFrame, you can sort by index on either axis.
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=["three", "one"],
                     columns=["d", "a", "b", "c"])
# Only the last expression of a cell is echoed, so the other views are kept as
# commented alternatives; make one of them the final line to see its result.
# frame
# frame.sort_index()
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [46]:
# Sorting is ascending by default; ascending=False orders the column labels in
# descending order instead.
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [47]:
# sort_values orders a Series by its values (ascending by default); each value
# keeps its original index label.
obj = pd.Series(data=[4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [48]:
# Missing values are placed at the end of a value sort by default;
# na_position="first" moves them to the front instead.
obj = pd.Series(data=[4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
# obj.sort_values(na_position="first")

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [49]:
# When sorting a DataFrame, the data in one or more columns can be used as the
# sort keys; pass the column name(s) to sort_values.
dat_frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
# Only the last expression of a cell is echoed, so the unsorted frame is kept
# as a commented alternative.
# dat_frame
dat_frame.sort_values("b")

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [50]:
# With several keys, rows are ordered by "a" first and ties in "a" are broken
# by "b".
dat_frame.sort_values(["a", "b"])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [51]:
### Ranking assigns ranks from one through the number of valid data points in
### an array, starting from the lowest value. Series.rank and DataFrame.rank are
### the methods to use; by default, ties are broken by giving each tied group
### its mean rank.
### This cell echoes the raw values; make one of the rank calls the last line
### to see the ranks.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
obj
# obj.rank()
# obj.rank(method="first")
# obj.rank(ascending=False)

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [52]:
# DataFrame.rank can compute ranks over the rows or the columns;
# axis="columns" ranks the values within each row.
# NOTE: this rebinds `frame`, replacing the frame used in the sort_index cells.
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1],
                      "c": [-2, 5, 8, -2.5]})
# Only the last expression of a cell is echoed, so the unranked frame is kept
# as a commented alternative.
# frame
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


In [53]:
# Summarizing and computing descriptive statistics:
# a small frame with missing values to run the reductions on.
df = pd.DataFrame(
    data=[[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=list("abcd"),
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [59]:
# Reductions (sum, mean, idxmax) and accumulations skip NaN values by default.
# df.sum()
# df.sum(axis="columns")
# df.mean(axis="columns")
# df.idxmax()
# cumsum accumulates down each column; NaN entries stay NaN in the output but
# do not reset the running total.
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [60]:
# describe() produces several summary statistics in one shot: count, mean and
# std, plus the five-number summary (min, 25%, 50%, 75%, max) that is used to
# describe the distribution of the data.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [56]:
# For non-numeric data, describe produces alternative summary statistics:
# count, unique, top (most frequent value) and freq.
obj = pd.Series(data=["a", "a", "b", "c"] * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [57]:
# Unique values, value counts, and membership:
# value_counts returns how often each distinct value occurs.
obj = pd.Series(data=list("cadaabbcc"))
# uniques = obj.unique()
obj.value_counts()

c    3
a    3
b    2
d    1
Name: count, dtype: int64

In [58]:
# isin performs a vectorized membership check: the mask is True where the value
# is "b" or "c", and can be used to filter the Series.
mask = obj.isin(["b", "c"])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [61]:
# Bitwise right shift: 100 >> 4 drops the four low bits, i.e. 100 // 2**4 == 6.
# NOTE(review): scratch calculation, not used by any other cell shown here.
100>>4

6