In [1]:
import pandas as pd

# Creating a simple Series
obj = pd.Series([4, 7, -5, 3])
print(obj)


0    4
1    7
2   -5
3    3
dtype: int64


In [2]:
print(obj.array)
print(obj.index)


<NumpyExtensionArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)


In [3]:
# Often, you’ll want to create a Series with an index identifying each data point with a label:
obj2 = pd.Series([4, 7, -5, 3], index=["d", "b", "a", "c"])
print(obj2)
print(obj2.index)


d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')


In [4]:
obj2["b"]

7

In [5]:
obj2[1]




  obj2[1]


7

In [6]:
obj2[2:4]

a   -5
c    3
dtype: int64

In [7]:
obj2[[1, 3]]

  obj2[[1, 3]]


b    7
c    3
dtype: int64

In [8]:
obj2[obj2 < 2]

a   -5
dtype: int64

In [9]:
#cjecking data in obj2
print("b" in obj2)
print("e" in obj2)


True
False


In [10]:
# Should you have data contained in a Python dictionary, you can create a Series from it by passing the dictionary:
sdata = {"Ohio": 35000, "Texas": 71000, "Oregon": 16000, "Utah": 5000}
obj3 = pd.Series(sdata)
print(obj3)


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [11]:
# A Series can be converted back to a dictionary with its to_dict method:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [12]:
# When you are only passing a dictionary, 
# the index in the resulting Series will respect the order of the keys according to the dictionary’s keys method, 
# which depends on the key insertion order.
#  You can override this by passing an index with the dictionary keys in the order you want them to appear in the resulting Series:

states = ["California", "Ohio", "Oregon", "Texas"]   #there is no value for california so it will appaear NAN this i s called missing value
obj4 = pd.Series(sdata, index=states)
print(obj4)



California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [13]:
# The isna and notna functions in pandas should be used to detect missing data:
print(pd.isna(obj4))
print("\n")
print(pd.notna(obj4))
print("\n")
print(obj4.isna())


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [14]:
print(obj3)
print(obj4)


Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [15]:
# A useful Series feature for many applications is that it automatically aligns by index label in arithmetic operations:
print(obj3+obj4)

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [16]:
obj4.name = "population"
obj4.index.name = "state"
print(obj4)


state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


## DataFrame

A DataFrame represents a rectangular table of data and contains an ordered, named collection of columns, each of which can be a different value type (numeric, string, Boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dictionary of Series all sharing the same index.

In [17]:
data = {
    "state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 2001, 2002, 2001, 2002, 2003],
    "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}
frame = pd.DataFrame(data)
print(frame)


    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [18]:
# For large DataFrames, the head method selects only the first five rows:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [19]:
# Similarly, tail returns the last five rows:
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [20]:
# Specifying Column Order
# If you specify a sequence of columns, the DataFrame’s columns will be arranged in that order:
data = {"state": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada", "Nevada"],
        "year": [2000, 2001, 2002, 2001, 2002, 2003],
        "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
pd.DataFrame(data, columns=["year", "state", "pop"])


Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [21]:
# If you pass a column that isn’t contained in the dictionary, it will appear with missing values in the result:
frame2=pd.DataFrame(data, columns=["year", "state", "pop", "debt"])
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [22]:
# Retrieving Columns as Series
# A column in a DataFrame can be retrieved as a Series:
frame2["state"]


0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
5    Nevada
Name: state, dtype: object

In [23]:
# Using dot attribute notation:
frame2.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [24]:
# Retrieving Rows
# Rows can be retrieved by position or name with the special iloc and loc attributes:
frame2.loc[1]

year     2001
state    Ohio
pop       1.7
debt      NaN
Name: 1, dtype: object

In [25]:
# Using iloc:

print(frame2)
print(frame2.iloc[2])

   year   state  pop debt
0  2000    Ohio  1.5  NaN
1  2001    Ohio  1.7  NaN
2  2002    Ohio  3.6  NaN
3  2001  Nevada  2.4  NaN
4  2002  Nevada  2.9  NaN
5  2003  Nevada  3.2  NaN
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: 2, dtype: object


In [26]:
# Modifying Columns
# Columns can be modified by assignment:
frame2["debt"]=242.4
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,242.4
1,2001,Ohio,1.7,242.4
2,2002,Ohio,3.6,242.4
3,2001,Nevada,2.4,242.4
4,2002,Nevada,2.9,242.4
5,2003,Nevada,3.2,242.4


In [27]:
import numpy as np
frame2["debt"] = np.arange(6.)
frame2


Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,0.0
1,2001,Ohio,1.7,1.0
2,2002,Ohio,3.6,2.0
3,2001,Nevada,2.4,3.0
4,2002,Nevada,2.9,4.0
5,2003,Nevada,3.2,5.0


In [28]:
# Adding and Deleting Columns
# Assigning a column that doesn’t exist will create a new column:
frame2["eastern"]=frame["state"]=="Ohio"

frame2

Unnamed: 0,year,state,pop,debt,eastern
0,2000,Ohio,1.5,0.0,True
1,2001,Ohio,1.7,1.0,True
2,2002,Ohio,3.6,2.0,True
3,2001,Nevada,2.4,3.0,False
4,2002,Nevada,2.9,4.0,False
5,2003,Nevada,3.2,5.0,False


In [29]:
# The del keyword will delete columns:
del frame2["eastern"]
frame2.columns


Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [31]:
# Create a Nested Dictionary of Dictionaries
# Here, we'll create a dictionary named populations that contains data for Ohio and Nevada across different years.

populations = {
    "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
    "Nevada": {2001: 2.4, 2002: 2.9}
}

frame3=pd.DataFrame(populations)
print(frame3)

      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4
2002   3.6     2.9


In [33]:
#transpose the dataframe
print(frame3.T)

        2000  2001  2002
Ohio     1.5   1.7   3.6
Nevada   NaN   2.4   2.9


In [34]:
# Specify an Explicit Index
# When an explicit index is specified, the inner dictionary keys are not used.

print(pd.DataFrame(populations, index=[2001, 2002, 2003]))

      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9
2003   NaN     NaN


In [35]:
# Assign Names to Index and Columns
# Assigning names to the index and columns of the DataFrame.
frame3.index.name="year"
frame3.columns.name="state"
print(frame3)

state  Ohio  Nevada
year               
2000    1.5     NaN
2001    1.7     2.4
2002    3.6     2.9


In [36]:
# Convert DataFrame to NumPy Array
# The to_numpy method returns the data in the DataFrame as a 2D NumPy array.
print(frame3.to_numpy())

[[1.5 nan]
 [1.7 2.4]
 [3.6 2.9]]


In [37]:
# Reindex a Series:
# Rearranges the data to align with the new index, introducing NaN for missing index values.

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=["d", "b", "a", "c"])
obj2 = obj.reindex(["a", "b", "c", "d", "e"])
print(obj)

print(obj2)


d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [39]:
# Reindex with forward fill to propagate the last valid observation.
obj3 = pd.Series(["blue", "purple", "yellow"], index=[0, 2, 4])
obj3.reindex(np.arange(6), method="ffill")
obj3

0      blue
2    purple
4    yellow
dtype: object

In [40]:
# Create a DataFrame with custom index and columns:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)), index=["a", "c", "d"], columns=["Ohio", "Texas", "California"])
frame


Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [41]:
# Reindex the DataFrame:
frame2 = frame.reindex(index=["a", "b", "c", "d"])
frame2


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [42]:
# Reindex columns:
states = ["Texas", "Utah", "California"]
frame.reindex(columns=states)


Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [43]:
# Dropping Entries from an Axis
# Create a Series and drop an entry:
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
new_obj = obj.drop("c")
new_obj



a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [45]:
# Drop multiple entries from a Series:
obj.drop(["d", "c"])


a    0.0
b    1.0
e    4.0
dtype: float64

In [48]:
# Create a DataFrame and drop rows:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=["Ohio", "Colorado", "Utah", "New York"], columns=["one", "two", "three", "four"])
data.drop(index=["Colorado", "Ohio"])


Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
# Drop columns:
data.drop(columns=["two"])


Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [53]:
# Indexing, Selection, and Filtering
# Series indexing examples:
obj = pd.Series(np.arange(4.), index=["a", "b", "c", "d"])
print(obj["b"])
print(obj.iloc[1])
print(obj.iloc[2:4])
print(obj[["b", "a", "d"]])
print(obj.iloc[[1, 3]])
print(obj[obj < 2])


1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64


In [54]:
# Series indexing with loc:
obj.loc[["b", "a", "d"]]


b    1.0
a    0.0
d    3.0
dtype: float64

In [56]:
# Series indexing with iloc:
obj1 = pd.Series([1, 2, 3], index=[2, 0, 1])
obj2 = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(obj1.iloc[[0, 1, 2]])
print(obj2.iloc[[0, 1, 2]])


2    1
0    2
1    3
dtype: int64
a    1
b    2
c    3
dtype: int64


In [59]:
# DataFrame indexing examples:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=["Ohio", "Colorado", "Utah", "New York"], columns=["one", "two", "three", "four"])
print(data)
print("\n")
print(data["two"])
print(data[["three", "one"]])
print(data[:2])
print(data[data["three"] > 5])


          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [62]:
# DataFrame boolean indexing:

print(data < 5)

print("\n")
data[data < 5] = 0
print(data)


            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False


          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [67]:
# Selection with loc and iloc
print(data)
print("\n")

print(data.loc["Colorado"])
print("\n")
print(data.loc[["Colorado", "New York"]])
print("\n")
print(data.loc["Colorado", ["two", "three"]])


          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


one      0
two      5
three    6
four     7
Name: Colorado, dtype: int32


          one  two  three  four
Colorado    0    5      6     7
New York   12   13     14    15


two      5
three    6
Name: Colorado, dtype: int32


Sorting and Ranking in Pandas

In [68]:
# Sorting a Series by index:
obj = pd.Series(np.arange(4), index=["d", "a", "b", "c"])
print(obj)

d    0
a    1
b    2
c    3
dtype: int32


In [69]:
#sorting 
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int32

In [72]:
# Sorting a DataFrame by index:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=["d", "a", "b", "c"])
print(frame)
print("\n\n")
print("sorting by index \n\n",frame.sort_index())

       d  a  b  c
three  0  1  2  3
one    4  5  6  7



sorting by index 

        d  a  b  c
one    4  5  6  7
three  0  1  2  3


In [73]:
# Sorting a DataFrame by columns:
frame.sort_index(axis="columns")

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [74]:
frame.sort_index(axis="columns", ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [78]:
# Sorting a Series by its values:
obj = pd.Series([4, 7, -3, 2])
print(obj)
print("\n\n")

print("sorting by values\n",obj.sort_values())

0    4
1    7
2   -3
3    2
dtype: int64



sorting by values
 2   -3
3    2
0    4
1    7
dtype: int64


In [79]:
# Sorting a Series with NaN values:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
print(obj)

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64


In [80]:

obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [81]:
#set the position  of the NaN values at the beginning
obj.sort_values(na_position="first")

1    NaN
3    NaN
4   -3.0
5    2.0
0    4.0
2    7.0
dtype: float64

In [83]:
# Sorting a DataFrame by values:

frame = pd.DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
print(frame)

print("\n")
print("sorting the values of the b \n",frame.sort_values("b"))

print("\n")

print("sorting values of a and b \n",frame.sort_values(["a", "b"]))
       


   b  a
0  4  0
1  7  1
2 -3  0
3  2  1


sorting the values of the b 
    b  a
2 -3  0
3  2  1
0  4  0
1  7  1


sorting values of a and b 
    b  a
2 -3  0
0  4  0
3  2  1
1  7  1


In [85]:
# Ranking a Series:

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)

print("ranking")
print(obj.rank())

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
ranking
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


In [86]:
#  Ranking with method="first"
obj.rank(method="first")


0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [87]:
# Ranking a DataFrame:
frame = pd.DataFrame({"b": [4.3, 7, -3, 2], "a": [0, 1, 0, 1], "c": [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [88]:
frame.rank(axis="columns")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## Summarizing and Computing Descriptive Statistics

In [89]:
df = pd.DataFrame({
    'one': [1.4, 7.1, np.nan, 0.75],
    'two': [np.nan, -4.5, np.nan, -1.3]
}, index=['a', 'b', 'c', 'd'])

print(df)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


In [90]:
# This calculates the sum of each column.
# Missing values (NaN) are excluded by default.

print(df.sum())

one    9.25
two   -5.80
dtype: float64


In [91]:
# Sum Across Rows

print(df.sum(axis="columns"))

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [92]:
#calculating mean
df.mean()

one    3.083333
two   -2.900000
dtype: float64

In [93]:
#mean of the rows
df.mean(axis="columns")

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [96]:
# Cumulative Sum
print(df)
print(df.cumsum())

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8


In [97]:
# This provides a summary of statistics for each column.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [99]:
# This counts the number of non-NaN values in each column.
df.count()

one    3
two    2
dtype: int64

In [101]:
# This calculates the minimum value for each column.
df.min()

one    0.75
two   -4.50
dtype: float64

In [102]:
#maximum value
df.max()

one    7.1
two   -1.3
dtype: float64

In [103]:
# This returns the index of the minimum and maximum value for each column.
df.idxmin()

one    d
two    b
dtype: object

In [104]:
df.idxmax()

one    b
two    d
dtype: object

In [106]:
# This calculates the product of all values in each column.
df.prod()

one    7.455
two    5.850
dtype: float64

In [107]:
#standerd deviation
df.std()

one    3.493685
two    2.262742
dtype: float64

In [108]:
# This calculates the 50th percentile (median) for each column.
df.quantile(0.5)

one    1.4
two   -2.9
Name: 0.5, dtype: float64

In [110]:
# This calculates the cumulative minimum for each column.
df.cummin()

Unnamed: 0,one,two
a,1.4,
b,1.4,-4.5
c,,
d,0.75,-4.5


In [111]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [112]:
df.cummax()

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,7.1,-1.3
