# **Pandas**

*   **DataFrame** = a table, like an Excel spreadsheet
*   **Series** = a single column
*   **Index** = the row labels



# Step 1: Creating a Dataframe

In [6]:
import pandas as pd

# Column-oriented input: each key is a column label and each list holds
# that column's values, one entry per row.
data = {}
data["name"] = ["Alice", "Bob", "Charlie"]
data["age"] = [25, 30, 35]
data["salary"] = [70000, 80000, 90000]

# Build the table from the dict. No index is passed, so the rows receive
# the default integer labels 0, 1, 2.
df = pd.DataFrame(data=data)

# Plain-text dump of the whole (three-row) frame.
print(df)

      name  age  salary
0    Alice   25   70000
1      Bob   30   80000
2  Charlie   35   90000


# Step 2: Rows Vs Columns


**Columns**

In [7]:
# Column labels of the frame, returned as a pandas Index object.
# `.columns` is an attribute, so it takes no parentheses.
df.columns

Index(['name', 'age', 'salary'], dtype='object')

**Rows**


In [8]:
# len() of a DataFrame is its number of rows (not columns, not cells).
len(df)

3

# Step 3: .shape - Returns: (rows, columns)

In [9]:
# (number of rows, number of columns) as a tuple.
# `.shape` is an attribute, so it takes no parentheses.
df.shape

(3, 3)

# Step 4: What is a Series?

In [10]:
# Single brackets with one column label return that column as a Series.
df["age"]

Unnamed: 0,age
0,25
1,30
2,35


That is a Series: a single column of values, each paired with its row label from the index.

Rule:


*   `df["one_column"]` (a single label) ---> Series
*   `df[["col_a", "col_b"]]` (a list of labels, even a list with just one label) ---> DataFrame


Example:


In [11]:
# Double brackets pass a LIST of column labels, so the result stays a DataFrame.
df[["age", "salary"]]

Unnamed: 0,age,salary
0,25,70000
1,30,80000
2,35,90000


# Step 5: Selecting Rows (Two Ways)

Position-based (`iloc`)

In [12]:
# Position-based selection: .iloc addresses rows by integer position,
# counting from 0.

# A notebook cell only auto-displays its LAST expression, so the single-row
# lookup is printed explicitly; as a bare expression its result was dropped.
print(df.iloc[0])  # First row (a single row comes back as a Series)

# The slice end is exclusive, so 1:3 selects positions 1 and 2.
df.iloc[1:3]  # Rows 1 and 2

Unnamed: 0,name,age,salary
1,Bob,30,80000
2,Charlie,35,90000


Label-based (`loc`)


In [13]:
# Label-based selection: .loc looks up the row whose index LABEL is 0.
# With the default integer index the label 0 coincides with position 0,
# so here this is the same row that df.iloc[0] selects.
df.loc[0]

Unnamed: 0,0
name,Alice
age,25
salary,70000


# Step 6: Filtering Rows (Critical)

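1.   Build a boolean mask: a comparison such as `df["age"] > 30` returns a Series with one True/False per row (shown below).
2.   Pass the mask back to the DataFrame to keep only the True rows: `df[df["age"] > 30]`.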



In [14]:
# Element-wise comparison: yields a boolean Series with one True/False per
# row. This is only the mask; no rows are filtered out yet.
df["age"] > 30

Unnamed: 0,age
0,False
1,False
2,True


# Step 7: Multiple Conditions

In [15]:
# Each parenthesised comparison yields a boolean Series; & combines them
# element-wise, and the outer df[...] keeps the rows where both are True.
# The parentheses are required because & binds more tightly than >.
df[(df["age"] > 25) & (df["salary"] > 75000)]

Unnamed: 0,name,age,salary
1,Bob,30,80000
2,Charlie,35,90000


  Rules:


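*   Use `&` (element-wise AND), not the Python keyword `and`; likewise use `|` instead of `or`
*   Wrap each condition in parentheses, because `&` binds more tightly than comparisons such as `>`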



# Step 8: Adding a Column

In [18]:
# Assigning to a new column label adds that column to df in place.
# The comparison is strict, so a salary of exactly 80000 is NOT a high earner.
df["high_earner"] = df["salary"] > 80000

# Print the whole frame to confirm that the new boolean column is present.
print(df)

      name  age  salary  high_earner
0    Alice   25   70000        False
1      Bob   30   80000        False
2  Charlie   35   90000         True


The new column is itself a Series (here, a Series of True/False values).

# Step 9: Missing Data (Very Important)

In [20]:
# Returns a frame of the same shape filled with booleans:
# True where a value is missing, False otherwise.
df.isnull()

Unnamed: 0,name,age,salary,high_earner
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False


Returns True where data is missing. Every value is False here because the sample data has no missing values.

## Count missing Values

In [22]:
# True counts as 1 when summed, so summing the boolean frame column by
# column gives the number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
name,0
age,0
salary,0
high_earner,0


## Drop rows with missing data:

In [23]:
# Returns a NEW frame without the rows that contain any missing value.
# The result is not assigned, so df itself is left unchanged; write
# `df_clean = df.dropna()` to keep it. The sample data has no missing
# values, so all three rows are kept here.
df.dropna()

Unnamed: 0,name,age,salary,high_earner
0,Alice,25,70000,False
1,Bob,30,80000,False
2,Charlie,35,90000,True


## Fill missing data:

In [24]:
# Returns a NEW frame in which every missing value is replaced by 0.
# The result is not assigned, so df itself is left unchanged. The sample
# data has no missing values, so the output matches df here.
df.fillna(0)

Unnamed: 0,name,age,salary,high_earner
0,Alice,25,70000,False
1,Bob,30,80000,False
2,Charlie,35,90000,True


# Step 10: Aggregation (Simple Stats)

In [25]:
# A notebook cell only auto-displays its LAST expression, so the two scalar
# aggregates are printed explicitly; as bare expressions they were computed
# and then silently discarded.
print("Mean salary:", df["salary"].mean())  # average of the salary column
print("Max age:", df["age"].max())  # largest value in the age column

# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; shown automatically as the cell's last expression.
df.describe()

Unnamed: 0,age,salary
count,3.0,3.0
mean,30.0,80000.0
std,5.0,10000.0
min,25.0,70000.0
25%,27.5,75000.0
50%,30.0,80000.0
75%,32.5,85000.0
max,35.0,90000.0
