## DATAFRAMES

In [2]:
import pandas as pd
import numpy as np 
import pdfplumber
import logging

## 1. What is a DataFrame?
#### A DataFrame is a 2-dimensional labeled data structure in pandas, similar to a table in SQL or Excel, with rows and columns.

#### Key Characteristics:
- Rows: Identified by an index (default is integers starting at 0).
- Columns: Named and can hold different data types (strings, numbers, etc.).
- Data Types: Each column can have a different type (e.g., int, float, object).

## 2. Creating DataFrames
- From a Dictionary

In [4]:
# import pandas as pd

data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [50000, 60000, 70000]
}
df = pd.DataFrame(data)
df


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,35,70000


- From a List of Dictionaries

In [6]:
data = [
    {'Name': 'Alice', 'Age': 25, 'Salary': 50000},
    {'Name': 'Bob', 'Age': 30, 'Salary': 60000},
    {'Name': 'Charlie', 'Age': 35, 'Salary': 70000}
]
df = pd.DataFrame(data)
df


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,35,70000


- From a CSV File

In [10]:
emp_df = pd.read_csv('employee_data.csv')
emp_df

Unnamed: 0,ID,Name,Age,Department,Location,Salary
0,1,Ankit Sharma,28,Engineering,Delhi,85000
1,2,Neha Gupta,32,Marketing,Mumbai,78000
2,3,Rahul Verma,29,Engineering,Bangalore,95000
3,4,Swati Patel,35,Finance,Chennai,110000
4,5,Vikram Singh,40,HR,Delhi,70000
5,6,Suman Reddy,26,Engineering,Hyderabad,90000
6,7,Meena Kaur,30,Sales,Mumbai,67000
7,8,Arun Kumar,34,Marketing,Delhi,80000
8,9,Preeti Joshi,27,Engineering,Bangalore,87000
9,10,Akash Singh,36,Finance,Hyderabad,120000


## 3. Basic Operations
### View Data
1. df.head(n) – First n rows (default is 5).
2. df.tail(n) – Last n rows.
3. df.shape – (rows, columns).
4. df.info() – Summary of DataFrame.
5. df.describe() – Descriptive statistics for numerical columns.

#### Access Columns
1. df['ColumnName'] – Access a column as a Series.
2. df[['Col1', 'Col2']] – Access multiple columns as a DataFrame.
3. Access Rows
4. By Index: df.loc[index] or df.iloc[index].

In [11]:
emp_df[['ID','Name','Age']]

Unnamed: 0,ID,Name,Age
0,1,Ankit Sharma,28
1,2,Neha Gupta,32
2,3,Rahul Verma,29
3,4,Swati Patel,35
4,5,Vikram Singh,40
5,6,Suman Reddy,26
6,7,Meena Kaur,30
7,8,Arun Kumar,34
8,9,Preeti Joshi,27
9,10,Akash Singh,36


In [12]:
emp_df.loc[1]

ID                     2
Name          Neha Gupta
Age                   32
Department     Marketing
Location          Mumbai
Salary             78000
Name: 1, dtype: object

In [None]:
emp_df.loc[:] ## emp_df.loc[:] [starting row : Ending rows ]

Unnamed: 0,ID,Name,Age,Department,Location,Salary
0,1,Ankit Sharma,28,Engineering,Delhi,85000
1,2,Neha Gupta,32,Marketing,Mumbai,78000
2,3,Rahul Verma,29,Engineering,Bangalore,95000


## Rename Columns

In [None]:
# df.rename(columns={'OldName': 'NewName'}, inplace=True)

## Drop Columns or Rows

In [None]:
df.drop(columns=['ColumnName'], inplace=True)  # Drop a column
df.drop(index=0, inplace=True)                 # Drop a row

## Grouping and Aggregation
- Group Data

In [17]:
# grouped = df.groupby('ColumnName')['AnotherColumn'].sum()

emp_df_grouped_df = emp_df.groupby('Department')['Name']
emp_df_grouped_df


<pandas.core.groupby.generic.SeriesGroupBy object at 0x000002209C49DD90>

## Iterating Over DataFrames

In [19]:
for index, row in df.iterrows():
    print(row)


ID                       1
Name          Ankit Sharma
Age                     28
Department     Engineering
Location             Delhi
Salary               85000
Name: 0, dtype: object
ID                     2
Name          Neha Gupta
Age                   32
Department     Marketing
Location          Mumbai
Salary             78000
Name: 1, dtype: object
ID                      3
Name          Rahul Verma
Age                    29
Department    Engineering
Location        Bangalore
Salary              95000
Name: 2, dtype: object
ID                      4
Name          Swati Patel
Age                    35
Department        Finance
Location          Chennai
Salary             110000
Name: 3, dtype: object
ID                       5
Name          Vikram Singh
Age                     40
Department              HR
Location             Delhi
Salary               70000
Name: 4, dtype: object
ID                      6
Name          Suman Reddy
Age                    26
Department    Enginee