# DataFrame

### 1. Creation of DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Empty DataFrame: calling the constructor with no arguments gives a frame
# with no rows and no columns
df_empty = pd.DataFrame()
df_empty

In [3]:
# From a list: each element becomes one row of the single 'Numbers' column
data_list = list(range(1, 6))
df_list = pd.DataFrame(data=data_list, columns=['Numbers'])
df_list

Unnamed: 0,Numbers
0,1
1,2
2,3
3,4
4,5


In [4]:
# From a dictionary: keys become column names, each list holds that column's values
data_dict = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df_dict = pd.DataFrame(data=data_dict)
df_dict

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [5]:
# From a NumPy array: every inner list is one row, labelled by `columns`
data_array = np.array([
    [10, 20],
    [30, 40],
    [50, 60],
])
df_array = pd.DataFrame(data=data_array, columns=['A', 'B'])
df_array

Unnamed: 0,A,B
0,10,20
1,30,40
2,50,60


In [6]:
# Creating a DataFrame with custom row labels (index) and column labels
data = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])
df = pd.DataFrame(
    data=data,
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
row1,1,2
row2,3,4
row3,5,6


### 2. Viewing Data

In [7]:
# Rebuild the Name/Age frame so this section can be run on its own
data_dict = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
)
df_dict = pd.DataFrame(data=data_dict)
df_dict

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [8]:
# Head of DataFrame: the first 2 rows. Left as the cell's last expression so
# Jupyter renders it as a table, like the other DataFrame cells in this notebook.
df_dict.head(2)

    Name  Age
0  Alice   25
1    Bob   30


In [9]:
# Tail of DataFrame: the last 2 rows. Left as the cell's last expression so
# Jupyter renders it as a table, like the other DataFrame cells in this notebook.
df_dict.tail(2)

      Name  Age
1      Bob   30
2  Charlie   35


In [10]:
# DataFrame info: index range, column names, non-null counts, dtypes and memory usage.
# info() writes the report itself and returns None, so wrapping it in print()
# only appends a stray "None" line after the summary.
df_dict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 180.0+ bytes
None


In [11]:
# DataFrame description: summary statistics for the numeric columns only
# (the text column Name is left out of the result).
# Left as the cell's last expression so Jupyter renders it as a table.
df_dict.describe()

        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0


### 3. Dealing with Rows and Columns

In [12]:
# Rebuild the labelled 3x2 frame used by the selection examples in this section
data = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])
df = pd.DataFrame(
    data=data,
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
row1,1,2
row2,3,4
row3,5,6


In [13]:
# Column selection
print(df['col1'])  # Single column, returned as a Series
# Multiple columns: pass a list of labels, e.g. df[['col1', 'col2']]

row1    1
row2    3
row3    5
Name: col1, dtype: int64


In [14]:
# Row selection
print(df.loc['row1'])  # By row label; the row comes back as a Series keyed by column name

col1    1
col2    2
Name: row1, dtype: int64


In [15]:
print(df.iloc[1])  # By integer position (0-based), so 1 is the second row, 'row2'

col1    3
col2    4
Name: row2, dtype: int64


In [16]:
# Cell selection
print(df.loc['row1', 'col1'])  # By row label and column label; yields a single value

1


In [17]:
print(df.iloc[1, 0])  # By integer positions (0-based): second row, first column

3


### 4. Filtering and Conditional Selection

In [18]:
# Filter rows where Age > 25: the comparison builds a boolean mask and
# indexing with it keeps only the rows where the mask is True.
filtered_df = df_dict[df_dict['Age'] > 25]
# Left as the cell's last expression so Jupyter renders it as a table.
filtered_df

      Name  Age
1      Bob   30
2  Charlie   35


In [19]:
# Using query string syntax: the same Age > 25 filter, written as a string
# that refers to the column by name.
filtered_query = df_dict.query('Age > 25')
# Left as the cell's last expression so Jupyter renders it as a table.
filtered_query

      Name  Age
1      Bob   30
2  Charlie   35


### 5. Sorting Data

In [20]:
# Sorting by column values: rows are ordered by Age, smallest first
sorted_df = df_dict.sort_values(by='Age')
# Left as the cell's last expression so Jupyter renders it as a table.
sorted_df

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [21]:
# Sorting by index: rows are ordered by their index labels
sorted_index_df = df_dict.sort_index()
# Left as the cell's last expression so Jupyter renders it as a table.
sorted_index_df

      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


### 6. Working with Missing Data

In [22]:
# Introducing missing data: Age has no value for the second row
df = pd.DataFrame(
    {
        'Name': ['A', 'B', 'C'],
        'Age': [25, np.nan, 35],
    }
)
df

Unnamed: 0,Name,Age
0,A,25.0
1,B,
2,C,35.0


In [23]:
# Checking for missing values: True marks an entry that is missing
df.isna()

Unnamed: 0,Name,Age
0,False,False
1,False,True
2,False,False


In [24]:
# Filling missing values: every missing entry is replaced by the constant 10
df_filled = df.fillna(10)
df_filled

Unnamed: 0,Name,Age
0,A,25.0
1,B,10.0
2,C,35.0


In [25]:
# Filling missing values with mean
# The fill is scoped to the Age column through a {column: value} mapping. Passing
# the mean as a bare scalar would also write it into missing entries of every
# other column (for example a missing Name).
df_filled_mean = df.fillna(value={'Age': df['Age'].mean()})
df_filled_mean

Unnamed: 0,Name,Age
0,A,25.0
1,B,30.0
2,C,35.0


In [26]:
# Dropping every row that contains at least one missing value
df_dropped = df.dropna(axis=0, how='any')
df_dropped

Unnamed: 0,Name,Age
0,A,25.0
2,C,35.0


### 7. Adding and Removing Data

In [27]:
# Adding a new column: assigning a list to a new label appends it as the last column.
# The assignment changes df_dict in place, so later cells that use df_dict see Country too.
df_dict['Country'] = ['USA', 'Canada', 'UK']
df_dict

Unnamed: 0,Name,Age,Country
0,Alice,25,USA
1,Bob,30,Canada
2,Charlie,35,UK


In [28]:
# Removing a column: the result is a separate frame without Country
df_removed_col = df_dict.drop(columns='Country')
df_removed_col

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [29]:
# Adding new rows: start from a plain mapping of column name -> value
new_row = dict(Name='David', Age=40)
new_row

{'Name': 'David', 'Age': 40}

In [30]:
# Wrap the new row in a list to get a one-row DataFrame that can be concatenated
new_row_df = pd.DataFrame(data=[new_row])
new_row_df

Unnamed: 0,Name,Age
0,David,40


In [31]:
# Concatenate the new row to the existing DataFrame.
# new_row_df has no Country column, so that entry is left missing for David;
# ignore_index=True renumbers the result 0..3 instead of keeping new_row_df's own label 0.
df_appended = pd.concat([df_dict, new_row_df], ignore_index=True)
df_appended

Unnamed: 0,Name,Age,Country
0,Alice,25,USA
1,Bob,30,Canada
2,Charlie,35,UK
3,David,40,


In [32]:
# Dropping rows by index label: the result goes into a new variable and the
# remaining labels are not renumbered (0, 2, 3 stay as they are)
df_removed_row = df_appended.drop(index=1) # drop second row
df_removed_row

Unnamed: 0,Name,Age,Country
0,Alice,25,USA
2,Charlie,35,UK
3,David,40,


### 8. Iterating over Rows and Columns

In [33]:
# Iterating over rows: iterrows() yields (index label, row) pairs, and each
# row's values are looked up by column name
for index, row in df_dict.iterrows():
    print(f"Row {index}: {row['Name']} is {row['Age']} years old")

Row 0: Alice is 25 years old
Row 1: Bob is 30 years old
Row 2: Charlie is 35 years old


In [34]:
# Iterating over columns: df_dict.columns yields the column labels, and
# indexing with a label returns that column as a Series
for col in df_dict.columns:
    print(df_dict[col])

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object
0    25
1    30
2    35
Name: Age, dtype: int64
0       USA
1    Canada
2        UK
Name: Country, dtype: object


### 9. Grouping and Aggregation

Some commonly used **aggregation** functions are:

Function Description:

- `sum()`: Compute the sum of column values
- `min()`: Compute the minimum of column values
- `max()`: Compute the maximum of column values
- `mean()`: Compute the mean of column values
- `size()`: Compute group sizes (number of rows per group)
- `describe()`: Generate descriptive statistics
- `first()`: Return the first value in each group
- `last()`: Return the last value in each group
- `count()`: Count the non-null values in each column
- `std()`: Compute the standard deviation of column values
- `var()`: Compute the variance of column values
- `sem()`: Compute the standard error of the mean of column values

In [50]:
# Marks table: one row per student, one column per subject
df = pd.DataFrame(
    data=[
        [9, 4, 8, 9],
        [8, 10, 7, 6],
        [7, 6, 8, 5],
        [7, 16, 18, 15],
    ],
    columns=['Maths', 'English', 'Science', 'History'],
)
df

Unnamed: 0,Maths,English,Science,History
0,9,4,8,9
1,8,10,7,6
2,7,6,8,5
3,7,16,18,15


In [51]:
# Column-wise totals: one sum per subject
df.sum()

Maths      31
English    36
Science    41
History    35
dtype: int64

In [52]:
# Per-column summary statistics: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,Maths,English,Science,History
count,4.0,4.0,4.0,4.0
mean,7.75,9.0,10.25,8.75
std,0.957427,5.291503,5.188127,4.5
min,7.0,4.0,7.0,5.0
25%,7.0,5.5,7.75,5.75
50%,7.5,8.0,8.0,7.5
75%,8.25,11.5,10.5,10.5
max,9.0,16.0,18.0,15.0


- Next, we use the `agg()` function to calculate the sum, min, and max of each column in our dataset.

In [53]:
# Show the marks table again before aggregating it
df

Unnamed: 0,Maths,English,Science,History
0,9,4,8,9
1,8,10,7,6
2,7,6,8,5
3,7,16,18,15


In [54]:
# agg() with a list of function names returns one row per function and one column per subject
df.agg(['sum', 'min', 'max'])

Unnamed: 0,Maths,English,Science,History
sum,31,36,41,35
min,7,4,7,5
max,9,16,18,15


**Grouping** splits a dataset into groups according to some criteria. It follows the split-apply-combine strategy:

- Splitting the data into groups based on some criteria.
- Applying a function to each group independently.
- Combining the results into a data structure.

In [55]:
# Show the marks table again before grouping it
df

Unnamed: 0,Maths,English,Science,History
0,9,4,8,9
1,8,10,7,6
2,7,6,8,5
3,7,16,18,15


In [56]:
# Grouping on 'Maths': rows sharing a Maths score form one group and their
# other columns are summed (the two rows with Maths == 7 are combined)
a = df.groupby('Maths').sum()
a

Unnamed: 0_level_0,English,Science,History
Maths,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7,22,26,20
8,10,7,6
9,4,8,9


In [57]:
# Grouping on 'Maths' and 'Science' together: each distinct (Maths, Science) pair is a group
b = df.groupby(['Maths', 'Science']) 
# first() gives the first value of each remaining column within each group
b.first() 

Unnamed: 0_level_0,Unnamed: 1_level_0,English,History
Maths,Science,Unnamed: 2_level_1,Unnamed: 3_level_1
7,8,6,5
7,18,16,15
8,7,10,6
9,8,4,9
