# Exercise 8: Dataframe creation and basic Operations

## Create a dataframe

In [273]:
import pandas as pd

In [274]:
data = pd.DataFrame(
    {'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [23, 25, 24, 22, 23],
    'Grade': ['A', 'B', 'A', 'C', 'A'],
    'City': ['New York', 'Paris', 'London', 'Tokyo', 'Sydney']},
    index = [1, 2, 3, 4, 5]
)

## Display the first three rows

In [275]:
data.head(3)

Unnamed: 0,Name,Age,Grade,City
1,Alice,23,A,New York
2,Bob,25,B,Paris
3,Charlie,24,A,London


## Display the basic information about the data

In [276]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 1 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   Grade   5 non-null      object
 3   City    5 non-null      object
dtypes: int64(1), object(3)
memory usage: 200.0+ bytes


## Display summary statistics

In [277]:
data.describe()

Unnamed: 0,Age
count,5.0
mean,23.4
std,1.140175
min,22.0
25%,23.0
50%,23.0
75%,24.0
max,25.0


## Select only the name and age columns

In [278]:
data[['Name', 'Age']]

Unnamed: 0,Name,Age
1,Alice,23
2,Bob,25
3,Charlie,24
4,Diana,22
5,Eve,23


# Exercise 9: Data selection and Filtering

## Filter students with age > 23

In [279]:
data[data['Age'] > 23]

Unnamed: 0,Name,Age,Grade,City
2,Bob,25,B,Paris
3,Charlie,24,A,London


## Filter Students with Grade A

In [280]:
data[data['Grade'] == 'A']

Unnamed: 0,Name,Age,Grade,City
1,Alice,23,A,New York
3,Charlie,24,A,London
5,Eve,23,A,Sydney


## Filter Students from New york or London

In [281]:
data[(data['City'] == "New York") | (data['City'] == 'London')]

Unnamed: 0,Name,Age,Grade,City
1,Alice,23,A,New York
3,Charlie,24,A,London


## Select students with age between 22 and 24

In [282]:
data[(data['Age'] >= 22) & (data['Age'] <= 24)]

Unnamed: 0,Name,Age,Grade,City
1,Alice,23,A,New York
3,Charlie,24,A,London
4,Diana,22,C,Tokyo
5,Eve,23,A,Sydney


## Sort the dataframe by age in decending order

In [283]:
data.sort_values('Age', ascending=False)

Unnamed: 0,Name,Age,Grade,City
2,Bob,25,B,Paris
3,Charlie,24,A,London
1,Alice,23,A,New York
5,Eve,23,A,Sydney
4,Diana,22,C,Tokyo


# Exercise 10: Data Cleaning

## Create the Dataframe

In [284]:
data_dirty = pd.DataFrame(
    {
        'Name' : ['Alice', 'Bob', None, 'Diana', 'Eve'],
        'Age' : [23, None, 22, 24, 23],
        'Salary': [5000, 6000, 4500, None, 5500]
    },
    index = [1, 2, 3, 4, 5]
)

## check for missing values

In [285]:
data_dirty.isna()

Unnamed: 0,Name,Age,Salary
1,False,False,False
2,False,True,False
3,True,False,False
4,False,False,True
5,False,False,False


## Fill missing values in Age with the mean

In [286]:
age_mean = data['Age'].mean()

data_dirty['Age'].fillna(age_mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_dirty['Age'].fillna(age_mean, inplace=True)


## Drop rows with missing name

In [287]:
data_dirty = data_dirty.dropna(subset=['Name'], axis=0)

data_dirty

Unnamed: 0,Name,Age,Salary
1,Alice,23.0,5000.0
2,Bob,23.4,6000.0
4,Diana,24.0,
5,Eve,23.0,5500.0


## Replace Missing salary with 0

In [288]:
data_cleaned = data_dirty.fillna(0)
data_cleaned

Unnamed: 0,Name,Age,Salary
1,Alice,23.0,5000.0
2,Bob,23.4,6000.0
4,Diana,24.0,0.0
5,Eve,23.0,5500.0
