In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
Creating a DataFrame from dictionary

In [1]:
import pandas as pd
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    'Salary': [70000, 80000, 65000, 120000, 95000]
}
df = pd.DataFrame(data)
print(df)

      Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   65000
3    David   32      Houston  120000
4      Eve   29      Phoenix   95000


In [None]:
display

In [2]:
# first 5 values
print("Head of the DataFrame:\n", df.head())

Head of the DataFrame:
       Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   65000
3    David   32      Houston  120000
4      Eve   29      Phoenix   95000


In [3]:
# first 2 values
print("First 2 values:\n", df.head(2))

First 2 values:
     Name  Age         City  Salary
0  Alice   24     New York   70000
1    Bob   27  Los Angeles   80000


In [4]:
# last 5 values
print("\nTail of the DataFrame:\n", df.tail())


Tail of the DataFrame:
       Name  Age         City  Salary
0    Alice   24     New York   70000
1      Bob   27  Los Angeles   80000
2  Charlie   22      Chicago   65000
3    David   32      Houston  120000
4      Eve   29      Phoenix   95000


In [5]:
# shape of the dataframe
print("\nShape of the DataFrame:", df.shape)


Shape of the DataFrame: (5, 4)


In [6]:
# summary statistics
print("\nSummary statistics:\n", df.describe())


Summary statistics:
              Age         Salary
count   5.000000       5.000000
mean   26.800000   86000.000000
std     3.962323   22192.341021
min    22.000000   65000.000000
25%    24.000000   70000.000000
50%    27.000000   80000.000000
75%    29.000000   95000.000000
max    32.000000  120000.000000


In [None]:
Column operations

In [7]:
# column names
print("\nColumn names:", df.columns)


Column names: Index(['Name', 'Age', 'City', 'Salary'], dtype='object')


In [8]:
# a single column
print("\nSelect a single column (Age):\n", df['Age'])


Select a single column (Age):
 0    24
1    27
2    22
3    32
4    29
Name: Age, dtype: int64


In [9]:
# multiple columns
print("\nSelect multiple columns (Name and Salary):\n", df[['Name', 'Salary']])


Select multiple columns (Name and Salary):
       Name  Salary
0    Alice   70000
1      Bob   80000
2  Charlie   65000
3    David  120000
4      Eve   95000


In [10]:
# Filter rows based on a condition
print("\nRows where Age > 25:\n", df[df['Age'] > 25])


Rows where Age > 25:
     Name  Age         City  Salary
1    Bob   27  Los Angeles   80000
3  David   32      Houston  120000
4    Eve   29      Phoenix   95000


In [11]:
# Add a new column
df['Experience'] = [2, 5, 1, 10, 7]
print("\nDataFrame after adding Experience column:\n", df)


DataFrame after adding Experience column:
       Name  Age         City  Salary  Experience
0    Alice   24     New York   70000           2
1      Bob   27  Los Angeles   80000           5
2  Charlie   22      Chicago   65000           1
3    David   32      Houston  120000          10
4      Eve   29      Phoenix   95000           7


In [12]:
# Remove a column
df.drop('City', axis=1, inplace=True)
print("\nDataFrame after removing City column:\n", df)


DataFrame after removing City column:
       Name  Age  Salary  Experience
0    Alice   24   70000           2
1      Bob   27   80000           5
2  Charlie   22   65000           1
3    David   32  120000          10
4      Eve   29   95000           7


In [13]:
# Rename columns
df.rename(columns={'Name': 'Employee Name'}, inplace=True)
print("\nDataFrame after renaming columns:\n", df)


DataFrame after renaming columns:
   Employee Name  Age  Salary  Experience
0         Alice   24   70000           2
1           Bob   27   80000           5
2       Charlie   22   65000           1
3         David   32  120000          10
4           Eve   29   95000           7


In [None]:
Handling missing values

In [14]:
# Adding a NaN value for demonstration
df.at[2, 'Age'] = None
print("\nDataFrame with NaN value:\n", df)


DataFrame with NaN value:
   Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie   NaN   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           7


In [15]:
# Fill missing values
df['Age'].fillna(df['Age'].mean(), inplace=True)
print("\nDataFrame after filling missing values:\n", df)


DataFrame after filling missing values:
   Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie  28.0   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           7


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


In [None]:
Grouping and sorting data

In [17]:
# Grouping data
grouped = df.groupby('Experience').mean()
print("\nGrouped DataFrame by Experience:\n", grouped)

TypeError: agg function failed [how->mean,dtype->object]

In [18]:
# Sorting data
sorted_df = df.sort_values(by='Salary', ascending=False)
print("\nDataFrame sorted by Salary (descending):\n", sorted_df)


DataFrame sorted by Salary (descending):
   Employee Name   Age  Salary  Experience
3         David  32.0  120000          10
4           Eve  29.0   95000           7
1           Bob  27.0   80000           5
0         Alice  24.0   70000           2
2       Charlie  28.0   65000           1


In [None]:
Saving data to csv and reading from it

In [19]:
# Saving DataFrame to a CSV file
df.to_csv('employee_data.csv', index=False)
print("\nDataFrame saved to 'employee_data.csv'")

# Reading DataFrame from a CSV file
df_from_csv = pd.read_csv('employee_data.csv')
print("\nDataFrame read from 'employee_data.csv':\n", df_from_csv)


DataFrame saved to 'employee_data.csv'

DataFrame read from 'employee_data.csv':
   Employee Name   Age  Salary  Experience
0         Alice  24.0   70000           2
1           Bob  27.0   80000           5
2       Charlie  28.0   65000           1
3         David  32.0  120000          10
4           Eve  29.0   95000           7
