# ***PANDAS***

### Introduction to Data Manipulation with Pandas and NumPy

In this session, we continue our discussion of Python, focusing on data manipulation and analysis with the pandas library. Pandas is a crucial library for exploratory data analysis and feature engineering. We will cover these topics and work with a sample CSV file to demonstrate exploratory data analysis and the handling of missing values.




In [None]:
# Series and DataFrames

In [None]:
import pandas as pd

In [None]:
# Build a Series from a plain Python list; with no index supplied, pandas
# labels the values 0..n-1.
data = list(range(1, 6))
series = pd.Series(data)
print(series, type(series), sep="\n")

0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# Passing a nested list to pd.Series does NOT build a table: each inner list
# becomes one object-dtype element. The result is a Series even though it is
# bound to the name `df`.
data = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df = pd.Series(data)
print(df, type(df), sep="\n")

0    [1, 2, 3]
1    [4, 5, 6]
2    [7, 8, 9]
dtype: object
<class 'pandas.core.series.Series'>


In [None]:
# Build a Series from a dict: the dict keys become the index labels and the
# dict values become the data.
data = dict(a=1, b=2, c=3)
series_dict = pd.Series(data)
print(series_dict, type(series_dict), sep="\n")

a    1
b    2
c    3
dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# Build a Series with custom labels, passing data and index by keyword.
data = list(range(10, 60, 10))
index = list('abcde')
series = pd.Series(index=index, data=data)
print(series, type(series), sep="\n")

a    10
b    20
c    30
d    40
e    50
dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# Same Series as the keyword form, but with data and index passed
# positionally (data first, index second).
data = [10 * step for step in range(1, 6)]
index = ['a', 'b', 'c', 'd', 'e']
series = pd.Series(data, index)
print(series, type(series), sep="\n")

a    10
b    20
c    30
d    40
e    50
dtype: int64
<class 'pandas.core.series.Series'>


In [None]:
# DataFrames
# Build a DataFrame from a dictionary of lists: each key becomes a column
# name and each list supplies that column's values, one per row.
data = dict(
    Names=['Krishna', 'shankar', 'priya'],
    Age=[33, 23, 30],
    City=['Chennai', 'Florida', 'Swiss'],
)

df = pd.DataFrame(data)
print(df, type(df), sep="\n")

     Names  Age     City
0  Krishna   33  Chennai
1  shankar   23  Florida
2    priya   30    Swiss
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Build a DataFrame from a list of lists: each inner list is one row, and
# with no names supplied both rows and columns get default integer labels.
data = [list(range(first, first + 3)) for first in (1, 4, 7)]
df = pd.DataFrame(data)
print(df, type(df), sep="\n")

   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Build a DataFrame from a list of dictionaries: each dict is one row and
# its keys are the column names.
data = [
    dict(Name=name, Age=age, City=city)
    for name, age, city in [
        ('Krishna', 33, 'Chennai'),
        ('shankar', 23, 'Florida'),
        ('priya', 30, 'Swiss'),
    ]
]
df = pd.DataFrame(data)
print(df, type(df), sep="\n")

      Name  Age     City
0  Krishna   33  Chennai
1  shankar   23  Florida
2    priya   30    Swiss
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Read a tab-separated file straight from a URL.
# NOTE(review): this URL answered "HTTP Error 404: Not Found" the last time the
# notebook was executed -- confirm the file's current location.
from urllib.error import URLError  # HTTPError is a subclass of URLError

CHIPOTLE_URL = "https://raw.githubusercontent.com/Laxminarayen/Inceptez-batch-25-Classwork/refs/heads/main/Python-Class10-Pandas-Intermediate/chipotle.tsv"

try:
    df = pd.read_csv(CHIPOTLE_URL, delimiter='\t')
except URLError as err:
    # Report the failure instead of aborting "Run All"; `df` keeps whatever
    # value it had before this cell.
    print(f"Could not download {CHIPOTLE_URL}: {err}")
else:
    print(type(df))
    # Show only the first and last rows rather than dumping the whole frame.
    # Both are printed explicitly because a bare expression that is not the
    # last line of a cell is evaluated and then discarded.
    print(df.head())
    print(df.tail())

HTTPError: HTTP Error 404: Not Found

In [None]:
# Rebuild the small people DataFrame from a list of dictionaries (one dict
# per row) so the indexing examples that follow start from a known `df`.
data = [
    dict(zip(('Name', 'Age', 'City'), row))
    for row in (
        ('Krishna', 33, 'Chennai'),
        ('shankar', 23, 'Florida'),
        ('priya', 30, 'Swiss'),
    )
]
df = pd.DataFrame(data)
print(df, type(df), sep="\n")

      Name  Age     City
0  Krishna   33  Chennai
1  shankar   23  Florida
2    priya   30    Swiss
<class 'pandas.core.frame.DataFrame'>


In [None]:
# Select a single column by its label.
df['Name']

Unnamed: 0,Name
0,Krishna
1,shankar
2,priya


In [None]:
print(type(df['Name'])) # a single selected column is a pandas Series

<class 'pandas.core.series.Series'>


In [None]:
# Label-based row access: the row whose index label is 0, returned as a
# Series keyed by column name.
df.loc[0]

Unnamed: 0,0
Name,Krishna
Age,33
City,Chennai


In [None]:
# Position-based access to one cell: row position 0, column position 1 (Age).
# A single iloc[row, col] lookup replaces the chained df.iloc[0][1], whose
# second step indexes a label-indexed Series by integer position -- a
# deprecated pattern that emits a FutureWarning.
df.iloc[0, 1]

  df.iloc[0][1]


np.int64(33)

In [None]:
# Access a single element with .at, addressed by row label and column label.
print(df.at[0,'Age'])

33


In [None]:
# Access a single element with .iat, addressed by integer row and column
# positions (row 2, column 2 here).
print(df.iat[2,2])

Swiss


In [None]:
# Data manipulation with DataFrames
# Show the starting frame before any columns are added or removed.
df

Unnamed: 0,Name,Age,City
0,Krishna,33,Chennai
1,shankar,23,Florida
2,priya,30,Swiss


In [None]:
# Assigning a scalar broadcasts it to every row. The salary is stored as a
# number (not the string '1000') so the column is numeric, not object dtype.
df['salary'] = 1000

In [None]:
# Add a new column from a list, supplying one value per existing row.
df['Salary_New'] = [1500,2000,2030]
df

Unnamed: 0,Name,Age,City,salary,Salary_New
0,Krishna,33,Chennai,1000,1500
1,shankar,23,Florida,1000,2000
2,priya,30,Swiss,1000,2030


In [None]:
# Removing a column
# Without axis=1, drop() looks for the label in the row index, so passing a
# column name raises KeyError. The error is the point of this cell, so it is
# caught and printed instead of aborting "Run All".
try:
    df.drop('Salary_New')
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['Salary_New'] not found in axis"

In [None]:
# axis=0 is the default and targets the row index, so this fails exactly like
# the call with no axis argument: 'Salary_New' is not a row label. The
# KeyError is caught and printed so the notebook keeps running.
try:
    df.drop('Salary_New',axis=0)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "['Salary_New'] not found in axis"

In [None]:
# axis=1 makes drop() look for the label among the columns, so the column is
# removed from the returned frame. The result is not assigned, so df itself
# is left unchanged.
df.drop('Salary_New',axis=1)

Unnamed: 0,Name,Age,City,salary
0,Krishna,33,Chennai,1000
1,shankar,23,Florida,1000
2,priya,30,Swiss,1000


In [None]:
# The Salary_New column is still here: drop() returned a new DataFrame and
# that result was never assigned back to df.
df

Unnamed: 0,Name,Age,City,salary,Salary_New
0,Krishna,33,Chennai,1000,1500
1,shankar,23,Florida,1000,2000
2,priya,30,Swiss,1000,2030


In [None]:
# To make a drop stick, either assign the result back or pass inplace=True.
# Note that this cell removes the 'salary' column (not 'Salary_New'),
# mutating df in place.
df.drop('salary',axis=1,inplace=True)
df

Unnamed: 0,Name,Age,City,Salary_New
0,Krishna,33,Chennai,1500
1,shankar,23,Florida,2000
2,priya,30,Swiss,2030


In [None]:
# Confirm the in-place drop persisted: 'salary' is gone in a later cell too.
df

Unnamed: 0,Name,Age,City,Salary_New
0,Krishna,33,Chennai,1500
1,shankar,23,Florida,2000
2,priya,30,Swiss,2030


In [None]:
# Add one year to every age in a single vectorised operation.
# Re-running this cell increments the ages again.
df['Age'] = df['Age'].add(1)
df

Unnamed: 0,Name,Age,City,Salary_New
0,Krishna,34,Chennai,1500
1,shankar,24,Florida,2000
2,priya,31,Swiss,2030


In [None]:
# Remove the row whose index label is 0 (drop() targets rows by default),
# mutating df in place.
df.drop(0,inplace=True)
df

Unnamed: 0,Name,Age,City,Salary_New
1,shankar,24,Florida,2000
2,priya,31,Swiss,2030


In [None]:
# The remaining rows keep their original index labels (1 and 2).
df

Unnamed: 0,Name,Age,City,Salary_New
1,shankar,24,Florida,2000
2,priya,31,Swiss,2030


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Unnamed: 0,Age,Salary_New
count,2.0,2.0
mean,27.5,2015.0
std,4.949747,21.213203
min,24.0,2000.0
25%,25.75,2007.5
50%,27.5,2015.0
75%,29.25,2022.5
max,31.0,2030.0


In [None]:
# Structural overview: index range, column names, non-null counts, dtypes
# and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 1 to 2
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        2 non-null      object
 1   Age         2 non-null      int64 
 2   City        2 non-null      object
 3   Salary_New  2 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 196.0+ bytes


In [None]:
# Data type of each column.
df.dtypes

Unnamed: 0,0
Name,object
Age,int64
City,object
Salary_New,int64


In [None]:
# Column-wise missing-value check: True for any column that contains at
# least one null. (A bare df.isnull() ahead of this line was evaluated and
# then discarded, so it has been removed.)
df.isnull().any()

In [None]:
# Current frame, shown before the column rename below.
df

Unnamed: 0,Name,Age,City,Salary_New
1,shankar,24,Florida,2000
2,priya,31,Swiss,2030


In [None]:
# Rename a column via an {old_name: new_name} mapping. The result is
# assigned back to df so the new name persists.
df = df.rename(columns = {'Name':'Full Name'})

In [None]:
# 'Name' now appears as 'Full Name'.
df

Unnamed: 0,Full Name,Age,City,Salary_New
1,shankar,24,Florida,2000
2,priya,31,Swiss,2030


In [None]:
# Column dtypes again, now listed under the renamed column.
df.dtypes

Unnamed: 0,0
Full Name,object
Age,int64
City,object
Salary_New,int64


In [None]:
# Change a data type: store a float copy of the integer Age column in a new
# column, leaving Age itself untouched.
df['New Age'] = df['Age'].astype(float)
df

Unnamed: 0,Full Name,Age,City,Salary_New,New Age
1,shankar,24,Florida,2000,24.0
2,priya,31,Swiss,2030,31.0


In [None]:
# Frame with the added float column 'New Age'.
df

Unnamed: 0,Full Name,Age,City,Salary_New,New Age
1,shankar,24,Florida,2000,24.0
2,priya,31,Swiss,2030,31.0


In [None]:
# Apply a function (custom or lambda) to every element of a column; here each
# salary is doubled and stored in a new column.
df['Updated Salary'] = df['Salary_New'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Full Name,Age,City,Salary_New,New Age,Updated Salary
1,shankar,24,Florida,2000,24.0,4000
2,priya,31,Swiss,2030,31.0,4060


In [None]:
# Group rows by City and compute the mean Salary_New within each group.
grouped_mean = df.groupby('City')['Salary_New'].mean()
print(grouped_mean)

City
Florida    2000.0
Swiss      2030.0
Name: Salary_New, dtype: float64


In [None]:
# Group by two keys (City, then Full Name) and sum Salary_New per pair; the
# result is indexed by both keys.
grouped_sum = df.groupby(['City','Full Name'])['Salary_New'].sum()
print(grouped_sum)

City     Full Name
Florida  shankar      2000
Swiss    priya        2030
Name: Salary_New, dtype: int64


In [None]:
# Same two-key grouping, aggregated with mean() instead of sum().
# NOTE(review): despite its name, `grouped_sum` holds per-group MEANS here and
# overwrites the sums computed in the previous cell -- consider renaming.
grouped_sum = df.groupby(['City','Full Name'])['Salary_New'].mean()
grouped_sum

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary_New
City,Full Name,Unnamed: 2_level_1
Florida,shankar,2000.0
Swiss,priya,2030.0


In [None]:
# Compute several aggregations at once: agg() with a list of function names
# yields one output column per aggregation (sum, mean, count) for each City.
grouped_agg = df.groupby('City')['Salary_New'].agg(['sum','mean','count'])
grouped_agg

Unnamed: 0_level_0,sum,mean,count
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Florida,2000,2000.0,1
Swiss,2030,2030.0,1


In [None]:
# Merging and joining DataFrames
# Two small frames sharing a 'key' column: keys A and B appear in both,
# C only in df1 and D only in df2.
df1 = pd.DataFrame(dict(key=list('ABC'), values1=[1, 2, 3]))
df2 = pd.DataFrame(dict(key=list('ABD'), values2=[4, 5, 6]))

In [None]:
# Left-hand frame for the merge examples.
df1

Unnamed: 0,key,values1
0,A,1
1,B,2
2,C,3


In [None]:
# Right-hand frame for the merge examples.
df2

Unnamed: 0,key,values2
0,A,4
1,B,5
2,D,6


In [None]:
# Inner merge on 'key': keeps only the keys present in BOTH frames (A and B).
df_merged = pd.merge(df1,df2,on='key',how='inner')
df_merged

Unnamed: 0,key,values1,values2
0,A,1,4
1,B,2,5


In [None]:
# Outer merge on 'key': keeps every key from either frame. Cells with no
# match are filled with NaN, which turns the integer value columns into floats.
df_merged = pd.merge(df1,df2,on='key',how='outer')
df_merged

Unnamed: 0,key,values1,values2
0,A,1.0,4.0
1,B,2.0,5.0
2,C,3.0,
3,D,,6.0


In [None]:
# Left merge on 'key': keeps every key from df1; values2 is NaN where df2 has
# no matching key (C).
df_merged = pd.merge(df1,df2,on='key',how='left')
df_merged

Unnamed: 0,key,values1,values2
0,A,1,4.0
1,B,2,5.0
2,C,3,


In [None]:
# Right merge on 'key': keeps every key from df2; values1 is NaN where df1 has
# no matching key (D).
df_merged = pd.merge(df1,df2,on='key',how='right')
df_merged

Unnamed: 0,key,values1,values2
0,A,1.0,4
1,B,2.0,5
2,D,,6


In [None]:
import pandas as pd
# A fresh 3-row, 2-column frame (columns A and B) for the shape example.
df = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
df


Unnamed: 0,A,B
0,1,4
1,2,5
2,3,6


In [None]:
# shape is a (number_of_rows, number_of_columns) tuple.
print(df.shape)

(3, 2)


In [None]:
import pandas as pd
# transform("mean") returns a result aligned with the original rows: every
# row receives the mean of its own Category group.
df = pd.DataFrame(dict(Category=list("XXYY"), Value=[1, 2, 3, 4]))
result = df.groupby("Category")["Value"].transform("mean")
print(result.tolist())


[1.5, 1.5, 3.5, 3.5]


In [None]:
import pandas as pd
# Highest salary per department, then look up one department by its label.
df = pd.DataFrame(dict(
    Dept=["HR", "HR", "IT", "IT", "Finance"],
    Salary=[5000, 6000, 5500, 7000, 6500],
))
result = df.groupby("Dept")["Salary"].max()
print(result.loc["IT"])


7000


In [None]:
import pandas as pd
# Total points per team, converted to a plain {team: total} dictionary.
df = pd.DataFrame(dict(Team=list("AABB"), Points=[10, 15, 20, 25]))
print(df.groupby("Team")["Points"].sum().to_dict())


{'A': 25, 'B': 45}


# Key Takeaways

*   Pandas is a powerful library for data manipulation and exploratory data analysis in Python.
*   Handling missing values, renaming columns, and changing data types are essential data cleaning steps.
*   Aggregation and grouping operations allow for insightful analysis based on categorical variables.
*   Merging and joining DataFrames enable combining datasets for comprehensive analysis.



