Create a DataFrame with Pandas  
A data frame is a structured representation of data.  
DataFrame = Numbers + Row Index (left) + Column Names (top)

In [22]:
import pandas as pd
d={'column1': [1, 2, 3, 4, 7], 'column2': [4, 5, 6, 9, 5]}
df = pd.DataFrame(data=d)
data=pd.DataFrame({'column_1':[1,2,3,4,5],"column_2":[6,7,8,9,10]})
print(data)
print(df)
data_2=pd.DataFrame([10,20,30],index=["A","B","C"],columns=["index"])
print(data_2)

   column_1  column_2
0         1         6
1         2         7
2         3         8
3         4         9
4         5        10
   column1  column2
0        1        4
1        2        5
2        3        6
3        4        9
4        7        5
   index
A     10
B     20
C     30


Series (1D, like a column in Excel)

In [6]:
a=pd.Series([1,2,3])
print(a)
b=pd.Series([10,20,30],index=["AB","MU","Az"])
print(b)

0    1
1    2
2    3
dtype: int64
AB    10
MU    20
Az    30
dtype: int64


DataFrame (2D, like an Excel table)  
DataFrame = rows + columns with labels.

In [23]:
# DataFrame from dictionary
data = {
    "Name": ["Ali", "Sara", "John"],
    "Age": [22, 25, 30],
    "Score": [88, 92, 79],
}
df = pd.DataFrame(data)
print(df)


   Name  Age  Score
0   Ali   22     88
1  Sara   25     92
2  John   30     79


# Creating from NumPy array

In [24]:
import numpy as np
import pandas as pd
arr=np.array([[1,2,3],[10,20,30]])
df=pd.DataFrame(data=arr,columns=["Roll","Age","marks"])
print(df)
print("\n")
print(df.shape[0]) # number of rows
print("\n")
print(df.shape[1]) # numebr of columns

   Roll  Age  marks
0     1    2      3
1    10   20     30


2


3


# Importing / Exporting Data

In [None]:
# Read CSV
df = pd.read_csv("data.csv")

# Read Excel
df = pd.read_excel("data.xlsx")

# Read JSON
df = pd.read_json("data.json")

# Save DataFrame back to CSV
df.to_csv("output.csv", index=False)




#  Indexing & Selection

In [25]:
# Access by Column
import pandas as pd
print(df)
print("\n")
print(df['Age'])
print("\n")
print(df[['Age']])
print("\n")
print(df[['Age','marks','Roll']])

   Roll  Age  marks
0     1    2      3
1    10   20     30


0     2
1    20
Name: Age, dtype: int32


   Age
0    2
1   20


   Age  marks  Roll
0    2      3     1
1   20     30    10


In [26]:
# Access by row
#.loc[] -> by label(index name)
#.iloc[]-> by position(0,2,1..)
print(df.loc[1])   # single row selection
print(df.iloc[::])  # slicing → keeps multiple rows → DataFrame
print(df.loc[0:2])   # Rows 0 to 2 (inclusive with loc)
print(df.iloc[0:1])  #Rows 0 to 1 (Python slicing rule)

Roll     10
Age      20
marks    30
Name: 1, dtype: int32
   Roll  Age  marks
0     1    2      3
1    10   20     30
   Roll  Age  marks
0     1    2      3
1    10   20     30
   Roll  Age  marks
0     1    2      3


# Slicing rows & columns   
df.loc[rows, cols]

In [27]:
import pandas as pd
data = {
    'Name': ['Ali', 'Abra', 'Xa$', 'Dabra'],
    'Age': [22, 25, 19, 30],
    'Score': [88, 92, 79, 95]
}
df = pd.DataFrame(data)

print(df['Age']) ,   print("\n")          # Single column → Series      
print(df[['Name','Age']]) , print("\n")    # Multiple columns → DataFrame

# Rows 1 to 3, only 'Name' column
print(df.loc[1:3, 'Name'])
print("\n")  
# Rows 0 to 2, columns 'Name' and 'Score'
print(df.loc[0:2],['Name','Score'])



0    22
1    25
2    19
3    30
Name: Age, dtype: int64


    Name  Age
0    Ali   22
1   Abra   25
2    Xa$   19
3  Dabra   30


1     Abra
2      Xa$
3    Dabra
Name: Name, dtype: object


   Name  Age  Score
0   Ali   22     88
1  Abra   25     92
2   Xa$   19     79 ['Name', 'Score']


In [28]:
import pandas as pd
df = pd.DataFrame({
    'Name': ['Ali','Ravi','Zoya','John'],
    'Age': [20, 22, 19, 21],
    'Marks': [90, 70, 85, 95]
})
print(df)


   Name  Age  Marks
0   Ali   20     90
1  Ravi   22     70
2  Zoya   19     85
3  John   21     95


In [29]:
# Column access
print(df['Age'] )       # single column → Series
df[['Age','Marks']]   # multiple columns → DataFrame

0    20
1    22
2    19
3    21
Name: Age, dtype: int64


Unnamed: 0,Age,Marks
0,20,90
1,22,70
2,19,85
3,21,95


In [30]:
# Row access
print(df.iloc[2])       # row by position
df.loc[2]        # row by label (default index == same as above)

Name     Zoya
Age        19
Marks      85
Name: 2, dtype: object


Name     Zoya
Age        19
Marks      85
Name: 2, dtype: object

In [31]:

# Row + Column together
print(df.iloc[1, 0])            # row 1, col 0 → single value
print(df.loc[1, 'Marks'])        # row label 1, col label 'Marks'
df.loc[0:2, ['Age']]      # slice rows + select col

Ravi
70


Unnamed: 0,Age
0,20
1,22
2,19


In [32]:

# Boolean filtering
print(df[df['Marks'] > 80])      # all rows where Marks > 80
df[(df['Age'] > 20) & (df['Marks'] > 85)]   # multiple conditions

   Name  Age  Marks
0   Ali   20     90
2  Zoya   19     85
3  John   21     95


Unnamed: 0,Name,Age,Marks
3,John,21,95


In [33]:
import pandas as pd
df = pd.DataFrame({
    'Name': ['Ali','Ravi','Zoya','John'],
    'Age': [20, 22, 19, 21],
    'Marks': [90, 70, 85, 95]
})
print(df)


   Name  Age  Marks
0   Ali   20     90
1  Ravi   22     70
2  Zoya   19     85
3  John   21     95


Get only the Age column.  
Get the 2nd row using .iloc.  
Get the Marks of 'Zoya' using .loc.  
Get all rows where Marks > 80.  
Get the Name and Marks of students where Age > 20.   

Selecting columns → df['Age']    
Filtering rows → df[df['Age'] > 20]   

In [34]:
# 1 [ df[] takes only 1 parameter , to apply condition use df.loc[]]
print(df["Age"])
print("\n")
print(df.loc[:,'Age'])

0    20
1    22
2    19
3    21
Name: Age, dtype: int64


0    20
1    22
2    19
3    21
Name: Age, dtype: int64


In [35]:
#2
print(df.iloc[1])

Name     Ravi
Age        22
Marks      70
Name: 1, dtype: object


In [36]:
# 3 
df.loc[df['Name'] == 'Zoya', 'Marks']

2    85
Name: Marks, dtype: int64

In [37]:
# 4
df[df["Marks"]>80]

Unnamed: 0,Name,Age,Marks
0,Ali,20,90
2,Zoya,19,85
3,John,21,95


In [38]:
# 5s
df.loc[df['Age'] > 20, ['Name', 'Marks']]

Unnamed: 0,Name,Marks
1,Ravi,70
3,John,95


#     Column Operations , 
# Aggregation & Grouping ,  
# Sorting

In [39]:
import pandas as pd
df=pd.DataFrame({
    'Name': ['Ayaan', 'Zoya', 'Riya', 'Ayaan', 'Zoya'],
    'Class': ['10A', '10A', '10B', '10A', '10B'],
    'Gender': ['M', 'F', 'F', 'M',  'F'],
    'Math': [85, 90, 78, 88, 92],
    'Science': [80, 95, 85, 89, 94],
    'Marks': [82, 92, 80, 86, 93]
})
print(df)

    Name Class Gender  Math  Science  Marks
0  Ayaan   10A      M    85       80     82
1   Zoya   10A      F    90       95     92
2   Riya   10B      F    78       85     80
3  Ayaan   10A      M    88       89     86
4   Zoya   10B      F    92       94     93


In [41]:
# 1. Column Operations
# (i) Add a new column (Total)
df["Total"]=df['Math']+df['Science']
print(df)

    Name Class Gender  Math  Science  Marks  Total
0  Ayaan   10A      M    85       80     82    165
1   Zoya   10A      F    90       95     92    185
2   Riya   10B      F    78       85     80    163
3  Ayaan   10A      M    88       89     86    177
4   Zoya   10B      F    92       94     93    186


In [42]:
# Modify column (Name to uppercase)
df['Name'] = df['Name'].str.upper()
print(df)

    Name Class Gender  Math  Science  Marks  Total
0  AYAAN   10A      M    85       80     82    165
1   ZOYA   10A      F    90       95     92    185
2   RIYA   10B      F    78       85     80    163
3  AYAAN   10A      M    88       89     86    177
4   ZOYA   10B      F    92       94     93    186


In [43]:
# Apply function to a column (add 5 to Marks)
#df["Marks"]=df['Marks']+5
df['Marks']=df['Marks'].apply(lambda x: x + 5)
df

Unnamed: 0,Name,Class,Gender,Math,Science,Marks,Total
0,AYAAN,10A,M,85,80,87,165
1,ZOYA,10A,F,90,95,97,185
2,RIYA,10B,F,78,85,85,163
3,AYAAN,10A,M,88,89,91,177
4,ZOYA,10B,F,92,94,98,186


In [44]:
# 2. Aggregation
# Get average Marks of all students
df['Marks']=df['Marks'].mean()
df 

Unnamed: 0,Name,Class,Gender,Math,Science,Marks,Total
0,AYAAN,10A,M,85,80,91.6,165
1,ZOYA,10A,F,90,95,91.6,185
2,RIYA,10B,F,78,85,91.6,163
3,AYAAN,10A,M,88,89,91.6,177
4,ZOYA,10B,F,92,94,91.6,186


In [47]:
# Sum of Math and Science
total_math_science = df[['Math', 'Science']].sum(axis=1) # Row-wise Sum
total_math_science

0    165
1    185
2    163
3    177
4    186
dtype: int64

In [55]:
# 3. Grouping
# (i) Group by ["split-apply-combine"] Class → find average Marks
# Aggregation: Functions like mean(), sum(), count(), min(), and max
group_class = df.groupby('Class')['Marks'].mean()
print(group_class)
# (ii) Group by Class and calculate Multiple Aggregations
agg_class = df.groupby('Class').agg({'Math':'sum','Science':'mean'})
print(agg_class)

Class
10A    91.6
10B    91.6
Name: Marks, dtype: float64
       Math  Science
Class               
10A     263     88.0
10B     170     89.5


In [56]:
# 4 SORTING
# Sort by Marks (Descending)
df_sorted = df.sort_values(by='Marks',ascending=False)
df_sorted

Unnamed: 0,Name,Class,Gender,Math,Science,Marks,Total
0,AYAAN,10A,M,85,80,91.6,165
1,ZOYA,10A,F,90,95,91.6,185
2,RIYA,10B,F,78,85,91.6,163
3,AYAAN,10A,M,88,89,91.6,177
4,ZOYA,10B,F,92,94,91.6,186


When to use what?  
 
✔ df['col'] → Access a single column   
✔ df[['col1','col2']] → Access multiple columns  
✔ .str.upper() → Apply string operations on columns   
✔ .sum(axis=1) → Row-wise sum (e.g., add subject marks)   
✔ groupby() + mean() / sum() → Aggregate by a category   
✔ .agg({'col':'func'}) → Multiple aggregations at once   
✔ .sort_values() → Sort rows by a column   

# Merging & Joining (combine datasets) 
# Pivot & Crosstab (summary tables)  
# Handling Missing Data (cleaning real datasets)   
# Value_counts & unique (quick insights)

In [12]:
import pandas as pd
df1=pd.DataFrame({'ID':[1,2,3,4],
                  'Name':['Amit','Bina','Chetan','Deepa'],
                  'Class':['A','A','B','B']})
df2= pd.DataFrame({
    'ID':[1,2,3,5],
    'Subject':['Math','Science','Math','Science'],
    'Marks':[90,85,None,75]})
print(df1)
print(df2)


   ID    Name Class
0   1    Amit     A
1   2    Bina     A
2   3  Chetan     B
3   4   Deepa     B
   ID  Subject  Marks
0   1     Math   90.0
1   2  Science   85.0
2   3     Math    NaN
3   5  Science   75.0


In [4]:
# Merge (Combine Both Tables)
merged=pd.merge(df1,df2,on="ID",how='inner')
print(merged)
#merge = mix two tables,
#on='ID' = common column,
#how='inner' = only common rows.

   ID    Name Class  Subject  Marks
0   1    Amit     A     Math   90.0
1   2    Bina     A  Science   85.0
2   3  Chetan     B     Math    NaN


In [7]:
# Handling missing values [Replace NaN marks with average marks]
merged['Marks'].fillna(merged['Marks'].mean(),inplace=True)
merged

Unnamed: 0,ID,Name,Class,Subject,Marks
0,1,Amit,A,Math,90.0
1,2,Bina,A,Science,85.0
2,3,Chetan,B,Math,87.5


In [13]:
# Pivot Table
import pandas as pd

data = {
    'Name': ['Alice', 'Alice', 'Bob', 'Bob'],
    'Subject': ['Math', 'Science', 'Math', 'Science'],
    'Marks': [90, 85, 80, 88]
}
df = pd.DataFrame(data)
print(df)
# original data


    Name  Subject  Marks
0  Alice     Math     90
1  Alice  Science     85
2    Bob     Math     80
3    Bob  Science     88


In [14]:
# Pivot
# We want Name as rows, Subject as columns, and Marks as values:
pivot=df.pivot(index='Name',columns='Subject',values='Marks')
print(pivot)

Subject  Math  Science
Name                  
Alice      90       85
Bob        80       88


In [15]:
# Value Counts
print(merged['Subject'].value_counts())
#Meaning: Count frequency of each subject.

Subject
Math       2
Science    1
Name: count, dtype: int64
