In [2]:
# pandas is widely used for data analysis and data cleaning
# Two data structures:
    # 1. Series (1-D array-like object)
    # 2. DataFrame (2-D homogeneous/heterogeneous tabular data structure with labelled axes) [Mutable]

import pandas as pd

In [None]:
# A pandas Series is a one-dimensional labelled array whose values share a
# single dtype -- comparable to one column of a table.

# Build a Series from a plain list. No index is supplied, so pandas assigns
# the default integer labels 0, 1, 2, ...
my_series = pd.Series(data=[7, 8, 4, 5, 6, 0])

# Show the object's class, then its contents.
print(type(my_series))
print("Series:\n", my_series)

<class 'pandas.core.series.Series'>
Series:
 0    7
1    8
2    4
3    5
4    6
5    0
dtype: int64


In [69]:
# Build a Series from a dictionary: the keys become the index labels and the
# values become the data.
data = {
    'a': 1,
    'b': 2,
    'c': 3,
}
my_series = pd.Series(data=data)

print(type(my_series))
print(my_series)

<class 'pandas.core.series.Series'>
a    1
b    2
c    3
dtype: int64


In [70]:
# Build a Series from two parallel lists: one supplies the index labels,
# the other supplies the values.
indexes = ['i', 'ii', 'iii', 'iv']
values = ["Hello", "World", "ML", "Python"]

my_series = pd.Series(data=values, index=indexes)

print(type(my_series))
print(my_series)
print()

<class 'pandas.core.series.Series'>
i       Hello
ii      World
iii        ML
iv     Python
dtype: object



In [None]:
# A DataFrame holds several labelled columns, where a Series holds just one.
#   axis=0 -> refers to the rows (the index)
#   axis=1 -> refers to the columns

# Build a DataFrame from a dictionary of lists: each key names a column and
# each list provides that column's values.
data = {
    'Name': ['Krish', 'John', 'Jack'],            # first column
    'Age': [25, 30, 45],                          # second column
    'City': ['Banglore', 'New York', 'Florida'],  # third column
}
my_dataframe = pd.DataFrame(data=data)

print(type(my_dataframe))
print(my_dataframe)

<class 'pandas.core.frame.DataFrame'>
    Name  Age      City
0  Krish   25  Banglore
1   John   30  New York
2   Jack   45   Florida


In [44]:
# Add a column by assigning a list with one value per row.
my_dataframe['Salary'] = [2000, 3000, 4000]
print(my_dataframe, end="\n\n")

# drop() removes rows or columns by label.
#   columns=... is the keyword spelling of axis=1 (drop a column)
#   index=...   is the keyword spelling of axis=0 (drop a row)
# inplace=True mutates my_dataframe itself instead of returning a new frame.
my_dataframe.drop(columns='Salary', inplace=True)   # remove the column again
print(my_dataframe, end="\n\n")

my_dataframe.drop(index=0, inplace=True)            # remove the row labelled 0
print(my_dataframe)

    Name  Age      City  Salary
0  Krish   25  Banglore    2000
1   John   30  New York    3000
2   Jack   45   Florida    4000

    Name  Age      City
0  Krish   25  Banglore
1   John   30  New York
2   Jack   45   Florida

   Name  Age      City
1  John   30  New York
2  Jack   45   Florida


In [None]:
# Rebuild the row index as 0..n-1. drop=True discards the old index entirely
# instead of keeping it as a new column.
my_dataframe.reset_index(inplace=True, drop=True)
print(my_dataframe)

# Note: without inplace=True, drop() leaves the original DataFrame untouched
#       and only returns a modified copy.
# Note: the same holds for other such operations (drop, rename, sort_values, ...):
#       they return a modified DataFrame instead of modifying the original.

   Name  Age      City
0  John   30  New York
1  Jack   45   Florida


In [None]:
# Append a record by assigning a complete row to a .loc label that does not
# exist yet; pandas creates the row on assignment.
new_row = {
    'Name': 'Guest',
    'Age': 35,
    'City': 'Unknown',
}

# The row count is used as the new label -- this assumes the index is the
# default 0..n-1, so that label is still unused.
next_label = my_dataframe.shape[0]
my_dataframe.loc[next_label] = new_row
print(my_dataframe)

    Name  Age      City
0   John   30  New York
1   Jack   45   Florida
2  Guest   35   Unknown


In [47]:
# Arithmetic on a column is applied element-wise: every Age grows by one.
my_dataframe['Age'] = my_dataframe['Age'].add(1)
print(my_dataframe)

    Name  Age      City
0   John   31  New York
1   Jack   46   Florida
2  Guest   36   Unknown


In [None]:
# Build a DataFrame from a list of dictionaries: each dictionary is one
# record (row) and its keys name the columns.
data = [
    {'Name': 'Krish', 'Age': 30, 'City': 'Banglore'},   # first record
    {'Name': 'John', 'Age': 30, 'City': 'New York'},    # second record
    {'Name': 'Jack', 'Age': 45, 'City': 'Florida'},     # third record
]
my_df = pd.DataFrame(data=data)

print(type(my_df))
print(my_df)

<class 'pandas.core.frame.DataFrame'>
    Name  Age      City
0  Krish   30  Banglore
1   John   30  New York
2   Jack   45   Florida


In [3]:
# Load a CSV file into a DataFrame with read_csv().
# General form: df = pd.read_csv('file_path')
df = pd.read_csv(filepath_or_buffer='sales_data.csv')

In [77]:
# head(k) returns the first k rows of the DataFrame.
df.head(n=5)

Unnamed: 0,CustomerID,Product,PurchaseDate,Quantity,UnitPrice,CustomerName,ProductCategory,PaymentMethod,ReviewRating,TotalPrice
0,C5361,Phone,2024-03-05,8,618.83,Customer C5361,Office Supplies,Cash,1,4950.64
1,C6231,Laptop,2025-06-21,7,366.22,Customer C6231,Electronics,Debit Card,3,2563.54
2,C7704,Chair,2023-06-25,5,634.51,Customer C7704,Office Supplies,Credit Card,4,3172.55
3,C2923,Printer,2023-09-30,3,508.63,Customer C2923,Office Supplies,Gift Card,1,1525.89
4,C4847,Monitor,2023-04-03,4,452.06,Customer C4847,Electronics,Credit Card,2,1808.24


In [3]:
# tail(k) returns the last k rows of the DataFrame.
df.tail(n=5)

Unnamed: 0,CustomerID,Product,PurchaseDate,Quantity,UnitPrice,CustomerName,ProductCategory,PaymentMethod,ReviewRating,TotalPrice
1795,C8238,Chair,2023-05-15,3,13.08,Customer C8238,Furniture,Online,4,39.24
1796,C8831,Monitor,2023-02-06,4,323.03,Customer C8831,Electronics,Debit Card,2,1292.12
1797,C6514,Phone,2024-08-08,5,129.86,Customer C6514,Electronics,Cash,3,649.3
1798,C3334,Printer,2024-02-03,3,493.15,Customer C3334,Furniture,Credit Card,5,1479.45
1799,C9593,Printer,2025-05-11,7,390.86,Customer C9593,Office Supplies,Online,4,2736.02


In [14]:
# Select a single column with the [] operator. The result is a Series that
# keeps the DataFrame's row index and is named after the column.
df['PaymentMethod']

0              Cash
1        Debit Card
2       Credit Card
3         Gift Card
4       Credit Card
           ...     
1795         Online
1796     Debit Card
1797           Cash
1798    Credit Card
1799         Online
Name: PaymentMethod, Length: 1800, dtype: object

In [4]:
# List the distinct values of a column. unique() returns them as a NumPy
# array, in order of first appearance.
df['PaymentMethod'].unique()

array(['Cash', 'Debit Card', 'Credit Card', 'Gift Card', 'Online'],
      dtype=object)

In [None]:
# Accessing / modifying entries
# [Label-based indexing]    [Positional indexing]
# .loc[row, label]          .iloc[i, j]     --> access/modify multiple entries (slower)
# .at[row, label]           .iat[i, j]      --> access/modify a single entry (faster)

# Assigning through any of these accessors changes the DataFrame itself

In [None]:
# .loc[rows, columns] reads or assigns entries by *label*, i.e. using the
# DataFrame's own index and column names, addressed like a 2-D matrix.
# It can touch many values at once, but is slower than scalar access.

print(df.loc[0], end="\n\n")    # every column of the row labelled 0

# All rows of the "PaymentMethod" column, reduced to its distinct values.
print(df.loc[:, "PaymentMethod"].unique(), end="\n\n")

# Label slices include both endpoints, so 2:5 yields rows 2, 3, 4 and 5.
wanted_columns = ['PaymentMethod', 'TotalPrice']
print(df.loc[2:5, wanted_columns], end="\n\n")

# .iloc[i, j] reads or assigns entries by integer *position*, like plain
# matrix indexing, regardless of what the index labels are.
print(df.iloc[:5, :2])          # first five rows, first two columns

CustomerID                   C5361
Product                      Phone
PurchaseDate            2024-03-05
Quantity                         8
UnitPrice                   618.83
CustomerName        Customer C5361
ProductCategory    Office Supplies
PaymentMethod                 Cash
ReviewRating                     1
TotalPrice                 4950.64
Name: 0, dtype: object

['Cash' 'Debit Card' 'Credit Card' 'Gift Card' 'Online']

  PaymentMethod  TotalPrice
2   Credit Card     3172.55
3     Gift Card     1525.89
4   Credit Card     1808.24
5     Gift Card      629.37

  CustomerID  Product
0      C5361    Phone
1      C6231   Laptop
2      C7704    Chair
3      C2923  Printer
4      C4847  Monitor


In [None]:
# .at[row_label, 'Column_Name'] reads or assigns exactly one cell by label.
# Scalar access is faster, but only one value can be handled per call.
payment_at_150 = df.at[150, 'PaymentMethod']
print(payment_at_150)

# .iat[i, j] is the positional counterpart, using 2-D matrix style indices.
cell_2_0 = df.iat[2, 0]
print(cell_2_0)

Online
C7704


In [None]:
# Column-wise statistical summary via describe(): count, mean, std, min,
# the 25%/50%/75% quartiles and max. Called with no arguments on this frame,
# it reports only the numeric columns.
print(df.describe())

          Quantity    UnitPrice  ReviewRating   TotalPrice
count  1800.000000  1800.000000   1800.000000  1800.000000
mean      4.588889   395.729417      2.995000  1814.809333
std       2.313383   229.355902      1.425753  1473.605942
min       1.000000     5.180000      1.000000     6.060000
25%       3.000000   199.512500      2.000000   585.320000
50%       5.000000   391.970000      3.000000  1440.740000
75%       7.000000   587.707500      4.000000  2795.300000
max       8.000000   799.910000      5.000000  6389.120000
