NumPy is a core Python library for numerical computing, built for handling large arrays and matrices efficiently.

  1. ndarray object – Stores homogeneous data in n-dimensional arrays for fast processing.
  2. Vectorized operations – Perform element-wise calculations without explicit loops.
  3. Broadcasting – Apply operations across arrays of different shapes.
  4. Linear algebra functions – Matrix multiplication, inversion, eigenvalues, etc.
  5. Statistical tools – Mean, median, standard deviation, and more.
  6. Fourier transforms – Fast computation for signal and image processing.
  7. Integration with other libraries – Works seamlessly with Pandas, SciPy, and scikit-learn.

- NumPy arrays are homogeneous, meaning all elements must be the same type, allowing efficient computation.
- Vectorized operations in NumPy can be 10 to 100 times faster than equivalent Python loops.

What is NumPy Used for?
  1. Creating and manipulating arrays.
  2. Performing element-wise and matrix operations.
  3. Generating random numbers and statistical calculations.
  4. Conducting linear algebra operations.
  5. Working with Fourier transformations.
  6. Handling missing values efficiently in datasets.

Why Learn NumPy?
  1. NumPy speeds up math operations like addition and multiplication on large groups of numbers compared to regular Python.
  2. It’s good for handling large lists of numbers (arrays), so you don’t have to write complicated loops.
  3. It gives ready-to-use functions for statistics, algebra and random numbers.
  4. Libraries like Pandas, SciPy, TensorFlow and many others are built on top of NumPy.
  5. NumPy uses less memory and stores data more efficiently, which matters when working with lots of data.

In [None]:
import numpy as np


# Wrap a plain Python list in a one-dimensional ndarray and display it.
arr = np.array([1, 2, 3, 4, 5])
print(arr)

[1 2 3 4 5]


In [None]:
# Whatever np.array() is given, the object it returns is a numpy.ndarray.
arr = np.array([1, 2, 6, 5, 8])
print(type(arr))

<class 'numpy.ndarray'>


In [None]:
# np.zeros(shape): array of the given shape pre-filled with 0.0 (float by default).
default_val = np.zeros(shape=3)
print(default_val)

[0. 0. 0.]


In [None]:
# np.ones(shape): same idea as zeros, but every entry starts at 1.0.
# A tuple shape (rows, cols) produces a 2-D array.
ones_val = np.ones(shape=(2, 3))
print(ones_val)

[[1. 1. 1.]
 [1. 1. 1.]]


In [None]:
# np.full(shape, fill_value): array of the given shape where every entry
# is fill_value; the dtype follows the fill value (integer here).
filled_Arr = np.full(shape=(2, 3), fill_value=7)
print(filled_Arr)

[[7 7 7]
 [7 7 7]]


In [None]:
# Number sequences with np.arange(start, stop, step):
# values begin at start, advance by step, and stop *before* reaching stop.
arr = np.arange(1, 10, 2)
print(arr)

[1 3 5 7 9]


In [None]:
# Identity matrix via np.eye(N): an N x N float array with ones on the
# main diagonal and zeros everywhere else.
identity_matricss = np.eye(N=4)
print(identity_matricss)


[[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]


In [None]:
# .shape reports the length of every axis as a tuple: (rows, columns) for 2-D.
shaped_arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(shaped_arr.shape)

(2, 3)


In [None]:
# .size is the total number of elements, i.e. the product of the shape.
sized_Arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(sized_Arr.size)


6


In [None]:
# .ndim gives the number of axes of an array.
a = np.array(42)                          # scalar        -> 0 axes
b = np.array([1, 2, 3, 4, 5])             # vector        -> 1 axis
c = np.array([[1, 2, 3], [4, 5, 6]])      # matrix        -> 2 axes
d = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[1, 2, 3], [4, 5, 6]],
])                                        # stack of matrices -> 3 axes

for sample in (a, b, c, d):
    print(sample.ndim)

0
1
2
3


In [None]:
# An ndarray holds a single dtype: one float among the inputs (30.5)
# promotes every element to float64.
dtype_check = np.array([10, 20, 30.5, 40, 5, 0])
print(dtype_check.dtype)

float64


In [None]:
# Converting one dtype into another with astype(); float -> int drops the
# fractional part (truncation, not rounding), and returns a new array.
changed_arr = np.array([1.2, 5.5, 2.6, 8.7])
int_arr = changed_arr.astype(int)

print(int_arr)
print(int_arr.dtype)

[1 5 2 8]
int64


Math Operations

In [None]:
# Arithmetic between an array and a scalar is applied element by element.
arr = np.array([10, 20, 30, 40, 5])
print(arr + 5)
print(arr - 5)
print(arr * 5)
print(arr / 5)   # true division: the result is always float
print(arr // 5)  # floor division: the result keeps the integer dtype


[15 25 35 45 10]
[ 5 15 25 35  0]
[ 50 100 150 200  25]
[2. 4. 6. 8. 1.]
[2 4 6 8 1]


Aggregation Function

In [None]:
# Aggregations collapse the whole array into a single number.
arr = np.array([10, 20, 30, 40, 5])

# Order: total, average, largest, smallest, standard deviation, variance.
for statistic in (arr.sum, arr.mean, arr.max, arr.min, arr.std, arr.var):
    print(statistic())

105
21.0
40
5
12.806248474865697
164.0


Indexing and Slicing


In [None]:
# Slice syntax is [start:end] with end excluded; an optional third field
# adds a step: [start:end:step].
arr = np.array([1, 2, 3, 4, 5, 6, 7])

# Elements at positions 1, 2, 3 and 4.
print(arr[1:5])

[2 3 4 5]


In [None]:
# Omitting start means "from the beginning"; -1 as end stops before the
# last element, so this is everything except the final value.
arr = np.array([1, 2, 3, 4, 5, 6, 7])
print(arr[:-1])

[1 2 3 4 5 6]


In [None]:
# First four elements: positions 0 through 3.
arr = np.array([1, 2, 3, 4, 5, 6, 7])
print(arr[:4])

[1 2 3 4]


In [None]:
# Negative indices count from the end of the array.
arr = np.array([1, 2, 3, 4, 5, 6, 7])

print(arr[-3:-1])  # third-from-last up to (not including) the last
print(arr[::2])    # every second element
print(arr[::-1])   # step of -1 walks the array backwards (reversed)

[5 6]
[1 3 5 7]
[7 6 5 4 3 2 1]


In [None]:
# Fancy indexing: pass a list of positions to pick several elements at once.
arr = np.array([11, 23, 6, 48, 6, 5, 58])

positions = [0, 3, 5]
print(arr[positions])

[11 48  5]


In [None]:
# Filtering: a comparison yields a boolean mask, and indexing with that
# mask keeps only the elements where it is True.
arr = np.array([11, 23, 6, 48, 6, 5, 58])

mask = arr > 25
print(arr[mask])

[48 58]


# Reshape and Manipulating


In [None]:
# reshape(rows, cols): same data laid out as rows x cols.
# rows * cols must equal the number of elements (6 here).
arr = np.array([11, 23, 6, 48, 6, 5])

reshed_Arr = arr.reshape(2, 3)

print(reshed_Arr)
print(arr)  # the original array keeps its 1-D shape

[[11 23  6]
 [48  6  5]]
[11 23  6 48  6  5]


In [None]:
# Both calls collapse a multi-dimensional array into one dimension:
#   .ravel()   -> returns a view of the original data whenever possible,
#                 so writing to the result can change the original array.
#   .flatten() -> always returns a fresh copy; the original stays untouched.
arr_2d = np.array([
    [1, 3, 5],
    [6, 7, 8],
])

print(arr_2d.ravel())
print(arr_2d.flatten())

[1 3 5 6 7 8]
[1 3 5 6 7 8]


Manipulating

In [None]:
# insert
# np.insert(arr, obj, values, axis=None)
#   obj    -> index before which the value(s) are inserted
#   values -> what to insert
# Returns a new array; the input array is not modified.
arr = np.array([11, 23, 6, 48, 6, 5])

inserted_arr = np.insert(arr, 3, 99)

print(inserted_arr)



[11 23  6 99 48  6  5]


In [None]:
# append
# np.append(arr, values, axis=None)
# With axis=None the inputs are flattened and values are added at the end.
# Returns a new array; the input array is not modified.
arr = np.array([11, 23, 6, 48, 6, 5])

appended_arr = np.append(arr, 6)

print(appended_arr)

[11 23  6 48  6  5  6]


In [None]:

# Appending along axis=0 adds a new row; the appended values must have the
# same number of dimensions as the target (note the double brackets).
arr_2d = np.array([
    [1, 2],
    [3, 4],
])

new_2d = np.append(arr_2d, [[5, 6]], axis=0)
print(new_2d)


[[1 2]
 [3 4]
 [5 6]]


In [None]:
# delete
# np.delete(arr, obj, axis=None)
# Removes sub-arrays (elements, rows, columns) along a specified axis.
#   obj -> index (or indices) of the entries to remove
# Returns a new array; the input array is not modified.
arr = np.array([11, 23, 6, 48, 6, 5])

deletd_arr = np.delete(arr, 3)

print(deletd_arr)

[11 23  6  6  5]


In [None]:
# concatenation
# np.concatenate((arr1, arr2), axis=0) joins arrays along an existing axis.
#   axis=0 -> for 2-D inputs, stacks rows (vertical)
#   axis=1 -> for 2-D inputs, stacks columns (horizontal)
# 1-D arrays only have axis 0, so axis=1 is not valid for the arrays below.
arr1 = np.array([11, 23, 6, 48, 6, 5])
arr2 = np.array([11, 23, 6, 48, 6, 5])

# np.concat is only an alias added in NumPy 2.0; np.concatenate works on
# every NumPy version.
print(np.concatenate((arr1, arr2), axis=0))



[11 23  6 48  6  5 11 23  6 48  6  5]


Stacking

In [None]:
"""
vertically
horizontally

vstact() row wise
hstack() column wise

"""

arr1 = np.array([11,23,6,48,6,5])
arr2 = np.array([11,23,6,48,6,5])

print(np.vstack((arr1,arr2)))

print(np.hstack((arr1,arr2)))


[[11 23  6 48  6  5]
 [11 23  6 48  6  5]]
[11 23  6 48  6  5 11 23  6 48  6  5]


Splitting

In [None]:
"""
np.split() === equal splite

np.hspit() - horizontal
np.vspit() - vertical
"""
arr1 = np.array([11,23,6,48,6,5])
print(np.split(arr1,2))


[array([11, 23,  6]), array([48,  6,  5])]


#  Broadcasting and Vectorization

In [None]:
# Broadcasting: the scalar discount is applied to every price, no loop needed.
prices = np.array([100, 200, 300])
discount = 10  # percent

final_prices = prices - (prices * discount / 100)
print(final_prices)

[ 90. 180. 270.]


In [None]:
# List comprehension
# General form: [expression for item in iterable if condition]
list1 = [1, 2, 3]
list2 = [4, 5, 6]

# zip() pairs the elements by position; each pair is then added.
result = [x + y for x, y in zip(list1, list2)]
print(result)

[5, 7, 9]


In [None]:
# Vectorized version of the same sum: "+" on two equal-length arrays adds
# them element by element.
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])

result = arr1 + arr2
print(result)

[5 7 9]


Missing Value

In [None]:
# np.nan marks a missing entry inside a float array.
arr = np.array([1.0, 2.0, np.nan, 4.0, 5.0])

# Show the raw data.
print("Original array:", arr)

# np.isnan() returns a boolean mask: True exactly where a value is missing.
nan_mask = np.isnan(arr)
print("Is NaN:", nan_mask)

Original array: [ 1.  2. nan  4.  5.]
Is NaN: [False False  True False False]


In [None]:
# Option 1: fill the gaps with the mean of the values that are present.
mean_value = np.nanmean(arr)  # nanmean skips NaN when averaging
arr_filled = np.nan_to_num(arr, nan=mean_value)
print("After filling NaN with mean:", arr_filled)

# Option 2: drop the missing entries by keeping only the non-NaN positions.
is_missing = np.isnan(arr)
arr_cleaned = arr[~is_missing]
print("After removing NaN:", arr_cleaned)

After filling NaN with mean: [1. 2. 3. 4. 5.]
After removing NaN: [1. 2. 4. 5.]


# Pandas

- Pandas (the name comes from "panel data" and is also a play on "Python data analysis") is an open-source software library designed for data manipulation and analysis.

  - Revolves around two primary Data structures: Series (1D) and DataFrame (2D)
  - Built on top of NumPy, efficiently manages large datasets, offering tools for data cleaning, transformation, and analysis.
  - Tools for working with time series data, including date range generation and frequency conversion. For example, we can convert date or time columns into pandas’ datetime type using pd.to_datetime(), or specify parse_dates=True during CSV loading.
  - Seamlessly integrates with other Python libraries like NumPy, Matplotlib, and scikit-learn.
  - Provides methods like .dropna() and .fillna() to handle missing values seamlessly

  DataFrame: a two-dimensional data structure made up of rows and columns, much like an Excel spreadsheet.

 What is Pandas Used for?

    1. Reading and writing data from various file formats like CSV, Excel and SQL databases.
    2. Cleaning and preparing data by handling missing values and filtering entries.
    3. Merging and joining multiple datasets seamlessly.
    4. Reshaping data through pivoting and stacking operations.
    5. Conducting statistical analysis and generating descriptive statistics.
    6. Visualizing data with integrated plotting capabilities.

Why Learn Pandas

  1. It offers a simple and intuitive way to work with structured data, especially using DataFrames.
  2. Makes data exploration easy, so you can quickly understand patterns or spot issues.
  3. Saves time by reducing the need for complex code.
  4. It's widely used in industries like finance, healthcare, marketing and research.
  5. A must-have skill for data science, analytics and machine learning roles.

In [None]:
import pandas as pd

In [None]:
# Column-oriented input: each key becomes a column, each list its values.
data = {
    "Name": ["Raj", "Shayam", "Baburam"],
    "Age": [30, 40, 50],
    "City": ["Delhi", "Mumbai", "Punjab"],
}

# Build the DataFrame, then write it out as CSV; index=False leaves the
# row index out of the file.
df = pd.DataFrame(data)
df.to_csv("output.csv", index=False)

In [None]:
# Load the customers CSV, then preview its first five rows with head().
df = pd.read_csv("/content/customers-100.csv")
df.head(n=5)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
3,4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
4,5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/


In [None]:
# tail(): the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
95,96,cb8E23e48d22Eae,Karl,Greer,Carey LLC,East Richard,Guyana,(188)169-1674x58692,001-841-293-3519x614,hhart@jensen.com,2022-01-30,http://hayes-perez.com/
96,97,CeD220bdAaCfaDf,Lynn,Atkinson,"Ware, Burns and Oneal",New Bradview,Sri Lanka,+1-846-706-2218,605.413.3198,vkemp@ferrell.com,2021-07-10,https://novak-allison.com/
97,98,28CDbC0dFe4b1Db,Fred,Guerra,Schmitt-Jones,Ortegaland,Solomon Islands,+1-753-067-8419x7170,+1-632-666-7507x92121,swagner@kane.org,2021-09-18,https://www.ross.com/
98,99,c23d1D9EE8DEB0A,Yvonne,Farmer,Fitzgerald-Harrell,Lake Elijahview,Aruba,(530)311-9786,001-869-452-0943x12424,mccarthystephen@horn-green.biz,2021-08-11,http://watkins.info/
99,100,2354a0E336A91A1,Clarence,Haynes,"Le, Nash and Cross",Judymouth,Honduras,(753)813-6941,783.639.1472,colleen91@faulkner.biz,2020-03-11,http://www.hatfield-saunders.net/


In [None]:
# sample(): one row picked at random (a different row on each run).
df.sample(n=1)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
33,34,A09AEc6E3bF70eE,Kaitlyn,Santana,Herrera Group,New Kaitlyn,United States of America,6303643286,447-710-6202x07313,georgeross@miles.org,2021-09-21,http://pham.com/


In [None]:
# info(): prints a concise summary of the frame — index range, each column's
# non-null count and dtype, and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Index              100 non-null    int64 
 1   Customer Id        100 non-null    object
 2   First Name         100 non-null    object
 3   Last Name          100 non-null    object
 4   Company            100 non-null    object
 5   City               100 non-null    object
 6   Country            100 non-null    object
 7   Phone 1            100 non-null    object
 8   Phone 2            100 non-null    object
 9   Email              100 non-null    object
 10  Subscription Date  100 non-null    object
 11  Website            100 non-null    object
dtypes: int64(1), object(11)
memory usage: 9.5+ KB


In [None]:
# describe(): summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
df.describe()

Unnamed: 0,Index
count,100.0
mean,50.5
std,29.011492
min,1.0
25%,25.75
50%,50.5
75%,75.25
max,100.0


In [None]:
# .shape is a (rows, columns) tuple.
rows_cols = df.shape
print(f'Shape {rows_cols}')

Shape (100, 12)


In [None]:
# .columns is the Index object holding every column label.
column_index = df.columns
print(f'Column Name: {column_index}')

Column Name: Index(['Index', 'Customer Id', 'First Name', 'Last Name', 'Company', 'City',
       'Country', 'Phone 1', 'Phone 2', 'Email', 'Subscription Date',
       'Website'],
      dtype='object')


In [None]:
# Topics covered in the next cells:
#   1 - select specific columns
#   2 - filter rows
#   3 - combine multiple conditions

# Passing a list of labels returns a DataFrame with just those columns.
name_columns = ['First Name', 'Last Name']
print(df[name_columns].head(5))

  First Name Last Name
0     Sheryl    Baxter
1    Preston    Lozano
2        Roy     Berry
3      Linda     Olsen
4     Joanna    Bender


In [None]:
# A single label (not a list) returns the column as a Series.
customer_ids = df['Customer Id']
print(customer_ids)

0     DD37Cf93aecA6Dc
1     1Ef7b82A4CAAD10
2     6F94879bDAfE5a6
3     5Cef8BFA16c5e3c
4     053d585Ab6b3159
           ...       
95    cb8E23e48d22Eae
96    CeD220bdAaCfaDf
97    28CDbC0dFe4b1Db
98    c23d1D9EE8DEB0A
99    2354a0E336A91A1
Name: Customer Id, Length: 100, dtype: object


In [None]:
# Small employee table used by the filtering and update examples.
data = {
    "Name": ['Ram', 'Shyam', 'Ghanshyam', 'Dhanshyam', 'Aditi', 'Jagdish', 'Raj', 'Simran'],
    "Age": [28, 34, 22, 30, 29, 40, 25, 32],
    "Salary": [50000, 60000, 45000, 52000, 49000, 70000, 48000, 58000],
    "Performance Score": [185, 98, 78, 92, 88, 95, 80, 891],
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28,50000,185
1,Shyam,34,60000,98
2,Ghanshyam,22,45000,78
3,Dhanshyam,30,52000,92
4,Aditi,29,49000,88
5,Jagdish,40,70000,95
6,Raj,25,48000,80
7,Simran,32,58000,891


In [None]:
# Row filter: keep only the rows where the boolean condition is True.
earns_above_55k = df['Salary'] > 55000
high_salary = df[earns_above_55k]
high_salary

Unnamed: 0,Name,Age,Salary,Performance Score
1,Shyam,34,60000,98
5,Jagdish,40,70000,95
7,Simran,32,58000,891


In [None]:
# Multiple conditions combined with AND: age > 30 and salary > 55000.
# Use "&" (not "and"), and wrap each comparison in parentheses.
older_than_30 = df['Age'] > 30
salary_above_55k = df['Salary'] > 55000

multiple_condition = df[older_than_30 & salary_above_55k]
multiple_condition


Unnamed: 0,Name,Age,Salary,Performance Score
1,Shyam,34,60000,98
5,Jagdish,40,70000,95
7,Simran,32,58000,891


In [None]:
# OR filter: "|" keeps a row when at least one of the conditions holds.
older_than_30 = df['Age'] > 30
score_above_90 = df['Performance Score'] > 90

filtered_or = df[older_than_30 | score_above_90]
filtered_or

Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28,50000,185
1,Shyam,34,60000,98
3,Dhanshyam,30,52000,92
5,Jagdish,40,70000,95
7,Simran,32,58000,891


# Advanced

In [None]:
# Adding a column: assigning to a new label creates it.
# Bonus is 10% of each employee's salary, computed for all rows at once.
bonus = df['Salary'] * 0.1
df['Bonus'] = bonus
df

Unnamed: 0,Name,Age,Salary,Performance Score,Bonus
0,Ram,28,50000,185,5000.0
1,Shyam,34,60000,98,6000.0
2,Ghanshyam,22,45000,78,4500.0
3,Dhanshyam,30,52000,92,5200.0
4,Aditi,29,49000,88,4900.0
5,Jagdish,40,70000,95,7000.0
6,Raj,25,48000,80,4800.0
7,Simran,32,58000,891,5800.0


In [None]:
# Alternative: insert() places the new column at a chosen position
# (loc=0 makes it the first column) and modifies df in place.
df.insert(loc=0, column='Employee Id', value=list(range(10, 90, 10)))
df

Unnamed: 0,Employee Id,Name,Age,Salary,Performance Score,Bonus
0,10,Ram,28,50000,185,5000.0
1,20,Shyam,34,60000,98,6000.0
2,30,Ghanshyam,22,45000,78,4500.0
3,40,Dhanshyam,30,52000,92,5200.0
4,50,Aditi,29,49000,88,4900.0
5,60,Jagdish,40,70000,95,7000.0
6,70,Raj,25,48000,80,4800.0
7,80,Simran,32,58000,891,5800.0


In [None]:
# Update a single cell: .loc[row_label, column_label] addresses one value.
# NOTE(review): a "(0, Salary)" column in recorded output would come from
# writing df[0, 'Salary'] = ... instead, which adds a new column keyed by
# that tuple — re-run the notebook to refresh stale output.
df.loc[0, 'Salary'] = 1_000_000
df


Unnamed: 0,Employee Id,Name,Age,Salary,Performance Score,Bonus,"(0, Salary)"
0,10,Ram,28,1000000,185,5000.0,1000000
1,20,Shyam,34,60000,98,6000.0,1000000
2,30,Ghanshyam,22,45000,78,4500.0,1000000
3,40,Dhanshyam,30,52000,92,5200.0,1000000
4,50,Aditi,29,49000,88,4900.0,1000000
5,60,Jagdish,40,70000,95,7000.0,1000000
6,70,Raj,25,48000,80,4800.0,1000000
7,80,Simran,32,58000,891,5800.0,1000000


In [None]:
# Update a whole column (every row at once): give everyone a 5% raise.
# Multiplying by a float turns the column into floats.
raised_salary = df['Salary'] * 1.05
df['Salary'] = raised_salary
df


Unnamed: 0,Employee Id,Name,Age,Salary,Performance Score,Bonus,"(0, Salary)"
0,10,Ram,28,1050000.0,185,5000.0,1000000
1,20,Shyam,34,63000.0,98,6000.0,1000000
2,30,Ghanshyam,22,47250.0,78,4500.0,1000000
3,40,Dhanshyam,30,54600.0,92,5200.0,1000000
4,50,Aditi,29,51450.0,88,4900.0,1000000
5,60,Jagdish,40,73500.0,95,7000.0,1000000
6,70,Raj,25,50400.0,80,4800.0,1000000
7,80,Simran,32,60900.0,891,5800.0,1000000


In [None]:
# Removing columns: drop(columns=[...]); inplace=True changes df itself
# instead of returning a modified copy.
columns_to_remove = ['Performance Score']
df.drop(columns=columns_to_remove, inplace=True)
df


Unnamed: 0,Employee Id,Name,Age,Salary,Bonus,"(0, Salary)"
0,10,Ram,28,1050000.0,5000.0,1000000
1,20,Shyam,34,63000.0,6000.0,1000000
2,30,Ghanshyam,22,47250.0,4500.0,1000000
3,40,Dhanshyam,30,54600.0,5200.0,1000000
4,50,Aditi,29,51450.0,4900.0,1000000
5,60,Jagdish,40,73500.0,7000.0,1000000
6,70,Raj,25,50400.0,4800.0,1000000
7,80,Simran,32,60900.0,5800.0,1000000


# **Missing Values**

In [None]:
# Same employee table, but row 1 is entirely missing (None in every column).
# The numeric columns become float because the gap is stored as NaN.
data = {
    "Name": ['Ram', None, 'Ghanshyam', 'Dhanshyam', 'Aditi', 'Jagdish', 'Raj', 'Simran'],
    "Age": [28, None, 22, 30, 29, 40, 25, 32],
    "Salary": [50000, None, 45000, 52000, 49000, 70000, 48000, 58000],
    "Performance Score": [185, None, 78, 92, 88, 95, 80, 891],
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28.0,50000.0,185.0
1,,,,
2,Ghanshyam,22.0,45000.0,78.0
3,Dhanshyam,30.0,52000.0,92.0
4,Aditi,29.0,49000.0,88.0
5,Jagdish,40.0,70000.0,95.0
6,Raj,25.0,48000.0,80.0
7,Simran,32.0,58000.0,891.0


In [None]:
# isnull(): element-wise mask of the frame, True wherever a value is missing.
df.isnull()

Unnamed: 0,Name,Age,Salary,Performance Score
0,False,False,False,False
1,True,True,True,True
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False


In [None]:
# isnull().sum(): number of missing values in each column.
df.isnull().sum()

Unnamed: 0,0
Name,1
Age,1
Salary,1
Performance Score,1


In [None]:
# dropna() deletes rows or columns that contain missing values:
#   axis=0 -> drop rows (the default)
#   axis=1 -> drop columns
# inplace=True modifies df directly.
df.dropna(axis=0, inplace=True)
df

Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28.0,50000.0,185.0
2,Ghanshyam,22.0,45000.0,78.0
3,Dhanshyam,30.0,52000.0,92.0
4,Aditi,29.0,49000.0,88.0
5,Jagdish,40.0,70000.0,95.0
6,Raj,25.0,48000.0,80.0
7,Simran,32.0,58000.0,891.0


In [None]:
# Rebuild the table with its missing row so the gaps can be filled with a
# default value in the next cell.
data = {
    "Name": ['Ram', None, 'Ghanshyam', 'Dhanshyam', 'Aditi', 'Jagdish', 'Raj', 'Simran'],
    "Age": [28, None, 22, 30, 29, 40, 25, 32],
    "Salary": [50000, None, 45000, 52000, 49000, 70000, 48000, 58000],
    "Performance Score": [185, None, 78, 92, 88, 95, 80, 891],
}

df = pd.DataFrame(data)
df


Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28.0,50000.0,185.0
1,,,,
2,Ghanshyam,22.0,45000.0,78.0
3,Dhanshyam,30.0,52000.0,92.0
4,Aditi,29.0,49000.0,88.0
5,Jagdish,40.0,70000.0,95.0
6,Raj,25.0,48000.0,80.0
7,Simran,32.0,58000.0,891.0


In [None]:
# Replace every missing value in the frame with 0, modifying df in place.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28.0,50000.0,185.0
1,0,0.0,0.0,0.0
2,Ghanshyam,22.0,45000.0,78.0
3,Dhanshyam,30.0,52000.0,92.0
4,Aditi,29.0,49000.0,88.0
5,Jagdish,40.0,70000.0,95.0
6,Raj,25.0,48000.0,80.0
7,Simran,32.0,58000.0,891.0


In [None]:
# Fill missing ages with the column mean.
# Assign the result back to the column: calling fillna(..., inplace=True)
# on df['Age'] acts on an intermediate object and is deprecated in recent
# pandas versions.
# If an earlier cell already replaced NaN with 0, this leaves df unchanged.
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [None]:
# interpolate() estimates missing numeric values from their neighbours.
# Here only the numeric fields of row 1 are missing; the name is present.
data = {
    "Name": ['Ram', 'Shayam', 'Ghanshyam', 'Dhanshyam', 'Aditi', 'Jagdish', 'Raj', 'Simran'],
    "Age": [28, None, 22, 30, 29, 40, 25, 32],
    "Salary": [50000, None, 45000, 52000, 49000, 70000, 48000, 58000],
    "Performance Score": [185, None, 78, 92, 88, 95, 80, 891],
}

df = pd.DataFrame(data)
df
# Left disabled: df.interpolate(method="linear", axis=0, inplace=True)


Unnamed: 0,Name,Age,Salary,Performance Score
0,Ram,28.0,50000.0,185.0
1,Shayam,,,
2,Ghanshyam,22.0,45000.0,78.0
3,Dhanshyam,30.0,52000.0,92.0
4,Aditi,29.0,49000.0,88.0
5,Jagdish,40.0,70000.0,95.0
6,Raj,25.0,48000.0,80.0
7,Simran,32.0,58000.0,891.0


In [None]:
# Linear interpolation: each gap is filled with the value lying on the
# straight line between its two known neighbours (10 -> 20 -> 30 -> ...).
data = {
    "Time": [1, 2, 3, 4, 5],
    "Value": [10, None, 30, None, 50],
}

df = pd.DataFrame(data)
filled_values = df['Value'].interpolate(method="linear")
df['Value'] = filled_values
df

Unnamed: 0,Time,Value
0,1,10.0
1,2,20.0
2,3,30.0
3,4,40.0
4,5,50.0


#  sorting and aggregation

In [None]:
# Sorting
# df.sort_values(by="column_name", ascending=True/False, inplace=True)
data = {
    "Name": ["Arun", "Varun", "Krun"],
    "Age": [80, 28, 98],
    "Salary": [10000, 20000, 30000],
}

df = pd.DataFrame(data)

# Oldest first; rows keep their original index labels after sorting.
df.sort_values(by="Age", ascending=False, inplace=True)
df

Unnamed: 0,Name,Age,Salary
2,Krun,98,30000
0,Arun,80,10000
1,Varun,28,20000


In [None]:
# Sorting on several columns: Age ascending first, then Salary descending
# to break ties. The two lists are matched position by position.
sort_keys = ["Age", "Salary"]
sort_ascending = [True, False]
df.sort_values(by=sort_keys, ascending=sort_ascending, inplace=True)
df

Unnamed: 0,Name,Age,Salary
1,Varun,28,20000
0,Arun,80,10000
2,Krun,98,30000


In [None]:
# Aggregation -> summary statistics: average of the Salary column.
df['Salary'].mean()

np.float64(20000.0)

# Grouping

In [None]:
# Two people share Age 28, so grouping by Age merges their rows.
data = {
    "Name": ["Arun", "Varun", "Krun", "Trun", "Barun"],
    "Age": [80, 28, 98, 38, 28],
    "Salary": [10000, 20000, 30000, 18000, 20500],
}

df = pd.DataFrame(data)

# Split rows by Age, then total the Salary inside each group.
salary_by_age = df.groupby("Age")["Salary"]
grouped = salary_by_age.sum()
grouped

Unnamed: 0_level_0,Salary
Age,Unnamed: 1_level_1
28,40500
38,18000
80,10000
98,30000


In [None]:
# Grouping by several columns: every distinct (Age, Name) pair is its own
# group, and the result is indexed by both keys.
group_keys = ["Age", "Name"]
grouped = df.groupby(group_keys)["Salary"].sum()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary
Age,Name,Unnamed: 2_level_1
28,Barun,20500
28,Varun,20000
38,Trun,18000
80,Arun,10000
98,Krun,30000


# Merging and Joining


In [None]:
# Merging: combine two tables on a shared key column.
df_customers = pd.DataFrame({
    "CustomerId": [1, 2, 3],
    "Name": ["Ramesh", "Sures", "Ganesh"],
})

df_orders = pd.DataFrame({
    'CustomerId': [1, 2, 4],
    'OrderAmount': [250, 450, 560],
})

# Inner join: only the CustomerId values present in both tables survive.
merged = df_customers.merge(df_orders, on="CustomerId", how="inner")
print("inner join")
merged

inner join


Unnamed: 0,CustomerId,Name,OrderAmount
0,1,Ramesh,250
1,2,Sures,450


In [None]:
# Outer join: keep every key from both tables; cells with no match are NaN.
merged = df_customers.merge(df_orders, on="CustomerId", how="outer")
print("outer join")
merged

outer join


Unnamed: 0,CustomerId,Name,OrderAmount
0,1,Ramesh,250.0
1,2,Sures,450.0
2,3,Ganesh,
3,4,,560.0


In [None]:
# Left join: keep every customer; customers without an order get NaN.
merged = df_customers.merge(df_orders, on="CustomerId", how="left")
print("left join")
merged

left join


Unnamed: 0,CustomerId,Name,OrderAmount
0,1,Ramesh,250.0
1,2,Sures,450.0
2,3,Ganesh,


In [None]:
# Right join: keep every order; orders without a known customer get NaN.
merged = df_customers.merge(df_orders, on="CustomerId", how="right")
print("right join")
merged

right join


Unnamed: 0,CustomerId,Name,OrderAmount
0,1,Ramesh,250
1,2,Sures,450
2,4,,560


In [None]:
# Concatenation glues DataFrames together:
#   vertically   (row-wise,    axis=0)
#   horizontally (column-wise, axis=1)
# pd.concat([df1, df2], axis=0, ignore_index=True)

# NOTE(review): 'CutomerId' looks like a typo for 'CustomerId' (the spelling
# used by the merge examples); kept because the recorded output shows it.
df_region1 = pd.DataFrame({
    'CutomerId': [1, 2],
    "Name": ["Gopal", "Raju"],
})

df_region2 = pd.DataFrame({
    'CutomerId': [3, 4],
    "Name": ["Shyam", "Babu"],
})


In [None]:
# Stack the two regional tables on top of each other; ignore_index=True
# renumbers the rows 0..n-1 instead of repeating each frame's own index.
region_frames = [df_region1, df_region2]
vertically = pd.concat(region_frames, axis=0, ignore_index=True)
vertically

Unnamed: 0,CutomerId,Name
0,1,Gopal
1,2,Raju
2,3,Shyam
3,4,Babu
