# PANDAS
Pandas is a Python library used for data analysis and manipulation. 
It helps you work with tabular data (like Excel sheets or CSV files) in a super easy and powerful way.



In [7]:
import pandas as pd
import numpy as np

## Creating Data Structures
Pandas has two main data structures:

Series: A one-dimensional labeled array.
DataFrame: A two-dimensional table-like structure.

In [11]:
# Creating a Series
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s)

# Creating a DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie'], 'Age': [25, 30, 35]}
df = pd.DataFrame(data)
print(df)

a    10
b    20
c    30
d    40
dtype: int64
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35


In [23]:
import pandas as pd
# Sample DataFrame
data = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie','Tork','Gill','Ben','Cork','Tkin','Bin','Champ'],
        'Age': [25, 30, 35, 20, 17, 23, 25, 27, 43, 32],
        'City': ['NY', 'LA', 'SF', 'IN', 'DL', 'NZ', 'SA', 'CH', 'HR', 'WI']})
print(data)

      Name  Age City
0    Alice   25   NY
1      Bob   30   LA
2  Charlie   35   SF
3     Tork   20   IN
4     Gill   17   DL
5      Ben   23   NZ
6     Cork   25   SA
7     Tkin   27   CH
8      Bin   43   HR
9    Champ   32   WI


In [25]:
# Create dataset
import pandas as pd
# Sample DataFrame
data = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie','Tork','Gill','Ben','Cork','Tkin','Bin','Champ'],
        'Age': [25, 30, 35, 20, 17, 23, 25, 27, 43, 32],
        'City': ['NY', 'LA', 'SF', 'IN', 'DL', 'NZ', 'SA', 'CH', 'HR', 'WI']})
print(data)


      Name  Age City
0    Alice   25   NY
1      Bob   30   LA
2  Charlie   35   SF
3     Tork   20   IN
4     Gill   17   DL
5      Ben   23   NZ
6     Cork   25   SA
7     Tkin   27   CH
8      Bin   43   HR
9    Champ   32   WI


In [27]:
#Data cleaning
#data frame
import pandas as pd
data=pd.DataFrame({"Name":["Ankit","Navdeep","Mohnish","Ankit","Ritesh","Ishika"],
      "Age":[23,None,19,23,None,18],
      "Salary":[2000,3000,8000,2000,5000,6000]})
#print(data)
print(data.isnull().sum())  # Count missing values per column



Name      0
Age       2
Salary    0
dtype: int64


In [39]:
#Fill Missing Values with a Specific Value
import pandas as pd
data=pd.DataFrame({"Name":["Ankit","Navdeep","Mohnish","Ankit","Ritesh","Ishika"],
      "Age":[23,None,19,23,None,18],
      "Salary":[2000,3000,8000,2000,5000,6000]})
print(data)

print("\nAge column after filling missing values with 0:")
#print(data.["Age"].fillna(value=0)) 
#OR
#print(data.fillna({'Age': '0'}))  # 👉 Replaces all missing values with 0.

# Fill with Mean   # 👉 Replaces missing values with the average of all non-missing values.
print("\n Filling  with Mean:")
print(data["Age"].fillna(data["Age"].mean()))

# Fill with Median  # 👉 Fills missing values with the middle value (when sorted).
print("\n Filling  with Median:")
print(data["Age"].fillna(data["Age"].median()))

#Fill with Mode  # 👉 Uses the most frequently occurring value to fill missing data.
print("\n Filling  with Mode:")
print(data["Age"].fillna(data["Age"].mode()[0]))

#Forward Fill (Fill with Previous Row's Value) # 👉 Fills missing value with the value just before it in the column.
print(data["Age"].fillna(method="ffill"))

#Backward Fill (Fill with Next Row's Value) # 👉 Fills missing value with the value just after it in the column.
print(data["Age"].fillna(method="bfill"))


      Name   Age  Salary
0    Ankit  23.0    2000
1  Navdeep   NaN    3000
2  Mohnish  19.0    8000
3    Ankit  23.0    2000
4   Ritesh   NaN    5000
5   Ishika  18.0    6000

Age column after filling missing values with 0:

 Filling  with Mean:
0    23.00
1    20.75
2    19.00
3    23.00
4    20.75
5    18.00
Name: Age, dtype: float64

 Filling  with Median:
0    23.0
1    21.0
2    19.0
3    23.0
4    21.0
5    18.0
Name: Age, dtype: float64

 Filling  with Mode:
0    23.0
1    23.0
2    19.0
3    23.0
4    23.0
5    18.0
Name: Age, dtype: float64
0    23.0
1    23.0
2    19.0
3    23.0
4    23.0
5    18.0
Name: Age, dtype: float64
0    23.0
1    19.0
2    19.0
3    23.0
4    18.0
5    18.0
Name: Age, dtype: float64


  print(data["Age"].fillna(method="ffill"))
  print(data["Age"].fillna(method="bfill"))


In [41]:
# Cleaning Empty cells
import pandas as pd

# Creating DataFrame with empty (None) values
data = pd.DataFrame({
    "Name": ["Ankit", "Navdeep", "Mohnish", "Ankit", "Ritesh", "Ishika"],
    "Age": [23, None, 19, 23, None, 18],  # Missing values (None)
    "Salary": [2000, 3000, None, 2000, 5000, 6000]  # Missing value (None)
})

# Display DataFrame
print("Original Data:")
print(data)

# Check for missing values  # 👉 `.isnull().sum()` counts how many missing (NaN) values exist in each column.
print("\nMissing Values Count:")
print(data.isnull().sum()) 

#Remove Rows with Empty Cells # 👉 `.dropna()` removes rows that contain **any** missing (NaN) values. 
print("\nRemove Rows with Empty Cells")
data_cleaned = data.dropna()  # Removes any row with at least one NaN value
print(data_cleaned) 

#Remove Columns with Empty Cells  # 👉 `.dropna(axis=1)` removes columns that contain **any** missing (NaN) values.
print("\nRemove Columns with Empty Cells")
data_cleaned = data.dropna(axis=1)  # Removes any column with at least one NaN value
print(data_cleaned)


Original Data:
      Name   Age  Salary
0    Ankit  23.0  2000.0
1  Navdeep   NaN  3000.0
2  Mohnish  19.0     NaN
3    Ankit  23.0  2000.0
4   Ritesh   NaN  5000.0
5   Ishika  18.0  6000.0

Missing Values Count:
Name      0
Age       2
Salary    1
dtype: int64

Remove Rows with Empty Cells
     Name   Age  Salary
0   Ankit  23.0  2000.0
3   Ankit  23.0  2000.0
5  Ishika  18.0  6000.0

Remove Columns with Empty Cells
      Name
0    Ankit
1  Navdeep
2  Mohnish
3    Ankit
4   Ritesh
5   Ishika


In [43]:
#Remove Duplicates
import pandas as pd
# Creating DataFrame with empty (None) values
data = pd.DataFrame({
    "Name": ["Ankit", "Navdeep", "Mohnish", "Ankit", "Ritesh", "Ishika"],
    "Age": [23, None, 19, 23, None, 18],  # Missing values (None)
    "Salary": [2000, 3000, None, 2000, 5000, 6000]  # Missing value (None)
})
# Display DataFrame
print("Original Data:")
print(data)
# Remove Duplicates
print("\nRemove Duplicates")
print(data.drop_duplicates())  #The duplicate row with "Ankit" is removed.# 👉 `.drop_duplicates()` removes exact duplicate rows (all column values must match).


Original Data:
      Name   Age  Salary
0    Ankit  23.0  2000.0
1  Navdeep   NaN  3000.0
2  Mohnish  19.0     NaN
3    Ankit  23.0  2000.0
4   Ritesh   NaN  5000.0
5   Ishika  18.0  6000.0

Remove Duplicates
      Name   Age  Salary
0    Ankit  23.0  2000.0
1  Navdeep   NaN  3000.0
2  Mohnish  19.0     NaN
4   Ritesh   NaN  5000.0
5   Ishika  18.0  6000.0


In [57]:
import pandas as pd

# Creating a sample DataFrame with incorrect formats
data = pd.DataFrame({
    "Name": ["Ankit", "Navdeep", "Mohnish", "Ankit", "Ritesh", "Ishika"],
    "Age": ["23", None, "19", "twenty", None, "18"],  # Some incorrect values
    "Salary": ["2000", "3000", "eight thousand", "2000", "5000", "6000"],  # Wrong format
    "Date": ["2024-01-01", "2024-02-15", "15-03-2024", "2024/04/10", None, "2024-06-20"]  # Inconsistent date format
})

# Check data types before cleaning
print("Before Cleaning:")
print(data.dtypes)

# Convert Age & Salary to numeric format (force errors to NaN, don't overwrite entire DataFrame)
print("\nConvert Age & Salary to Numeric Format")
data["Age"] = pd.to_numeric(data["Age"], errors="coerce")
data["Salary"] = pd.to_numeric(data["Salary"], errors="coerce")

print("\nAfter Converting Age & Salary to Numeric:")
print(data)

# Convert Date column to proper datetime format
print("\nConvert Date Column to Proper Date Format")
data["Date"] = pd.to_datetime(data["Date"], errors="coerce", dayfirst=True)

print("\nAfter Converting Date Column:")
print(data)

# Remove rows with NaN (invalid) values
data_cleaned = data.dropna()

print("\nAfter Removing Rows with Wrong Data:")
print(data_cleaned)



Before Cleaning:
Name      object
Age       object
Salary    object
Date      object
dtype: object

Convert Age & Salary to Numeric Format

After Converting Age & Salary to Numeric:
      Name   Age  Salary        Date
0    Ankit  23.0  2000.0  2024-01-01
1  Navdeep   NaN  3000.0  2024-02-15
2  Mohnish  19.0     NaN  15-03-2024
3    Ankit   NaN  2000.0  2024/04/10
4   Ritesh   NaN  5000.0        None
5   Ishika  18.0  6000.0  2024-06-20

Convert Date Column to Proper Date Format

After Converting Date Column:
      Name   Age  Salary       Date
0    Ankit  23.0  2000.0 2024-01-01
1  Navdeep   NaN  3000.0        NaT
2  Mohnish  19.0     NaN        NaT
3    Ankit   NaN  2000.0        NaT
4   Ritesh   NaN  5000.0        NaT
5   Ishika  18.0  6000.0        NaT

After Removing Rows with Wrong Data:
    Name   Age  Salary       Date
0  Ankit  23.0  2000.0 2024-01-01


In [61]:
import pandas as pd

# Creating a Series
s = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print("Series:\n", s)

# Creating a DataFrame with all lists of same length
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 40],
    'City': ['New York', 'London', 'Paris']
}
df = pd.DataFrame(data)
print("\nDataFrame:\n", df)


Series:
 a    10
b    20
c    30
d    40
dtype: int64

DataFrame:
       Name  Age      City
0    Alice   25  New York
1      Bob   30    London
2  Charlie   40     Paris


In [63]:
import pandas as pd

# Reading the CSV file
df = pd.read_csv(r"C:\Users\chira\OneDrive\Desktop\Novem.Py\customer-data.csv")

# Save it to another file (optional)
df.to_csv('output.csv', index=False)

# Show full DataFrame
print("Full DataFrame:\n", df)

# Check available column names
print("\nAvailable columns:\n", df.columns)

# Indexing and Selecting Data Safely
# Replace 'Name' with the correct column name if different
if 'Name' in df.columns:
    print("\nUsing .loc:", df.loc[0, 'Name'])
else:
    print("\n'Name' column not found. Please check column names.")

# iloc is based on index position (row 1, column 1)
print("Using .iloc:", df.iloc[1, 1])



Full DataFrame:
     Index      Customer Id First Name Last Name  \
0       1  DD37Cf93aecA6Dc     Sheryl    Baxter   
1       2  1Ef7b82A4CAAD10    Preston    Lozano   
2       3  6F94879bDAfE5a6        Roy     Berry   
3       4  5Cef8BFA16c5e3c      Linda     Olsen   
4       5  053d585Ab6b3159     Joanna    Bender   
..    ...              ...        ...       ...   
95     96  cb8E23e48d22Eae       Karl     Greer   
96     97  CeD220bdAaCfaDf       Lynn  Atkinson   
97     98  28CDbC0dFe4b1Db       Fred    Guerra   
98     99  c23d1D9EE8DEB0A     Yvonne    Farmer   
99    100  2354a0E336A91A1   Clarence    Haynes   

                            Company               City  \
0                   Rasmussen Group       East Leonard   
1                       Vega-Gentry  East Jimmychester   
2                     Murillo-Perry      Isabelborough   
3   Dominguez, Mcmillan and Donovan         Bensonview   
4          Martin, Lang and Andrade     West Priscilla   
..                    