## Step 1: Load the Data
Read the CSV file and preview the first few rows.

In [None]:
import pandas as pd
# Read the raw attendance records from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='attendance.csv')

# Preview the first five rows (five is head()'s default).
df.head(n=5)

## Step 2: Handle Missing Values
Count the missing values in each column, then fill missing `DaysAbsent` entries with 0 and missing `DaysPresent` entries with the column mean.

In [None]:
# Report the number of missing values in each column. The count is printed
# explicitly because a notebook cell only auto-displays its final expression,
# so a bare `df.isnull().sum()` here would be computed and silently discarded.
print(df.isnull().sum())

# Fill missing absence counts with 0.
df['DaysAbsent'] = df['DaysAbsent'].fillna(0)

# Fill missing presence counts with the mean of the observed values.
# The mean is generally fractional, so the column holds floats after this.
mean_present = df['DaysPresent'].mean()
df['DaysPresent'] = df['DaysPresent'].fillna(mean_present)

# Preview the filled data.
df.head()

## Step 3: Remove Duplicates
Identify and remove duplicate records.

In [None]:
# Report how many rows duplicate an earlier row. The count is printed
# explicitly because a notebook cell only auto-displays its final expression,
# so a bare `df.duplicated().sum()` here would be computed and discarded.
print(df.duplicated().sum())

# Drop the duplicated rows, keeping the first occurrence of each record.
df = df.drop_duplicates()

# Show the (rows, columns) shape that remains after de-duplication.
df.shape

## Step 4: Convert Data Types
Cast the day-count columns (`DaysPresent`, `DaysAbsent`, `TotalDays`) to integers and parse `DateJoined` as a datetime.

In [None]:
# DaysPresent may contain fractional values (missing entries were filled with
# the column mean). astype(int) truncates toward zero, which would bias every
# imputed value downward, so round to the nearest whole day before casting.
df['DaysPresent'] = df['DaysPresent'].round().astype(int)
df['DaysAbsent'] = df['DaysAbsent'].astype(int)

# NOTE(review): astype(int) raises if the column still contains NaN; no
# visible step fills TotalDays -- confirm it has no missing values.
df['TotalDays'] = df['TotalDays'].astype(int)

# Parse the join date into pandas datetime values.
df['DateJoined'] = pd.to_datetime(df['DateJoined'])

# Display the resulting column dtypes to verify the conversions.
df.dtypes

## Step 5: Calculate Attendance Rate
Compute `AttendanceRate` as `DaysPresent / TotalDays`, then min-max normalize it into `AttendanceRate_norm`.

In [None]:
# Attendance rate = days present / total days.
# A TotalDays of 0 would yield inf (or NaN for 0/0); a single inf then becomes
# the column maximum and collapses the min-max scaling below for every row.
# Mask zero denominators to NaN so those rows get a missing rate instead.
valid_total_days = df['TotalDays'].where(df['TotalDays'] != 0)
df['AttendanceRate'] = df['DaysPresent'] / valid_total_days

# Min-max normalization: (x - min) / (max - min), mapping rates onto [0, 1].
rate_min = df['AttendanceRate'].min()
rate_range = df['AttendanceRate'].max() - rate_min
if rate_range > 0:
    df['AttendanceRate_norm'] = (df['AttendanceRate'] - rate_min) / rate_range
else:
    # Every rate is identical (or there are no valid rates), so dividing by
    # the range would be 0/0. x - min gives 0.0 for valid rows and keeps NaN
    # for rows whose rate is missing.
    df['AttendanceRate_norm'] = df['AttendanceRate'] - rate_min

# Preview the raw and normalized rates side by side.
df[['Name', 'AttendanceRate', 'AttendanceRate_norm']].head()

## Step 6: Identify Outliers
Use the IQR method to find outliers in AttendanceRate.

In [None]:
# First and third quartiles of the attendance rate, taken in a single call.
Q1, Q3 = df['AttendanceRate'].quantile([0.25, 0.75])

# Interquartile range and the 1.5 * IQR fences on either side of it.
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Keep the rows whose rate falls strictly outside the fences.
outliers = df.loc[df['AttendanceRate'].lt(lower) | df['AttendanceRate'].gt(upper)]
outliers

## Step 7: Save Cleaned Data
Save the cleaned DataFrame to a new CSV file.

In [None]:
# Write the cleaned DataFrame to a new CSV file, omitting the index column.
df.to_csv(path_or_buf='attendance_cleaned.csv', index=False)