**Import the libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

In [None]:
pip install ydata_profiling

In [None]:
pip install openpyxl

In [None]:
# Load the employee workbook into a DataFrame
source_workbook = 'Employee Sample Data - A.xlsx'
df = pd.read_excel(source_workbook)

# List the column names that were read
feature_names = list(df.columns)
print("Features Name:", feature_names)

# Preview the first 5 rows
df.head()

**Exploratory Data Analysis Report**

In [None]:
# Build the profiling report for the loaded data and write it out as an HTML file
output_filename = "employee_data_profile.html"
report_options = {
    "title": "Exploratory Data Analysis Report",
    "explorative": True,
}

profile = ProfileReport(df, **report_options)
profile.to_file(output_filename)

In [None]:
# Column dtypes and non-null counts
df.info()

# Number of missing values per column (displayed as the cell output)
missing_per_column = df.isna().sum()
missing_per_column

# Cleaning Dataset

In [None]:
# Clean 'Annual Salary': remove '$' and ',' characters, then convert to float.
# The dollar pattern is a raw string so that "\$" reaches the regex engine as written
# instead of being an invalid Python string escape (which triggers a warning).
df['Annual Salary'] = (
    df['Annual Salary']
    .replace({r'\$': '', ',': ''}, regex=True)
    .astype(float)
)


In [None]:
# Preview the first rows of the salary column
salary_preview = df['Annual Salary'].head()
salary_preview

In [None]:
# Clean 'Bonus %': remove the '%' sign and convert to float
bonus_pct = df['Bonus %'].replace({'%': ''}, regex=True).astype(float)

# Fill any missing bonus values with 0.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['Bonus %']: that form operates on an intermediate
# object, raises a chained-assignment warning on recent pandas, and leaves df
# unchanged when Copy-on-Write is active.
df['Bonus %'] = bonus_pct.fillna(0)

df['Bonus %'].head()


In [None]:
# Parse both date columns; values that cannot be parsed become missing (errors='coerce'),
# and only the calendar-date part of each timestamp is kept.
for date_col in ('Hire Date', 'Exit Date'):
    parsed_dates = pd.to_datetime(df[date_col], errors='coerce')
    df[date_col] = parsed_dates.dt.date

df.head()

In [None]:
# Modify the first 5 rows: overwrite name, age, salary, department and job title.
# A seeded generator makes the random ages/salaries identical on every
# Restart & Run All, and re-creating it here keeps the cell repeatable on re-run.
rng = np.random.default_rng(42)

# Use the labels of the rows that actually exist (at most five), so a frame with
# fewer than five rows is not silently extended with new rows.
first_rows = df.index[:5]
n_rows = len(first_rows)

df.loc[first_rows, 'Full Name'] = [f"Person {k + 1}" for k in range(n_rows)]
# Upper bounds are exclusive, matching np.random.randint.
df.loc[first_rows, 'Age'] = rng.integers(25, 55, size=n_rows)
df.loc[first_rows, 'Annual Salary'] = rng.integers(60000, 180000, size=n_rows)
df.loc[first_rows, 'Department'] = 'Research & Development'
df.loc[first_rows, 'Job Title'] = 'Lead Scientist'
df.head()

In [None]:
# Select every row whose salary equals the highest salary in the data
top_salary = df['Annual Salary'].max()
max_salary_row = df.loc[df['Annual Salary'] == top_salary]

print("Row with the largest salary:")
max_salary_row

In [None]:
# Per-department mean age and mean salary, rounded to two decimals
department_grouped = (
    df.groupby('Department')[['Age', 'Annual Salary']]
    .mean()
    .rename(columns={'Age': 'Average Age', 'Annual Salary': 'Average Annual Salary'})
    .round(2)
)

department_grouped

In [None]:
# Group by Department and Ethnicity: max/min age and median salary per group
dept_ethnicity_spec = {
    'Age': ['max', 'min'],
    'Annual Salary': 'median',
}
dept_ethnicity_grouped = (
    df.groupby(['Department', 'Ethnicity'])
    .agg(dept_ethnicity_spec)
    .round(2)
)

dept_ethnicity_grouped

In [None]:
# Write the cleaned data and both summaries to one workbook, one sheet each.
# Any failure is reported as a message instead of stopping the notebook.
try:
    # (sheet name, frame, whether to write the index) - the summaries keep their
    # group-key index, the cleaned data does not write one.
    sheets = (
        ('Cleaned_Data', df, False),
        ('Department_Analysis', department_grouped, True),
        ('Dept_Ethnicity_Analysis', dept_ethnicity_grouped, True),
    )
    with pd.ExcelWriter('employee_analysis.xlsx') as writer:
        for sheet_name, frame, keep_index in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=keep_index)
except Exception as e:
    print(f"Error saving file: {e}")

# Visualization

In [None]:
# Set plot style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 7)
%matplotlib inline

# Bar Chart, Average Salary by Department
plt.figure()
avg_salary_sorted = department_grouped['Average Annual Salary'].sort_values(ascending=False)
sns.barplot(x=avg_salary_sorted.values, y=avg_salary_sorted.index, palette="viridis")
plt.title('Average Annual Salary by Department', fontsize=16)
plt.xlabel('Average Annual Salary ($)', fontsize=12)
plt.ylabel('Department', fontsize=12)
plt.xticks(rotation=0)
plt.show()

In [None]:
# Histogram of employee ages with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(df['Age'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Employee Age', fontsize=16)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Number of Employees', fontsize=12)
fig.tight_layout()
plt.show()


In [None]:

# Pie chart: share of employees per country
plt.figure()
country_counts = df['Country'].value_counts()

# Countries with fewer than `threshold` employees are merged into a single
# 'Other' slice to keep the chart readable
threshold = 20
is_small = country_counts < threshold
main_countries = country_counts[~is_small]
other_count = country_counts[is_small].sum()
if other_count > 0:
    main_countries['Other'] = other_count

slice_colors = sns.color_palette("Set2")
plt.pie(
    main_countries,
    labels=main_countries.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=slice_colors,
)
plt.title('Employee Distribution by Country', fontsize=16)
plt.ylabel('')
plt.axis('equal')
plt.show()