In [2]:
# Step 1: Import Necessary Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures drawn later in this script.
sns.set(style="whitegrid")

# Step 2: Load the Dataset
# The path is relative, so the CSV must sit in the current working directory.
df = pd.read_csv('Employee_Performance-1.csv')

# Preview the leading rows to get a feel for the columns and their values.
print("First few rows of the dataset:", df.head(), sep="\n")

# Step 3: Data Cleaning - Handle Missing Values
# Report how many values are missing in each column before imputing.
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Impute missing numeric values with the per-column median.
# `numeric_only=True` is required: a median is undefined for text columns
# (e.g. 'Department', which is plotted as a category further down), and
# pandas >= 2.0 raises a TypeError instead of silently skipping them.
# NOTE: missing values in non-numeric columns are intentionally left as-is
# here; fill or drop them separately if the analysis requires it.
df.fillna(df.median(numeric_only=True), inplace=True)

# Step 4: Data Cleaning - Handle Duplicates
# Report how many rows are exact repeats of an earlier row.
print("\nNumber of duplicate rows in the dataset:", df.duplicated().sum(), sep="\n")

# Drop the repeated rows in place (a no-op when there are none).
df.drop_duplicates(inplace=True)

# Step 5: Data Cleaning - Handle Outliers
# Use the 1.5 * IQR rule to drop rows whose 'Experience' or
# 'PerformanceRating' lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
outlier_cols = ['Experience', 'PerformanceRating']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Compare only the columns the bounds were computed for. Comparing the whole
# frame against a Series indexed by just these two columns leaves the operands
# unaligned (rejected by recent pandas) and would also drag non-numeric
# columns into a numeric comparison.
candidates = df[outlier_cols]
is_outlier = ((candidates < lower_bound) | (candidates > upper_bound)).any(axis=1)

# Keep only the rows with no out-of-range value in either column.
df = df[~is_outlier]

print("\nData after cleaning:")
print(df.describe())

# Step 6: Summary Statistics
# describe() summarises the numeric columns of the cleaned frame.
print("\nSummary Statistics:", df.describe(), sep="\n")

# Step 7: Data Distribution Visualization
# Histogram of performance ratings with a kernel-density curve overlaid.
plt.figure(figsize=(10, 6))
ax = sns.histplot(df['PerformanceRating'], kde=True)
ax.set_title('Distribution of Performance Ratings')
ax.set_xlabel('Performance Rating')
ax.set_ylabel('Frequency')
plt.show()

# Box plot comparing the spread of performance ratings between departments.
plt.figure(figsize=(12, 6))
ax = sns.boxplot(x='Department', y='PerformanceRating', data=df)
ax.set_title('Performance Ratings by Department')
ax.set_xlabel('Department')
ax.set_ylabel('Performance Rating')
# Tilt the department names so long labels do not overlap.
plt.xticks(rotation=45)
plt.show()

# Bar chart of how many employees (rows) belong to each department.
plt.figure(figsize=(12, 6))
ax = sns.countplot(x='Department', data=df)
ax.set_title('Count of Employees by Department')
ax.set_xlabel('Department')
ax.set_ylabel('Count')
# Tilt the department names so long labels do not overlap.
plt.xticks(rotation=45)
plt.show()


ModuleNotFoundError: No module named 'pandas'