# Employee Data Analysis
This notebook performs a full data analysis workflow on `employees.csv`.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the employee table from disk; HireDate is parsed into datetimes at load time.
employees_csv = 'employees.csv'
df = pd.read_csv(employees_csv, parse_dates=['HireDate'])
# Preview the first rows as the cell output.
df.head()


## Data Cleaning
- Convert column types, mark blank `ManagerID` values as missing, and derive `hire_year` and `tenure_years`


In [None]:
# Clean the loaded frame: mark blank manager IDs as missing, make sure HireDate
# is a datetime, derive hire_year / tenure_years, and store the two text
# columns as categoricals.
df['ManagerID'] = df['ManagerID'].replace('', np.nan)  # blank string -> missing
df['HireDate'] = pd.to_datetime(df['HireDate'])  # harmless if the column is already datetime
df['hire_year'] = df['HireDate'].dt.year

# Tenure in completed calendar years. Dividing the elapsed day count by 365
# ignores leap days, so long tenures tick over several days before the real
# anniversary; comparing calendar fields avoids that drift.
today = pd.Timestamp('today')
hire_month = df['HireDate'].dt.month
hire_day = df['HireDate'].dt.day
# True where this year's anniversary has not been reached yet. Comparisons
# against a missing HireDate evaluate to False, and the subtraction below
# still yields NaN for those rows.
anniversary_pending = (hire_month > today.month) | (
    (hire_month == today.month) & (hire_day > today.day)
)
df['tenure_years'] = today.year - df['hire_year'] - anniversary_pending.astype(int)

df['Department'] = df['Department'].astype('category')
df['Role'] = df['Role'].astype('category')
df.head()


## Exploratory Data Analysis
- Counts, distributions, and hiring trends


In [None]:
# Headcount per department, printed as plain text.
dept_counts = df['Department'].value_counts()
print('Employees per Department:\n', dept_counts)

# Number of hires per calendar year, in chronological order, as a bar chart.
hires_by_year = df['hire_year'].value_counts().sort_index()
ax = hires_by_year.plot(kind='bar', figsize=(10,4))
ax.set_title('Hires by Year')
plt.show()


## Feature Engineering
- Create `direct_reports_count`: the number of employees whose `ManagerID` points at each employee (0 for employees with no reports)


In [None]:
# Count, for every employee, how many rows list them as ManagerID.
# ManagerID goes through float first so missing values survive the cast to the
# nullable integer dtype.
df['ManagerID'] = df['ManagerID'].astype('float').astype(pd.Int64Dtype())

# One entry per manager; groupby drops rows whose ManagerID is missing.
reports = df.groupby('ManagerID')['EmployeeID'].count().rename('direct_reports_count')
# Plain integer index so the lookup below matches EmployeeID values directly
# (assumes EmployeeID is integer-typed -- TODO confirm against the CSV).
reports.index = reports.index.astype('int64')

# Look the count up by EmployeeID instead of merging and rebinding `df`.
# The merge was not re-runnable: a second execution produced
# direct_reports_count_x / _y columns and the next line raised KeyError.
# Assigning a single column keeps the cell idempotent and leaves the row
# index untouched. Employees nobody reports to get 0.
df['direct_reports_count'] = df['EmployeeID'].map(reports).fillna(0).astype(int)
df.head()


## Simple Predictive Example
Predict whether an employee is a manager (`Role` contains 'Manager') using tenure and department.


In [None]:
# Target: 1 when the Role text contains the literal 'Manager', else 0.
# na=False maps a missing Role to "not a manager"; without it the mask holds
# NaN for those rows and the integer cast raises. regex=False makes the
# literal-substring intent explicit.
df['is_manager'] = df['Role'].str.contains('Manager', na=False, regex=False).astype(int)

# Features: tenure in years plus an integer code per department.
# NOTE(review): LabelEncoder yields arbitrary ordinal codes, which a linear
# model reads as an ordering between departments; one-hot encoding would be
# the usual choice. Left unchanged so the columns of X stay the same.
le = LabelEncoder()
X = pd.DataFrame({
    'tenure': df['tenure_years'],
    'dept': le.fit_transform(df['Department'])
})
y = df['is_manager']

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
print('Test accuracy:', model.score(X_test, y_test))
