# **Please load your data below the code :)**

In [8]:
import pandas as pd
# Two file types can be loaded: Excel (.xlsx / .xls) and CSV.
class Load_data:
    """Load a tabular data file into a pandas DataFrame.

    The format is chosen from the file extension (case-insensitive):
    ``xlsx``/``xls`` are read with ``pd.read_excel`` and ``csv`` with
    ``pd.read_csv``. Progress and problems are reported with ``print``;
    whenever nothing could be loaded ``self.df`` stays ``None``.
    """

    def __init__(self, file_name=None):
        """Try to load ``file_name`` into ``self.df``.

        Args:
            file_name: Path of the file to load. When falsy, nothing is
                loaded and a notice is printed instead.
        """
        self.df = None
        if not file_name:
            print("No file path provided.")
            return

        try:
            # str() lets path-like objects through the extension check.
            file_extension = str(file_name).rsplit('.', 1)[-1].lower()
            if file_extension in ('xlsx', 'xls'):
                self.df = pd.read_excel(file_name)
            elif file_extension == 'csv':
                self.df = pd.read_csv(file_name)
            else:
                print("Unsupported file format.")
                return
        except Exception as e:
            # Loading is best-effort, so the error is reported rather than
            # raised -- but the cause is shown so the user can act on it.
            print(f"Error loading data: {e}")
            return

        print("Data successfully loaded")

    def Data_overview(self):
        """Print a summary of the loaded data.

        Shows the ``DataFrame.info()`` report, the shape and dtypes, the
        number of missing values per column and the number of unique
        values per column. Prints a notice when no data is loaded.
        """
        if self.df is None:
            print("Data not loaded. Please provide a valid file path.")
            return

        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        self.df.info()
        print(f"\nShape: {self.df.shape}, Data types:\n{self.df.dtypes}")
        print(f"\nMissing values:\n{self.df.isnull().sum()}")
        print(f"\nUnique values per column:\n{self.df.nunique()}")


In [9]:
class DataProcessing(Load_data):
    """Load an employee data file and fill in its missing values.

    The cleaning runs in the constructor, directly on ``self.df``. When the
    file could not be loaded (``self.df`` is ``None``) the cleaning is
    skipped, since the parent class has already reported the problem.
    """

    def __init__(self, file_name):
        """Load ``file_name`` and replace nulls column by column.

        Args:
            file_name: Path of the Excel/CSV file to load.

        Raises:
            KeyError: If the loaded data lacks one of the expected columns.
        """
        super().__init__(file_name)
        if self.df is None:
            # Nothing was loaded; subscripting None would raise TypeError.
            return

        # Handling missing values with fixed placeholders.
        constant_fills = {
            # A far-future date represents ongoing employment.
            "Exit Date": pd.to_datetime("2050-12-31"),
            'Full Name': 'Unknown',
            'Job Title': 'Unknown',
            'Department': 'Not Assigned',
            'Gender': 'Not Provided',
            'Ethnicity': 'Not Provided',
            # Arbitrary placeholder date for an unknown hire date.
            'Hire Date': pd.to_datetime('1900-01-01'),
            'Country': 'Not Provided',
            'City': 'Not Provided',
        }
        for column, placeholder in constant_fills.items():
            self.df[column] = self.df[column].fillna(placeholder)

        # Numeric columns are filled from their own distribution.
        self.df['Age'] = self.df['Age'].fillna(self.df['Age'].mean())
        # Median rather than mean, since very high salaries can be outliers.
        self.df['Annual Salary'] = self.df['Annual Salary'].fillna(
            self.df['Annual Salary'].median()
        )
        self.df['Bonus %'] = self.df['Bonus %'].fillna(self.df['Bonus %'].mean())

    def ShowData_after_processing(self):
        """Print the data overview of the processed data."""
        self.Data_overview()

    def save_processing_as_new_file(self, directory):
        """Write the processed data to ``Processed_Employee_Data.xlsx``.

        Args:
            directory: Folder in which the workbook is created; the file
                name itself is fixed.
        """
        import os

        if self.df is None:
            print("Data not loaded. Please provide a valid file path.")
            return

        # os.path.join uses the platform's separator instead of a
        # hard-coded backslash, so this also works outside Windows.
        full_file_path = os.path.join(directory, "Processed_Employee_Data.xlsx")
        self.df.to_excel(full_file_path, index=False)
        print("Data successfully saved ")

In [10]:
class Data_analysis(DataProcessing):
    """Printed summary reports over the processed employee data."""

    def __init__(self, file_name):
        """Load and process ``file_name`` through the parent constructor."""
        super().__init__(file_name)

    def group_data(self):
        """Print grouped Age / Annual Salary statistics.

        First the mean per department, then max, min and median per
        (department, ethnicity) pair.
        """
        metrics = ['Age', 'Annual Salary']

        print("\nAverage Age and Salary by Department:")
        department_means = self.df.groupby('Department')[metrics].mean()
        print(department_means)

        print("\nMax Age, Min Age, and Median Salary by Department and Ethnicity:")
        grouped = self.df.groupby(['Department', 'Ethnicity'])[metrics]
        print(grouped.agg(['max', 'min', 'median']))

    def employment_with_largest_salary(self):
        """Print every row whose Annual Salary equals the column maximum."""
        salaries = self.df['Annual Salary']
        max_salary = salaries.max()
        print("\nThe employee with the highest salary")
        is_top_earner = salaries == max_salary
        print(self.df[is_top_earner])



In [None]:
# Hard-coded locations of the input workbook and of the output folder.
source_workbook = r"C:\Users\Estarta\Desktop\Project\Employee_Sample_Data.xlsx"
output_directory = r"C:\Users\Estarta\Desktop\Project"

# Load and process the data, then print the overview, save the processed
# copy and run both reports, in that order.
data = Data_analysis(source_workbook)
data.Data_overview()
data.save_processing_as_new_file(output_directory)
data.group_data()
data.employment_with_largest_salary()

Data successfully loaded
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   EEID           1000 non-null   object        
 1   Full Name      1000 non-null   object        
 2   Job Title      1000 non-null   object        
 3   Department     1000 non-null   object        
 4   Business Unit  1000 non-null   object        
 5   Gender         1000 non-null   object        
 6   Ethnicity      1000 non-null   object        
 7   Age            1000 non-null   float64       
 8   Hire Date      1000 non-null   datetime64[ns]
 9   Annual Salary  1000 non-null   float64       
 10  Bonus %        1000 non-null   float64       
 11  Country        1000 non-null   object        
 12  City           1000 non-null   object        
 13  Exit Date      1000 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(3), object(9)
memo