In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by the later cells
# all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import random

# Load the existing CSV file into a DataFrame
file_path = '/content/drive/MyDrive/layoff.csv'  # Update this with your file path
df = pd.read_csv(file_path)

# Candidate values for the synthetic 'Industry' and 'Stage' columns
industries = ['Travel', 'Sales', 'Transportation', 'Education', 'Healthcare',
              'Food', 'Finance', 'Fitness', 'Aerospace', 'Retail', 'Real Estate', 'Logistics']
stages = ['Acquired', 'Series B', 'Post-IPO', 'Series G', 'Series F', 'Series C',
          'Unknown', 'Series D', 'Series H', 'Series E', 'Series J', 'Seed',
          'Private Equity', 'Series A', 'Subsidiary', 'Series I']

# Assign Industry column: one value per row, drawn at random from `industries`
# without weights and without looking at any other column. No seed is set in
# this cell, so each run produces a different assignment.
df['Industry'] = np.random.choice(industries, size=len(df))

# Departments that may occur within each industry. Nothing in this cell reads
# the table; it is kept for reference / later use. The department labels are
# spelled exactly like the Department values emitted by generate_new_record
# ('IT department', 'Tech Development') so that the two tables can be compared
# or merged without a normalisation step.
industry_departments = {
    'Travel': ['Sales', 'Tech Development'],
    'Sales': ['Research & Development', 'Sales', 'IT department'],
    'Transportation': ['Sales', 'IT department'],
    'Education': ['Sales', 'IT department', 'Research & Development', 'Tech Development'],
    'Healthcare': ['Research & Development'],
    'Food': ['Research & Development', 'Sales'],
    'Finance': ['Sales', 'Research & Development', 'Tech Development', 'IT department'],
    'Fitness': ['Sales', 'IT department', 'Tech Development', 'Research & Development'],
    'Aerospace': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development'],
    'Retail': ['Research & Development', 'Sales', 'IT department', 'Tech Development'],
    'Real Estate': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development'],
    'Logistics': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development', 'Defence']
}

# Derive a funding Stage for one row from its Attrition flag
def assign_stage(row):
    """Return the Stage label for a single record.

    Rows whose 'Attrition' value is 'Yes' always get 'Post-IPO'; every other
    row gets one entry picked at random from the module-level `stages` list.
    """
    has_left = row['Attrition'] == 'Yes'
    return 'Post-IPO' if has_left else random.choice(stages)

# Call assign_stage once per row (axis=1) and store the results in the 'Stage' column
df['Stage'] = df.apply(assign_stage, axis=1)

# Draw a Funds_Raised(m) value whose ceiling depends on the Attrition flag
def assign_funds(row):
    """Return a random integer amount of funds raised for a single record.

    The value is drawn from 1..29 when the row's 'Attrition' is 'Yes' and from
    1..49 otherwise (the upper bound passed to randint is exclusive).
    """
    exclusive_ceiling = 30 if row['Attrition'] == 'Yes' else 50
    return np.random.randint(1, exclusive_ceiling)

# Call assign_funds once per row (axis=1) and store the results in 'Funds_Raised(m)'
df['Funds_Raised(m)'] = df.apply(assign_funds, axis=1)

# Save the modified DataFrame to a separate CSV file (the input file is left as it was);
# index=False keeps the row index out of the output.
output_file_path = '/content/drive/MyDrive/modified_file.csv'  # Update this with your desired output file path
df.to_csv(output_file_path, index=False)

print(f"Modified data saved to {output_file_path}")


Modified data saved to /content/drive/MyDrive/modified_file.csv


In [None]:
import pandas as pd
import numpy as np

# Define the function to generate new records
# Lookup tables used by generate_new_record. They never change between calls,
# so they are built once here instead of on every invocation.
_INDUSTRIES = ['Travel', 'Sales', 'Transportation', 'Education', 'Healthcare',
               'Food', 'Finance', 'Fitness', 'Aerospace', 'Retail', 'Real Estate', 'Logistics']

_INDUSTRY_DEPARTMENTS = {
    'Travel': ['Sales', 'Tech Development'],
    'Sales': ['Research & Development', 'Sales', 'IT department'],
    'Transportation': ['Sales', 'IT department'],
    'Education': ['Sales', 'IT department', 'Research & Development', 'Tech Development'],
    'Healthcare': ['Research & Development'],
    'Food': ['Research & Development', 'Sales'],
    'Finance': ['Sales', 'Research & Development', 'Tech Development', 'IT department'],
    'Fitness': ['Sales', 'IT department', 'Tech Development', 'Research & Development'],
    'Aerospace': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development'],
    'Retail': ['Research & Development', 'Sales', 'IT department', 'Tech Development'],
    'Real Estate': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development'],
    'Logistics': ['Research & Development', 'Sales', 'IT department', 'Human Resources', 'Tech Development', 'Defence']
}

# Per department: (education field options, job role options). A plain string
# is a fixed value (no random draw); a list means one entry is drawn at random.
_DEPARTMENT_PROFILES = {
    'IT department': (
        'Technical Degree',
        ['Network Engineer', 'Operational Executive', 'Software Developer',
         'Project Manager', 'Team Lead'],
    ),
    'Tech Development': (
        'Technical Degree',
        ['Research Scientist', 'Research Director', 'Project Manager',
         'Team Lead'],
    ),
    'Research & Development': (
        ['Life Sciences', 'Medical', 'Technical Degree'],
        ['Research Scientist', 'Laboratory Technician', 'Research Director'],
    ),
    'Sales': (
        ['Marketing', 'Life Sciences', 'Medical'],
        ['Sales Executive', 'Sales Representative'],
    ),
    'Human Resources': ('Human Resources', 'Human Resources'),
    'Defence': (
        ['Technical Degree', 'Other'],
        ['Operational Executive', 'Project Manager'],
    ),
}

_STAGES = ['Acquired', 'Series B', 'Post-IPO', 'Series G', 'Series F', 'Series C',
           'Unknown', 'Series D', 'Series H', 'Series E', 'Series J', 'Seed',
           'Private Equity', 'Series A', 'Subsidiary', 'Series I']


def _clamp(value, low, high):
    """Limit `value` to the range [low, high]; `low` wins when low > high."""
    return max(low, min(high, value))


def _draw(rng, options):
    """Return `options` unchanged if it is a fixed string, else pick one entry at random."""
    if isinstance(options, str):
        return options
    return rng.choice(options)


def _draw_int(rng, low, high):
    """Draw an integer from [low, high) with whichever integer sampler `rng` provides."""
    # numpy Generators expose `integers`; RandomState and the numpy.random module expose `randint`.
    sampler = rng.integers if hasattr(rng, 'integers') else rng.randint
    return sampler(low, high)


def generate_new_record(rng=None):
    """Generate one synthetic employee record.

    Parameters
    ----------
    rng : optional
        Source of randomness. Anything that offers `normal`, `choice`,
        `uniform` and either `integers` or `randint` works, e.g.
        `np.random.RandomState(seed)` or `np.random.default_rng(seed)`.
        When omitted, the global `numpy.random` state is used, which is what
        this function did before the parameter existed. The order of random
        draws is unchanged, so a seeded global state yields the same record.

    Returns
    -------
    dict
        One record with 38 keys ('Age', 'Attrition', ..., 'Funds_Raised(m)').

    Raises
    ------
    KeyError
        If a department has no entry in `_DEPARTMENT_PROFILES` (cannot happen
        with the tables defined above).
    """
    if rng is None:
        rng = np.random

    # --- Base attributes: normal draws clamped to a fixed range --------------
    age = _clamp(rng.normal(loc=36.92, scale=9.14), 18, 60)
    daily_rate = _clamp(rng.normal(loc=802.49, scale=403.51), 102, 1499)
    distance_from_home = _clamp(rng.normal(loc=9.19, scale=8.11), 1, 29)

    education = rng.choice([1, 2, 3, 4, 5], p=[0.2, 0.3, 0.2, 0.2, 0.1])
    relationship_satisfaction = rng.choice([1, 2, 3, 4], p=[0.25, 0.25, 0.25, 0.25])
    stock_option_level = rng.choice([0, 1, 2, 3], p=[0.25, 0.25, 0.25, 0.25])

    # --- Tenure: each value is bounded by the one it logically depends on ----
    # Tenure is capped at (age - 20); for ages below 20 the cap is negative and
    # the lower bound of 0 takes over.
    tenure_cap = age - 20
    years_at_company = _clamp(rng.normal(loc=7.01, scale=6.13), 0, tenure_cap)
    total_working_years = _clamp(
        rng.normal(loc=years_at_company + 3, scale=7.78), years_at_company, tenure_cap
    )
    years_in_current_role = _clamp(rng.normal(loc=4.23, scale=3.62), 0, years_at_company)
    years_since_last_promotion = _clamp(rng.normal(loc=2.19, scale=3.22), 0, years_at_company)
    years_with_curr_manager = _clamp(rng.normal(loc=4.13, scale=3.57), 0, years_at_company)

    training_times_last_year = rng.choice([0, 1, 2, 3, 4, 5, 6], p=[0.1, 0.1, 0.2, 0.2, 0.2, 0.1, 0.1])
    work_life_balance = rng.choice([1, 2, 3, 4], p=[0.25, 0.25, 0.25, 0.25])

    environment_satisfaction = rng.choice([1, 2, 3, 4], p=[0.25, 0.25, 0.25, 0.25])
    gender = rng.choice(['Male', 'Female'], p=[0.5, 0.5])
    hourly_rate = _clamp(rng.normal(loc=65.88, scale=20.33), 30, 100)
    job_involvement = rng.choice([1, 2, 3, 4], p=[0.25, 0.25, 0.25, 0.25])
    job_level = rng.choice([1, 2, 3, 4, 5], p=[0.3, 0.3, 0.2, 0.1, 0.1])
    job_satisfaction = rng.choice([1, 2, 3, 4], p=[0.25, 0.25, 0.25, 0.25])
    marital_status = rng.choice(['Single', 'Married', 'Divorced'], p=[0.3, 0.5, 0.2])
    monthly_income = _clamp(rng.normal(loc=6500, scale=4700), 1000, 20000)
    monthly_rate = _clamp(rng.normal(loc=14313.10, scale=7117.64), 2094, 26999)
    num_companies_worked = _draw_int(rng, 1, 10)  # 1..9, upper bound exclusive
    over18 = 'Y'
    overtime = rng.choice(['Yes', 'No'], p=[0.24, 0.76])
    percent_salary_hike = _clamp(rng.normal(loc=15.21, scale=3.66), 11, 25)
    performance_rating = rng.choice([3, 4], p=[0.9, 0.1])

    # --- Industry -> department -> education field / job role ----------------
    industry = rng.choice(_INDUSTRIES)
    department = rng.choice(_INDUSTRY_DEPARTMENTS[industry])
    field_options, role_options = _DEPARTMENT_PROFILES[department]
    education_field = _draw(rng, field_options)
    job_role = _draw(rng, role_options)

    # Funds raised
    funds_raised = rng.uniform(1, 50)

    stage = rng.choice(_STAGES)

    # --- Attrition ------------------------------------------------------------
    # NOTE(review): with the value ranges drawn above, these seven terms add up
    # to somewhere between about 0.3 and 1.7 before any multiplier, so the 0.9
    # cap below can be reached without the stage/funds adjustments. Confirm the
    # weights are what was intended.
    base_attrition_prob = (
        0.1 * (5 - job_satisfaction) +
        0.1 * (5 - job_involvement) +
        0.1 * (5 - job_level) +
        0.1 * (5 - performance_rating) +
        0.1 * (1 / (years_at_company + 1)) +
        0.1 * (1 / (years_with_curr_manager + 1)) +
        0.1 * (1 / (years_in_current_role + 1))
    )

    # Adjust for stage
    if stage == 'Post-IPO':
        base_attrition_prob *= 1.5

    # Adjust for funds raised
    base_attrition_prob *= 1.2 if funds_raised < 25 else 0.8

    # Keep the probability inside [0.1, 0.9] so both outcomes stay possible.
    attrition_prob = _clamp(base_attrition_prob, 0.1, 0.9)
    attrition = rng.choice(['No', 'Yes'], p=[1 - attrition_prob, attrition_prob])

    # Continuous draws are truncated to whole numbers with int() on output.
    new_record = {
        'Age': int(age),
        'Attrition': attrition,
        'BusinessTravel': rng.choice(['Travel_Rarely', 'Travel_Frequently', 'Non-Travel'],
                                     p=[0.7095, 0.1884, 0.1021]),
        'DailyRate': int(daily_rate),
        'Department': department,
        'DistanceFromHome': int(distance_from_home),
        'Education': education,
        'EducationField': education_field,
        'EmployeeCount': 1,
        # Drawn independently on every call from 1..9999, so two records can
        # end up with the same number; uniqueness is NOT guaranteed.
        'EmployeeNumber': _draw_int(rng, 1, 10000),
        'RelationshipSatisfaction': relationship_satisfaction,
        'StandardHours': 80,
        'StockOptionLevel': stock_option_level,
        'TotalWorkingYears': int(total_working_years),
        'TrainingTimesLastYear': training_times_last_year,
        'WorkLifeBalance': work_life_balance,
        'YearsAtCompany': int(years_at_company),
        'YearsInCurrentRole': int(years_in_current_role),
        'YearsSinceLastPromotion': int(years_since_last_promotion),
        'YearsWithCurrManager': int(years_with_curr_manager),
        'EnvironmentSatisfaction': environment_satisfaction,
        'Gender': gender,
        'HourlyRate': int(hourly_rate),
        'JobInvolvement': job_involvement,
        'JobLevel': job_level,
        'JobRole': job_role,
        'JobSatisfaction': job_satisfaction,
        'MaritalStatus': marital_status,
        'MonthlyIncome': int(monthly_income),
        'MonthlyRate': int(monthly_rate),
        'NumCompaniesWorked': num_companies_worked,
        'Over18': over18,
        'OverTime': overtime,
        'PercentSalaryHike': int(percent_salary_hike),
        'PerformanceRating': performance_rating,
        'Industry': industry,
        'Stage': stage,
        'Funds_Raised(m)': round(funds_raised, 2)
    }

    return new_record

# Generate 3000 synthetic records (one generate_new_record() call each)
new_records = [generate_new_record() for _ in range(3000)]

# Convert the new records into a DataFrame (one column per record key)
new_df = pd.DataFrame(new_records)

# Load modified_file.csv, i.e. the layoff data after the Industry / Stage /
# Funds_Raised(m) columns were added by the earlier cell
existing_df = pd.read_csv('/content/drive/MyDrive/modified_file.csv')

# Append the new records below the existing rows; ignore_index=True renumbers
# the combined rows from 0
combined_df = pd.concat([existing_df, new_df], ignore_index=True)

# Save the combined DataFrame to a new CSV file (final.csv), without the row index
combined_df.to_csv('/content/drive/MyDrive/final.csv', index=False)

# Print the combined DataFrame
print(combined_df)


      Age Attrition     BusinessTravel  DailyRate              Department  \
0      41       Yes      Travel_Rarely       1102                   Sales   
1      49        No  Travel_Frequently        279  Research & Development   
2      37       Yes      Travel_Rarely       1373  Research & Development   
3      33        No  Travel_Frequently       1392  Research & Development   
4      27        No      Travel_Rarely        591  Research & Development   
...   ...       ...                ...        ...                     ...   
7465   18       Yes      Travel_Rarely        291           IT department   
7466   29       Yes      Travel_Rarely        393           IT department   
7467   27       Yes      Travel_Rarely       1080                   Sales   
7468   37       Yes  Travel_Frequently        217           IT department   
7469   35       Yes      Travel_Rarely       1393                   Sales   

      DistanceFromHome  Education    EducationField  EmployeeCount  \
0    

In [None]:
import pandas as pd
import numpy as np
# Load the dataset
# NOTE(review): ADT.csv is not written by any of the cells above (they write
# modified_file.csv and final.csv) — confirm how this file is produced.
file_path = '/content/drive/MyDrive/ADT.csv'  # Update this with your file path
df = pd.read_csv(file_path)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       3000 non-null   int64  
 1   Attrition                 3000 non-null   object 
 2   BusinessTravel            3000 non-null   object 
 3   DailyRate                 3000 non-null   int64  
 4   Department                3000 non-null   object 
 5   DistanceFromHome          3000 non-null   int64  
 6   Education                 3000 non-null   int64  
 7   EducationField            3000 non-null   object 
 8   EmployeeCount             3000 non-null   int64  
 9   EmployeeNumber            3000 non-null   int64  
 10  EnvironmentSatisfaction   3000 non-null   int64  
 11  Gender                    3000 non-null   object 
 12  HourlyRate                3000 non-null   int64  
 13  JobInvolvement            3000 non-null   int64  
 14  JobLevel

In [None]:
# --- Encode categorical columns as numeric values ---

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset (re-reads the file, replacing whatever `df` held before)
file_path = '/content/drive/MyDrive/ADT.csv'  # Update this with your file path
df = pd.read_csv(file_path)

# Convert all categorical columns to numeric using LabelEncoder.
# Every object-dtype column is replaced in place by integer codes; the fitted
# encoder for each column is kept in `label_encoders` so the codes can be
# mapped back to the original labels later.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}

for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Print the mapping for each categorical column.
# The printed number is the position of the label in `le.classes_`, which is
# the integer code the encoder assigned to that label.
print("Label Encoding Mappings:")
for column, le in label_encoders.items():
    print(f"\n{column}:")
    for index, label in enumerate(le.classes_):
        print(f"  {label} -> {index}")

# Check the first few rows of the DataFrame to confirm the conversion
print("\nConverted DataFrame (first 5 rows):")
print(df.head())


Label Encoding Mappings:

Attrition:
  No -> 0
  Yes -> 1

BusinessTravel:
  Non-Travel -> 0
  Travel_Frequently -> 1
  Travel_Rarely -> 2

Department:
  Defence -> 0
  Human Resources -> 1
  IT department -> 2
  Research & Development -> 3
  Sales -> 4
  Tech Development -> 5

EducationField:
  Human Resources -> 0
  Life Sciences -> 1
  Marketing -> 2
  Medical -> 3
  Other -> 4
  Technical Degree -> 5

Gender:
  Female -> 0
  Male -> 1

JobRole:
  Healthcare Representative -> 0
  Human Resources -> 1
  Laboratory Technician -> 2
  Network Engineer -> 3
  Operational Executive -> 4
  Project Manager -> 5
  Research Director -> 6
  Research Scientist -> 7
  Sales Executive -> 8
  Sales Representative -> 9
  Software Developer -> 10
  Team Lead -> 11

MaritalStatus:
  Divorced -> 0
  Married -> 1
  Single -> 2

Over18:
  Y -> 0

OverTime:
  No -> 0
  Yes -> 1

Industry:
  Aerospace -> 0
  Education -> 1
  Finance -> 2
  Fitness -> 3
  Food -> 4
  Healthcare -> 5
  Logistics -> 6
  Real

In [None]:
# Calculate correlation matrix
correlation_matrix = df.corr()

# Extract correlations with 'Attrition', sorted from most positive to most negative
attrition_correlation = correlation_matrix['Attrition'].sort_values(ascending=False)

# Rank by absolute value so strong negative correlations are not pushed to the
# bottom, then keep 11 entries: 'Attrition' itself (correlation 1.0, always
# first) followed by the 10 most strongly correlated features.
strongest = attrition_correlation.abs().sort_values(ascending=False).head(11).index

# Print the top 10 features correlated with Attrition (signed values are shown)
print("Top 10 Features Correlated with Attrition:")
print(attrition_correlation.loc[strongest])

Top 10 Features Correlated with Attrition:
Attrition                   1.000000
NumCompaniesWorked          0.040966
DistanceFromHome            0.034281
MaritalStatus               0.016490
EducationField              0.013351
MonthlyRate                 0.013064
BusinessTravel              0.009530
EnvironmentSatisfaction     0.009461
HourlyRate                  0.009175
MonthlyIncome               0.006745
PercentSalaryHike           0.005985
EmployeeNumber              0.004620
JobRole                     0.004580
YearsSinceLastPromotion     0.003249
Gender                      0.002896
OverTime                    0.001945
RelationshipSatisfaction    0.000078
WorkLifeBalance            -0.000427
Department                 -0.000923
Stage                      -0.013161
TotalWorkingYears          -0.016837
StockOptionLevel           -0.017731
Age                        -0.018700
TrainingTimesLastYear      -0.020220
DailyRate                  -0.025791
YearsAtCompany             -0.03

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score

# Load the dataset
# (Disabled: this cell reuses the `df` already in the kernel. Several of the
# features below are categorical, so it presumably relies on the label-encoded
# `df` from the encoding cell — TODO confirm.)
# file_path = '/content/drive/MyDrive/ADT.csv'  # Update this with your file path
# df = pd.read_csv(file_path)

# Define the features and the target variable
features = ['Age', 'EducationField', 'JobRole', 'Department',
            'Industry', 'Stage', 'Funds_Raised(m)', 'JobSatisfaction', 'JobInvolvement',
            'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager',
            'RelationshipSatisfaction', 'PerformanceRating']
target = 'Attrition'

# Split the data into training and testing sets (70% / 30%, fixed seed).
# NOTE(review): the split is not stratified on the target — confirm that is
# acceptable given the class balance.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions: hard labels, plus the probability in column 1 of
# predict_proba (presumably the Attrition == 1 class — verify against model.classes_)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model: accuracy and precision use the hard labels, ROC-AUC uses the probabilities
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)

# Print the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print(f"Precision: {precision:.4f}")


Model Evaluation Metrics:
Accuracy: 0.8378
ROC-AUC Score: 0.5923
Precision: 0.8406


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score

# Load the dataset
# (Disabled: this cell reuses the `df` already in the kernel.)
# file_path = '/content/drive/MyDrive/ADT.csv'  # Update this with your file path
# df = pd.read_csv(file_path)

# Define the features and the target variable
# (same feature list and target as the single Random Forest cell)
features = ['Age', 'EducationField', 'JobRole', 'Department',
            'Industry', 'Stage', 'Funds_Raised(m)', 'JobSatisfaction', 'JobInvolvement',
            'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager',
            'RelationshipSatisfaction', 'PerformanceRating']
target = 'Attrition'

# Split the data into training and testing sets (70% / 30%, fixed seed, so
# every model below is fitted and scored on the same rows)
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit one classifier and score it on the held-out split
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on the test split.

    The model must provide `fit`, `predict` and `predict_proba`.

    Returns a tuple `(accuracy, roc_auc, precision)`: accuracy and precision
    are computed from the predicted labels, ROC-AUC from column 1 of
    `predict_proba`.
    """
    model.fit(X_train, y_train)
    predicted_labels = model.predict(X_test)
    positive_scores = model.predict_proba(X_test)[:, 1]

    return (
        accuracy_score(y_test, predicted_labels),
        roc_auc_score(y_test, positive_scores),
        precision_score(y_test, predicted_labels),
    )

# Initialize models, keyed by the display name used in the printed report.
# SVC is created with probability=True because evaluate_model calls predict_proba.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVC': SVC(probability=True, random_state=42)
}

# Evaluate each model on the same train/test split and collect its metrics by name
results = {}
for name, model in models.items():
    accuracy, roc_auc, precision = evaluate_model(model, X_train, X_test, y_train, y_test)
    results[name] = {'Accuracy': accuracy, 'ROC-AUC Score': roc_auc, 'Precision': precision}

# Print the evaluation metrics for each model (four decimals, blank line between models)
for name, metrics in results.items():
    print(f"{name} Model Evaluation Metrics:")
    print(f"Accuracy: {metrics['Accuracy']:.4f}")
    print(f"ROC-AUC Score: {metrics['ROC-AUC Score']:.4f}")
    print(f"Precision: {metrics['Precision']:.4f}\n")


[LightGBM] [Info] Number of positive: 1792, number of negative: 308
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 419
[LightGBM] [Info] Number of data points in the train set: 2100, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.853333 -> initscore=1.760988
[LightGBM] [Info] Start training from score 1.760988
Random Forest Model Evaluation Metrics:
Accuracy: 0.8378
ROC-AUC Score: 0.5923
Precision: 0.8406

Gradient Boosting Model Evaluation Metrics:
Accuracy: 0.8356
ROC-AUC Score: 0.5465
Precision: 0.8402

XGBoost Model Evaluation Metrics:
Accuracy: 0.8211
ROC-AUC Score: 0.5386
Precision: 0.8417

LightGBM Model Evaluation Metrics:
Accuracy: 0.8256
ROC-AUC Score: 0.5427
Precision: 0.8417

Logistic Regression Model Evaluation Metrics:
Accuracy: 0.8422
R