In [3]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [26]:
train_data = pd.read_csv('Train_Dataset.csv')
test_data = pd.read_csv('Test_Dataset.csv')

0       6110001
1       6110002
2       6110003
3       6110004
4       6110005
         ...   
2625    6112626
2626    6112627
2627    6112628
2628    6112629
2629    6112630
Name: EmployeeID, Length: 2630, dtype: int64

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
test_employee_ids = test_data['EmployeeID']

In [36]:
# Drop EmployeeID from both train and test (not useful for model training)
train_data = train_data.drop(['EmployeeID','Attrition'], axis=1)
test_data = test_data.drop('EmployeeID', axis=1)
combined_data = pd.concat([train_data, test_data], axis=0)

In [38]:
combined_data

Unnamed: 0,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,35.0,Rarely,Analytics,5.0,CA,Male,69.0,1.0,1.0,Executive,1.0,1.0,1.0,1.0,20.0,7.0,2.0,,M,18932.0
1,32.0,Yes,Sales,5.0,Statistics,Female,62.0,4.0,3.0,Executive,2.0,0.0,8.0,0.0,20.0,4.0,1.0,,Single,18785.0
2,31.0,Rarely,Analytics,5.0,Statistics,F,45.0,5.0,3.0,Manager,2.0,1.0,3.0,0.0,26.0,12.0,1.0,3.0,Single,22091.0
3,34.0,Yes,Sales,10.0,Statistics,Female,32.0,3.0,2.0,Manager,4.0,1.0,1.0,0.0,23.0,5.0,1.0,3.0,Divorsed,20302.0
4,37.0,No,Analytics,27.0,Statistics,Female,49.0,3.0,4.0,Manager,4.0,1.0,8.0,0.0,21.0,12.0,1.0,9.0,Divorsed,21674.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,46.0,Rarely,Sales,12.0,Marketing Diploma,Male,76.0,3.0,5.0,Senior Manager,5.0,1.0,5.0,1.0,18.0,10.0,1.0,3.0,Married,26761.0
2626,29.0,Rarely,Analytics,22.0,CA,Male,80.0,4.0,4.0,Executive,5.0,0.0,2.0,0.0,18.0,7.0,4.0,4.0,Divorsed,19196.0
2627,44.0,Rarely,Analytics,8.0,CA,F,42.0,4.0,1.0,Senior Manager,1.0,1.0,3.0,0.0,19.0,24.0,5.0,17.0,Married,25248.0
2628,,Rarely,Analytics,11.0,Statistics,Female,,4.0,3.0,Executive,4.0,0.0,2.0,0.0,18.0,2.0,1.0,1.0,Single,17261.0


In [40]:
# Label Encoding for categorical variables
label_encoders = {}
for col in combined_data.columns:
    if combined_data[col].dtype == 'object':
        le = LabelEncoder()
        combined_data[col] = le.fit_transform(combined_data[col])
        label_encoders[col] = le

In [9]:
# Split back into train and test
X = combined_data.iloc[:train_data.shape[0], :]
X_test = combined_data.iloc[train_data.shape[0]:, :]

# Target variable
y = train_data['Attrition']

# Encode target variable if needed (Yes/No -> 1/0)
if y.dtype == 'object':
    y = LabelEncoder().fit_transform(y)


In [19]:
# Split training data into train and validation sets (optional but good practice)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
y_train

4678    0.0
5338    NaN
6138    NaN
4264    0.0
7219    NaN
       ... 
5226    NaN
5390    NaN
860     1.0
7603    NaN
7270    NaN
Name: Attrition, Length: 6248, dtype: float64

In [17]:
# Build the model
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [None]:

# Validate the model (optional check)
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {val_accuracy:.4f}")


In [None]:

# Predict on test data
test_preds = model.predict(X_test)

# Prepare the submission file
submission = pd.DataFrame({
    'EmployeeID': test_employee_ids,
    'Attrition': test_preds
})

# Convert Attrition back to original labels if needed
# For example, if you want 'Yes' and 'No' instead of 1 and 0:
submission['Attrition'] = submission['Attrition'].map({1: 'Yes', 0: 'No'})

# Save the submission file
submission.to_csv('Attrition_Prediction_Submission.csv', index=False)

print("Submission file 'Attrition_Prediction_Submission.csv' created successfully.")
