In [None]:
# Mount Google Drive at /content/gdrive; the training CSV is read from this mount later on.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [85]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# so library deprecation notices will not be shown in later cells.
warnings.filterwarnings(action='ignore')


In [None]:
# Read the training set from the mounted Drive folder into a DataFrame.
train_csv_path = '/content/gdrive/MyDrive/AI/Ai_Assignment_01/train.csv'
data = pd.read_csv(train_csv_path)

In [None]:
# Preview the raw frame; the rendered output is truncated to the first and last rows.
data

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,30/09/2008,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,30/11/2008,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,10/03/2008,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,03/11/2008,Male,Service,Yes,1,1.0,2.6,0.20
4,fffe31003900340031003600,24/07/2008,Female,Service,No,3,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,30/12/2008,Female,Service,No,1,3.0,,0.41
22746,fffe33003000350031003800,19/01/2008,Female,Product,Yes,3,6.0,6.7,0.59
22747,fffe390032003000,05/11/2008,Male,Service,Yes,3,7.0,,0.72
22748,fffe33003300320036003900,10/01/2008,Female,Service,No,2,5.0,5.9,0.52


In [None]:
# Column dtypes and non-null counts; the three float columns (including the
# 'Burn Rate' target) have fewer non-null entries than the frame has rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  int64  
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 1.6+ MB


# PREPROCESSING

In [None]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Clean the raw burnout data and return scaled train/test splits.

    The input frame is not modified; all work happens on a copy. Steps:

    1. Drop the ``Employee ID`` identifier column.
    2. Drop rows whose ``Burn Rate`` target is missing.
    3. Replace ``Date of Joining`` with numeric ``Join Month`` / ``Join Day``.
    4. Encode the three two-valued text columns as 0/1.
    5. Split into train and test sets.
    6. Fill missing ``Resource Allocation`` / ``Mental Fatigue Score`` values
       with the *training-set* column means.
    7. Standardise all features with a scaler fitted on the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``Employee ID``, ``Date of Joining``, ``Gender``,
        ``Company Type``, ``WFH Setup Available``, ``Designation``,
        ``Resource Allocation``, ``Mental Fatigue Score`` and ``Burn Rate``.
    train_size : float, default 0.7
        Fraction of the rows placed in the training split.
    random_state : int, default 1
        Seed for the shuffled split, so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised feature matrices (original row index and column names kept).
    y_train, y_test : pandas.Series
        The matching ``Burn Rate`` targets.
    """
    df = df.copy()

    # The employee identifier is unique per row and carries no predictive signal.
    df = df.drop('Employee ID', axis=1)

    # Rows without a target cannot be used to train or score a supervised
    # model; drop them instead of predicting against an imputed target.
    df = df.dropna(subset=['Burn Rate']).reset_index(drop=True)

    # The dates are written day-first (e.g. 30/09/2008). Without dayfirst=True
    # an ambiguous value such as 10/03/2008 can be read as 3 October.
    joining_date = pd.to_datetime(df['Date of Joining'], dayfirst=True)
    df['Join Month'] = joining_date.dt.month
    df['Join Day'] = joining_date.dt.day
    df = df.drop('Date of Joining', axis=1)

    # Binary encoding. Series.map yields a numeric column directly; a value
    # that is not in the mapping becomes NaN instead of staying a string.
    binary_encodings = {
        'Gender': {'Female': 0, 'Male': 1},
        'Company Type': {'Product': 0, 'Service': 1},
        'WFH Setup Available': {'No': 0, 'Yes': 1},
    }
    for column, mapping in binary_encodings.items():
        df[column] = df[column].map(mapping)

    # Split df into X and y
    y = df['Burn Rate']
    X = df.drop('Burn Rate', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Impute after the split, using training-set means only, so that no
    # statistic computed from the test rows leaks into training.
    impute_columns = ['Resource Allocation', 'Mental Fatigue Score']
    train_means = X_train[impute_columns].mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale X: fit on the training split, apply the same transform to both.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Clean the raw data and unpack the scaled train/test features and targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Joining year of every employee, read from the raw frame ('Date of Joining'
# only exists there); dates are written day-first, hence dayfirst=True.
pd.to_datetime(data['Date of Joining'], dayfirst=True).dt.year

In [None]:
# Inspect the training features returned by preprocess_inputs.
X_train

In [None]:
# Inspect the training targets returned by preprocess_inputs.
y_train

In [None]:
# Number of distinct values per raw column (a missing value counts as one
# value); this shows which text columns are two-valued and can be 0/1 encoded.
{column: data[column].nunique(dropna=False) for column in data.columns}

In [None]:
# The raw data has missing values; in this boolean mask True marks a missing cell.
data.isna()

In [None]:
# Sum the mask over the rows to get the number of missing values per column.
data.isna().sum()

# TRAINING

In [None]:
# One seed shared by every estimator that draws random numbers, so that a
# fresh "Restart & Run All" reproduces the same fitted models and scores.
SEED = 1

# Candidate regressors, all with default hyper-parameters apart from the seed.
# The keys are left-padded by hand so the printed names line up in one column.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    # NOTE(review): the recorded R^2 for this model is ~0, presumably because
    # the default alpha=1.0 is too strong for this target; consider a smaller alpha.
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "                         Random Forest": RandomForestRegressor(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=SEED),
    "                               XGBoost": XGBRegressor(random_state=SEED),
    "                              LightGBM": LGBMRegressor(random_state=SEED),
}

# Fit every model on the scaled training split and report progress by name.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

In [None]:
# Linear Regression trained.
#  Linear Regression (L2 Regularization) trained.
#  Linear Regression (L1 Regularization) trained.
#                    K-Nearest Neighbors trained.
#                         Neural Network trained.
# Support Vector Machine (Linear Kernel) trained.
#    Support Vector Machine (RBF Kernel) trained.
#                          Decision Tree trained.
#                          Random Forest trained.
#                      Gradient Boosting trained.
#                                XGBoost trained.
#                               LightGBM trained.
#                               CatBoost trained.

# RESULTS

In [None]:
# Print each fitted model's R^2 on the held-out test split, to five decimals.
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(name + " R^2 Score: {:.5f}".format(r2))

In [None]:
#  Linear Regression R^2 Score: 0.87075
#  Linear Regression (L2 Regularization) R^2 Score: 0.87075
#  Linear Regression (L1 Regularization) R^2 Score: -0.00001
#                    K-Nearest Neighbors R^2 Score: 0.85605
#                         Neural Network R^2 Score: 0.87242
# Support Vector Machine (Linear Kernel) R^2 Score: 0.86897
#    Support Vector Machine (RBF Kernel) R^2 Score: 0.88430
#                          Decision Tree R^2 Score: 0.81606
#                          Random Forest R^2 Score: 0.89753
#                      Gradient Boosting R^2 Score: 0.90257
#                                XGBoost R^2 Score: 0.90310
#                               LightGBM R^2 Score: 0.90912
#                               CatBoost R^2 Score: 0.90842