## Linear Regression

### Part 1 - Data Preprocessing

### Importing the dataset

In [2]:
import pandas as pd
import numpy as np
#variable and this is a function for uploading the dataset
dataset = pd.read_csv('Employee Attrition.csv') 

### Cleaning the Data


In [3]:
dataset.dtypes
#Cheking data types

Emp ID                   float64
satisfaction_level       float64
last_evaluation          float64
number_project           float64
average_montly_hours     float64
time_spend_company       float64
Work_accident            float64
promotion_last_5years    float64
dept                      object
salary                    object
dtype: object

In [4]:
#Dropping Rows with Null Values
dataset.dropna(inplace=True)

In [5]:
#Grouping the Average Monthly Hour Values into 6
dataset['AverageMontlyHours'] = pd.cut(dataset['average_montly_hours'], 
                                 bins=[0, 100, 150, 200, 250, 300, 350], 
                                labels=['0', '1', '2', '3', '4', '5'], 
                                 right=True)

In [6]:
dataset = dataset.drop(['average_montly_hours'],axis = 1)

In [7]:
from sklearn.preprocessing import LabelEncoder
cat_cols = dataset.select_dtypes(include = ['object', 'category'])

In [8]:
#Converting the Department and Salary Columns Categroical Values into Numerical Values
label_encoder = LabelEncoder()
categorical_cols = cat_cols
for col in categorical_cols:
    dataset[col] = label_encoder.fit_transform(dataset[col])

In [9]:
#Moving the Satisfaction Level Column to the Last
column_to_move = dataset.iloc[:, 1]
dataset = dataset.drop(dataset.columns[1], axis=1)
dataset['satisfaction_level'] = column_to_move

In [10]:

dataset.head()

Unnamed: 0,Emp ID,last_evaluation,number_project,time_spend_company,Work_accident,promotion_last_5years,dept,salary,AverageMontlyHours,satisfaction_level
0,1.0,0.53,2.0,3.0,0.0,0.0,7,1,2,0.38
1,2.0,0.86,5.0,6.0,0.0,0.0,7,2,4,0.8
2,3.0,0.88,7.0,4.0,0.0,0.0,7,2,4,0.11
3,4.0,0.87,5.0,5.0,0.0,0.0,7,1,3,0.72
4,5.0,0.52,2.0,3.0,0.0,0.0,7,1,2,0.37


In [11]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14999 entries, 0 to 15786
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Emp ID                 14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  float64
 3   time_spend_company     14999 non-null  float64
 4   Work_accident          14999 non-null  float64
 5   promotion_last_5years  14999 non-null  float64
 6   dept                   14999 non-null  int64  
 7   salary                 14999 non-null  int64  
 8   AverageMontlyHours     14999 non-null  int64  
 9   satisfaction_level     14999 non-null  float64
dtypes: float64(7), int64(3)
memory usage: 1.3 MB


### Getting the inputs and output

In [12]:
# [rows,columns]
X = dataset.iloc[:,1:-1].values
X

array([[0.53, 2.  , 3.  , ..., 7.  , 1.  , 2.  ],
       [0.86, 5.  , 6.  , ..., 7.  , 2.  , 4.  ],
       [0.88, 7.  , 4.  , ..., 7.  , 2.  , 4.  ],
       ...,
       [0.53, 2.  , 3.  , ..., 8.  , 1.  , 1.  ],
       [0.96, 6.  , 4.  , ..., 8.  , 1.  , 4.  ],
       [0.52, 2.  , 3.  , ..., 8.  , 1.  , 2.  ]])

In [13]:
y = dataset.iloc[:,-1].values
y

array([0.38, 0.8 , 0.11, ..., 0.37, 0.11, 0.37])

### Creating the Training Set and the Test Set

In [14]:
# scikitlearn is a library
# model_selection is a module
# train_test_split is a function
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2)

In [15]:
X_train

array([[0.9 , 3.  , 6.  , ..., 1.  , 2.  , 3.  ],
       [0.64, 4.  , 3.  , ..., 9.  , 1.  , 2.  ],
       [0.96, 3.  , 2.  , ..., 9.  , 1.  , 2.  ],
       ...,
       [0.55, 3.  , 2.  , ..., 5.  , 0.  , 3.  ],
       [0.89, 7.  , 5.  , ..., 3.  , 2.  , 5.  ],
       [0.77, 4.  , 3.  , ..., 4.  , 2.  , 1.  ]])

In [16]:
X_test

array([[0.86, 5.  , 5.  , ..., 7.  , 2.  , 4.  ],
       [0.62, 4.  , 3.  , ..., 9.  , 2.  , 4.  ],
       [0.96, 5.  , 3.  , ..., 8.  , 2.  , 1.  ],
       ...,
       [0.84, 5.  , 5.  , ..., 9.  , 2.  , 4.  ],
       [0.83, 6.  , 4.  , ..., 5.  , 2.  , 4.  ],
       [0.95, 3.  , 4.  , ..., 7.  , 2.  , 3.  ]])

In [17]:
y_train

array([0.37, 0.62, 0.83, ..., 0.64, 0.09, 0.75])

In [18]:
y_test

array([0.78, 0.9 , 0.98, ..., 0.72, 0.1 , 0.73])

## Part 2 - Building and training the model

### Building the model

In [19]:
# linear_model is the module
# `LinearRegression is a class` is defining that `LinearRegression` is a class within the `linear_model` module. It indicates that `LinearRegression` is a blueprint or template for creating objects that represent linear regression models.
# Class is a pre-coded blueprint of something we want to build from which objects are created.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

### Training the Model

In [20]:
# fit is a method inside LinearRegression class - they are like functions.
model.fit(X_train, y_train)

### Inference

In [21]:
y_pred = model.predict(X_test)
y_pred

array([0.5809823 , 0.58868436, 0.62424591, ..., 0.57519715, 0.55050816,
       0.69349334])

In [22]:
### Making the prediction of a single data point with:

#1.   last_evaluation = .85        
#2.   number_project = 5         
#3.   time_spend_company = 3     
#4.   Work_accident = 1          
#5.   promotion_last_5years = 0  
#6.   dept = 7                  
#7.   salary = 1                  
#8.   AverageMontlyHours = 2


In [23]:
model.predict([[.85, 5, 3, 1, 0, 7, 1, 2]])

array([0.63731746])

## Part 3: Evaluating the Model

### R-Squared

In [24]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
r2

0.0599575001918452

### Adjusted R-Squared

In [25]:
k = X_test.shape[1]
n = X_test.shape[0]
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)
adj_r2

0.057443177223451714

## A New Model for Comparison 

In [27]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor(max_depth=8,random_state=40)
tree.fit(X_train,y_train)

In [30]:
path = tree.cost_complexity_pruning_path(X_train,y_train)
alphas = path['ccp_alphas'].round(5)
print(alphas)

[0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05 1.000e-05
 1.000e-05 1.000e-05 1.000e-05 1.000e-05 2.000e-05 2.000e-05 2.000e-05
 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05
 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05 2.000e-05
 2.000

In [32]:
train_score,test_score = [],[]
for alpha in alphas:
    decisionTree = DecisionTreeRegressor(ccp_alpha = alpha,max_depth=8)
    decisionTree.fit(X_train,y_train)
    
    train_score.append(decisionTree.score(X_train,y_train))
    test_score.append(decisionTree.score(X_test,y_test))
    

In [35]:
max_test_score_alpha = alphas[np.argmax(test_score)]
dtree = DecisionTreeRegressor(ccp_alpha = max_test_score_alpha,
                                  max_depth=8,random_state = 40)
dtree.fit(X_train,y_train)

In [36]:
#R-squared Value
print(f"The R-squared value : {dtree.score(X_test,y_test):.2%}")


The R-squared value : 43.53%
