In [1]:
# import the required packages
import pandas as pd  
import numpy as np  
import random
import statsmodels.api as sm
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
%matplotlib inline

## 1. HR Analytics

### 1.1 Load and Sample the data

In [2]:
# Load the HR dataset (relative path, resolved against the working directory).
df_1 = pd.read_csv("hr_data_new.csv")
# Keep an independent snapshot of the raw data: plain assignment would only
# create a second name for the very same DataFrame object.
df_backup1 = df_1.copy()

FileNotFoundError: [Errno 2] No such file or directory: 'hr_data_new.csv'

In [None]:
# Show the loaded frame (the bare last expression is rendered as cell output).
df_1

In [None]:
# Remove employee_id so it does not end up among the features derived from df_1.
df_1 = df_1.drop(columns=['employee_id'])

In [None]:
# Descriptive statistics of df_1 via DataFrame.describe().
df_1.describe()

### 1.2 Build the Model(s)

### LogisticRegression

In [None]:
# Target: the is_promoted column. Features: every other column left in df_1.
y1 = df_1["is_promoted"]
X1 = df_1.drop(columns=['is_promoted'])

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=1,
)

In [None]:
# Logistic regression with the lbfgs solver and an iteration cap of 10000,
# fitted on the training split.
lg = LogisticRegression(max_iter=10000, solver='lbfgs')

lg.fit(X1_train, y1_train)

In [None]:
# Print the fitted coefficients first, then the intercept.
for fitted_param in (lg.coef_, lg.intercept_):
    print(fitted_param)

In [None]:
# Mean accuracy of the logistic model on the training split.
lg_train_accuracy = lg.score(X1_train, y1_train)
print(lg_train_accuracy, '(Train Accuracy)')

In [None]:
# Mean accuracy of the logistic model on the held-out test split.
lg_test_accuracy = lg.score(X1_test, y1_test)
print(lg_test_accuracy, '(Test Accuracy)')

In [None]:
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly. Without it the model is forced through the origin and is
# not comparable with the scikit-learn logistic regression, which estimates
# an intercept by default.
lg2 = sm.Logit(y1_train, sm.add_constant(X1_train)).fit()

In [None]:
# Render the statsmodels summary table for the fitted Logit model.
lg2.summary()

### DecisionTreeClassifier

In [None]:
# Tune a decision tree classifier with a 10-fold cross-validated grid search.
# The estimator is seeded so repeated runs report the same best parameters,
# matching the seeded final tree fitted from these results. max_depth is not
# set here because the grid supplies it for every candidate anyway.
dt = tree.DecisionTreeClassifier(random_state=1)

param_grid = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16],
    "max_depth": [2, 3, 4, 5, 6, 7],
}

# scoring='accuracy' selects the candidate with the best mean CV accuracy;
# n_jobs=-1 runs the candidates on all available cores.
gs = GridSearchCV(dt, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X1_train, y1_train)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Fit a single tree with fixed hyper-parameters and a fixed seed.
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=7,
    min_samples_leaf=1,
    min_samples_split=12,
    random_state=1,
)
decision_tree.fit(X1_train, y1_train)

In [None]:
# Mean accuracy of the fitted decision tree on the training split.
train_acc = decision_tree.score(X1_train, y1_train)
print('the training accuracy is: ', train_acc)

In [None]:
# Mean accuracy of the fitted decision tree on the held-out test split.
test_acc = decision_tree.score(X1_test, y1_test)
print('the testing accuracy is: ', test_acc)

### ANN

In [None]:
# Neural-network classifier: one hidden layer of 10 logistic units, trained
# with SGD for at most 2000 iterations; seeded for repeatable weights.
mlp = MLPClassifier(
    hidden_layer_sizes=(10,),
    activation='logistic',
    solver='sgd',
    max_iter=2000,
    random_state=2,
)
mlp.fit(X1_train, y1_train)

In [None]:
# Mean accuracy of the MLP classifier on the training split.
mlp_train_accuracy = mlp.score(X1_train, y1_train)
print(mlp_train_accuracy, '(Train Accuracy)')

In [None]:
# Mean accuracy of the MLP classifier on the held-out test split.
mlp_test_accuracy = mlp.score(X1_test, y1_test)
print(mlp_test_accuracy, '(Test Accuracy)')

### RandomForest

### XGBoost

In [None]:
# XGBoost classifier with library-default hyper-parameters.
xgbc = xgb.XGBClassifier()

In [None]:
# Train the XGBoost classifier on the HR training split.
xgbc.fit(X1_train, y1_train)

In [None]:
# Predictions for the HR test split (y_predict is reassigned later in the Airbnb section).
y_predict = xgbc.predict(X1_test)

In [None]:
# accuracy_score is never imported by name in this notebook, so the bare call
# raised NameError; reach it through the already-imported sklearn.metrics module.
print(metrics.accuracy_score(y1_test, y_predict))

### SVM.SVC

### 1.3 Evaluate and Improve the Model(s)

In [None]:
# Grid-search the solver and iteration cap with 10-fold cross-validation.
# Parallelism is left to GridSearchCV (n_jobs=-1 below): n_jobs on the
# estimator itself has no effect for the liblinear solver and only makes
# scikit-learn emit a warning for those candidates.
lg = LogisticRegression()
param_grid = {
    "solver": ["newton-cg", "lbfgs", "liblinear"],
    "max_iter": [1000, 3000, 5000],
}
gs = GridSearchCV(lg, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1)
gs.fit(X1_train, y1_train)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Refit with the liblinear solver and an iteration cap of 1000. n_jobs is not
# passed: scikit-learn ignores it for liblinear and warns when it is set.
lg = LogisticRegression(solver='liblinear', max_iter=1000)
lg.fit(X1_train, y1_train)

In [None]:
# Print the refitted model's coefficients first, then its intercept.
for fitted_param in (lg.coef_, lg.intercept_):
    print(fitted_param)

In [None]:
# Mean accuracy of the refitted logistic model on the training split.
lg_train_accuracy = lg.score(X1_train, y1_train)
print(lg_train_accuracy, '(Train Accuracy)')

In [None]:
# Mean accuracy of the refitted logistic model on the held-out test split.
lg_test_accuracy = lg.score(X1_test, y1_test)
print(lg_test_accuracy, '(Test Accuracy)')

In [None]:
# statsmodels does not add an intercept on its own, so a constant column is
# added explicitly; otherwise the Logit model is forced through the origin
# while the scikit-learn model in this section estimates an intercept.
lg2 = sm.Logit(y1_train, sm.add_constant(X1_train)).fit()

In [None]:
# Render the statsmodels summary table for the fitted Logit model.
lg2.summary()

## 2. Airbnb

### 2.1 Load and Sample the data

In [None]:
# Load the Airbnb listings (relative path, resolved against the working directory).
df_2 = pd.read_csv("listings_new.csv")
# Keep an independent snapshot of the raw listings under its own name, so the
# HR backup held in df_backup1 is not overwritten. .copy() is used because a
# plain assignment would only alias df_2.
df_backup2 = df_2.copy()

In [None]:
# Show the loaded frame (the bare last expression is rendered as cell output).
df_2

In [None]:
# Remove the listed columns so they are not part of the features derived from df_2.
df_2 = df_2.drop(columns=['id', 'name', 'host_id', 'host_name', 'last_review'])

In [None]:
# Descriptive statistics of df_2 via DataFrame.describe().
df_2.describe()

### 2.2 Build the Model(s)

### LinearRegression

In [None]:
# Target: the price column. Features: every other column left in df_2.
y2 = df_2["price"]
X2 = df_2.drop(columns=['price'])

In [None]:
# Show the feature frame (the bare last expression is rendered as cell output).
X2

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Ordinary linear regression with default settings, fitted on the training split.
lm = LinearRegression()
lm.fit(X2_train, y2_train)

In [None]:
# Print the fitted coefficients first, then the intercept.
for fitted_param in (lm.coef_, lm.intercept_):
    print(fitted_param)

In [None]:
# Predictions on the training rows, needed for the training RMSE.
y_fitted = lm.predict(X2_train)

# Calculate the RMSE and R^2 for the training set, then report both.
lm_train_rmse = sqrt(mean_squared_error(y2_train, y_fitted))  # RMSE: the lower the better
lm_train_r2 = lm.score(X2_train, y2_train)  # R squared value: the higher the better
print(lm_train_rmse, '(Train RMSE)')
print(lm_train_r2, '(Train R^2 Value)')

In [None]:
# statsmodels OLS has no intercept unless a constant column is supplied, so
# one is added explicitly. Without it the regression is forced through the
# origin and does not correspond to the scikit-learn LinearRegression model,
# which fits an intercept by default.
lm2 = sm.OLS(y2_train, sm.add_constant(X2_train)).fit()
# Summary statistics from the model
lm2.summary()

### DecisionTreeRegressor

In [None]:
# Shallow (depth-2) regression tree with a fixed seed, fitted on the training split.
tree_reg = tree.DecisionTreeRegressor(max_depth=2, random_state=2)
tree_reg.fit(X2_train, y2_train)

In [None]:
# Tune a regression tree with a 10-fold cross-validated grid search.
# The estimator is seeded so repeated runs report the same best parameters;
# max_depth is not set here because the grid supplies it for every candidate.
dt = tree.DecisionTreeRegressor(random_state=1)

# The criterion names "mae"/"mse" were removed in scikit-learn 1.2; the
# current spellings (available since 1.0) are used instead.
param_grid = {
    "criterion": ["absolute_error", "squared_error"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16],
    "max_depth": [2, 3, 4, 5, 6, 7],
}

# Scores are negated MSE, so best_score_ is <= 0 and closer to zero is better.
gs = GridSearchCV(dt, param_grid=param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
gs.fit(X2_train, y2_train)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Fit a single regression tree with fixed hyper-parameters and a fixed seed.
# "squared_error" is the current name of the criterion formerly spelled "mse",
# which scikit-learn 1.2 no longer accepts.
decision_tree = tree.DecisionTreeRegressor(
    criterion="squared_error",
    max_depth=7,
    min_samples_leaf=10,
    min_samples_split=2,
    random_state=1,
)
decision_tree.fit(X2_train, y2_train)

In [None]:
train_mse = mean_squared_error(tree_reg.predict(X2_train), y2_train)
print('the training mean squared error is: ', train_mse)
test_mse = mean_squared_error(tree_reg.predict(X2_test), y2_test)
print('the testing mean squared error is: ',test_mse)

In [None]:
train_R2 = tree_reg.score(X2_train, y2_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg.score(X2_test, y2_test)
print('testing R^2 value is: ', test_R2)

### ANN

In [None]:
# Neural-network regressor: one hidden layer of 10 logistic units, trained
# with SGD for at most 2000 iterations; seeded for repeatable weights.
mlpR = MLPRegressor(
    hidden_layer_sizes=(10,),
    activation='logistic',
    solver='sgd',
    max_iter=2000,
    random_state=2,
)
mlpR.fit(X2_train, y2_train)

In [None]:
# For a regressor, score() returns R^2 rather than accuracy, so the label
# says so (same wording as the linear-regression report).
print(mlpR.score(X2_train, y2_train), '(Train R^2 Value)')

In [None]:
# For a regressor, score() returns R^2 rather than accuracy, so the label says so.
print(mlpR.score(X2_test, y2_test), '(Test R^2 Value)')

### XGBoost

In [None]:
# XGBoost regressor with library-default hyper-parameters.
xgbr = xgb.XGBRegressor()

In [None]:
# Train the XGBoost regressor on the Airbnb training split.
xgbr.fit(X2_train, y2_train)

In [None]:
# Predicted prices for the test split (rebinds y_predict from the HR section).
y_predict = xgbr.predict(X2_test)

In [None]:
# accuracy_score is a classification metric (and was never imported here), so
# it cannot be used on continuous price predictions. Report the regression
# metrics used elsewhere in this section instead: RMSE and R^2 on the test split.
print(sqrt(mean_squared_error(y2_test, y_predict)), '(Test RMSE)')
print(xgbr.score(X2_test, y2_test), '(Test R^2 Value)')

### 2.3 Evaluate and Improve the Model(s)