In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] =(9, 9)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import seaborn as sns
from matplotlib import rcParams
import warnings
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
warnings.filterwarnings("ignore")

In [None]:
# Read the dataset from the CSV file into the notebook-wide frame `df`.
df = pd.read_csv('eng.csv')
# Last expression of the cell: rendered as (n_rows, n_columns).
df.shape

In [None]:
# --- Structural overview -------------------------------------------------
# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are displayed explicitly instead of being discarded.
display(df.describe())
df.info()
display(df.isnull().any())

# --- Univariate feature ranking (chi-squared) ----------------------------
# X / y are derived here so that this cell also runs on a fresh kernel; they
# were previously referenced before the cell that defines them.
X = df.drop("F", axis=1)
y = df["F"]

# Only the per-feature scores are read below, so k merely has to be valid:
# cap it at the number of available features.
# NOTE(review): chi2 requires non-negative feature values -- confirm for this dataset.
bestfeatures = SelectKBest(score_func=chi2, k=min(10, X.shape[1]))
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
print(featureScores.nlargest(10, 'Score'))  # print 10 best features

# --- Correlation heatmap -------------------------------------------------
# The correlation matrix is computed once and reused for the mask and the plot.
corr = df.corr()
mask = np.triu(np.ones_like(corr))  # hide the upper triangle (incl. diagonal)

f, ax = plt.subplots(figsize=(16, 8))
cmap = sns.diverging_palette(230, 20, as_cmap=True, center='dark')
sns.heatmap(corr, annot=True, fmt='.1f', mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation Matrix')

# Widen the y-limits by half a cell on each side.
# NOTE(review): this looks like the workaround for heatmap rows being clipped
# in one matplotlib release; presumably unnecessary elsewhere -- confirm.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)

# --- Raw line plot of every column ---------------------------------------
df.plot(figsize=(16, 8))

In [None]:
# Target is column "F"; every remaining column is a predictor.
y = df["F"]
X = df.drop(columns="F")

In [None]:
# Standardise the full feature matrix: learn the statistics, then apply them.
# NOTE(review): X_scaled is not referenced by any later cell visible here --
# the train/test split is taken from the unscaled X. Confirm which is intended.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2014,
)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

In [None]:
from time import time

In [None]:
# Base learners of the HRFLM ensemble, collected as (name, estimator) pairs in
# the form VotingClassifier expects.
HRFLM_estimators = []

# Defining 1 Logistic Regression Model
model11 = LogisticRegression(random_state=2030, C=10, max_iter=5000)
HRFLM_estimators.append(('logistic1', model11))


# Defining 3 Random Forest Models
# Each forest gets its own seed. With identical seeds and hyper-parameters the
# three forests are exact clones: they add training cost but no diversity, and
# the vote degenerates to "one forest counted three times".
model21 = RandomForestClassifier(random_state=2030)
HRFLM_estimators.append(('RF1', model21))

model22 = RandomForestClassifier(random_state=2031)
HRFLM_estimators.append(('RF2', model22))

model23 = RandomForestClassifier(random_state=2032)
HRFLM_estimators.append(('RF3', model23))


In [None]:
# Soft-voting ensemble built from the HRFLM base learners.
HRFLM_ensemble = VotingClassifier(estimators=HRFLM_estimators, voting='soft')

# Fresh timing record; the fit duration is stored as a time() difference.
results = dict()
start = time()
HRFLM_ensemble.fit(X_train, y_train)
end = time()
results['training_time'] = end - start


In [None]:
# Time the prediction on the held-out set; `results` keeps raw time() deltas
# (seconds), matching how 'training_time' was recorded.
start = time()
HRFLM_Prediction = HRFLM_ensemble.predict(X_test)
end = time()
results['testing_time'] = end - start

# The report header announces milliseconds, so convert the stored seconds when
# printing (previously seconds were printed under an "ms" heading).
MS_PER_SECOND = 1000
print("MODELLING TIMES(ms) OF HRFLM ARE:")
print("********************************************")
print("Training time: "+"{:.2f}".format(results['training_time'] * MS_PER_SECOND))
print("Testing time: "+"{:.2f}".format(results['testing_time'] * MS_PER_SECOND))
print("********************************************\n")


In [None]:
# Headline classification metrics of the HRFLM ensemble on the test split.

## Accuracy (reported as a percentage)
HRFLM_Accuracy = accuracy_score(y_test, HRFLM_Prediction)
print(f"The accuracy score for HRFLM in percentage is: {HRFLM_Accuracy * 100:.2f}")

## Precision
HRFLM_Precision = precision_score(y_test, HRFLM_Prediction)
print(f"The precision score for HRFLM is: {HRFLM_Precision:.2f}")

## Recall
HRFLM_Recall = recall_score(y_test, HRFLM_Prediction)
print(f"The recall score for HRFLM is as follows: {HRFLM_Recall:.2f}")

## F1 Score
HRFLM_F1Score = f1_score(y_test, HRFLM_Prediction)
print(f"The F1 Score for HRFLM is: {HRFLM_F1Score:.2f}")

## Confusion Matrix
HRFLM_Confusion_Matrix = confusion_matrix(y_test, HRFLM_Prediction)
print("Confusion_Matrix: \n\n", HRFLM_Confusion_Matrix, "\n")

## Classification Report (display names for the two classes)
target_names = ['class 0', 'class 1']
print(
    classification_report(
        y_test,
        HRFLM_Prediction,
        zero_division=1,
        target_names=target_names,
    )
)


In [None]:
# sklearn's confusion_matrix has true labels on rows and predictions on
# columns, i.e. [[TN, FP], [FN, TP]] for the label order (0, 1).
# The variable names now match the quantities they hold (they were swapped;
# the printed labels were already correct and are unchanged).
tn, fp = HRFLM_Confusion_Matrix[0]
fn, tp = HRFLM_Confusion_Matrix[1]

specificity = tn / (tn + fp)   # true-negative rate
print("Specificity is: {}".format(specificity))
sensitivity = tp / (fn + tp)   # true-positive rate (recall)
print("Sensitivity is: {}".format(sensitivity))

In [None]:
from sklearn import metrics

# Predicted probability of the positive class (second predict_proba column).
y_pred_proba = HRFLM_ensemble.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw the curve exactly once. It used to be plotted a second time without a
# label, so the visible line no longer matched the colour shown in the legend.
plt.plot(fpr, tpr, label="HRFLM, auc="+str(auc))
# Chance-level reference; dashed so it cannot be mistaken for the ROC curve.
plt.plot([0, 1], [0, 1], '--', color='blue')  # diagonal line
plt.legend(loc=8)
plt.title('HRFLM ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

In [None]:
# Candidate values to try when searching for the best hyper-parameter setting.
# NOTE(review): the keys look like gradient-boosting (XGBoost-style) parameters,
# but no cell visible here consumes `param_grid` -- confirm where it is used.
param_grid = {
    "max_depth": [2,3,4,5,10],
    "learning_rate": [0.1, 0.01, 0.05, 1.0, 3],
    "gamma": [0,0.25,1.0, 1.5, 2],
    "reg_lambda": [0, 2.0, 1.0, 10.0,100],
    "scale_pos_weight": [1,3,5,7,10]
}

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space.
#   * None (the object, not the string "None") means unlimited depth.
#   * "sqrt" is what "auto" meant for forest classifiers; recent scikit-learn
#     releases reject "auto".
rf_search_space = {
    "max_depth": [None, 10, 30, 50, 75, 100],
    "max_features": ["sqrt", 0.3, 0.6],
    "min_samples_leaf": [1, 3, 5, 7],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [30, 50, 100, 200],
    "random_state": [42],
}

# A VotingClassifier has no `max_depth` etc. of its own: parameters of its
# members must be addressed as "<estimator name>__<parameter>". The grid is
# applied to the member named 'RF1'; searching all three forests jointly would
# cube the number of combinations.
# NOTE(review): this is still 1152 candidates x 7 folds, each fitting the whole
# ensemble -- expect a long run; shrink the grid if that is not acceptable.
params = {"RF1__" + name: values for name, values in rf_search_space.items()}

# Search on a separate, unfitted ensemble so the already fitted
# `HRFLM_ensemble` stays usable by the evaluation cells.
HRFLM_search_base = VotingClassifier(HRFLM_estimators, voting='soft')
HRFLM_ensemble_grid = GridSearchCV(HRFLM_search_base, params, scoring='accuracy', cv=7, n_jobs=-1)
HRFLM_ensemble_grid.fit(X_train, y_train)
## Output
print("Best parameters:  {}:".format(HRFLM_ensemble_grid.best_params_))
print("Best score: {}".format(HRFLM_ensemble_grid.best_score_))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Stand-alone random-forest baseline. It is seeded so that re-running the
# notebook reproduces the same forest and therefore the same metrics.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Test-set label predictions of the baseline.
y_pred = clf.predict(X_test)


In [None]:
# Mean accuracy of the random forest on the held-out split.
accuracy_score(y_test, clf.predict(X_test))


In [None]:
# Test-set label predictions of the random forest (repeats the call made at
# the end of the cell that fits `clf`).
y_pred = clf.predict(X_test)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix of the current predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Print the raw confusion-matrix array held in `cm`.
print(cm)

In [None]:
# `cm` comes from sklearn's confusion_matrix: true labels on rows, predictions
# on columns, i.e. [[TN, FP], [FN, TP]] for the label order (0, 1).
# The variable names now match the quantities they hold (they were swapped;
# the printed labels were already correct and are unchanged).
tn, fp = cm[0]
fn, tp = cm[1]

specificity = tn / (tn + fp)   # true-negative rate
print("Specificity is: {}".format(specificity))
sensitivity = tp / (fn + tp)   # true-positive rate (recall)
print("Sensitivity is: {}".format(sensitivity))

In [None]:
# Per-class precision / recall / F1 text report for the current predictions.
cr = classification_report(y_true=y_test, y_pred=y_pred)

In [None]:
print(cr)
from sklearn import metrics

# Predicted probability of the positive class (second predict_proba column).
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw the curve exactly once. It used to be plotted a second time without a
# label, so the visible line no longer matched the colour shown in the legend.
plt.plot(fpr, tpr, label="RF, auc="+str(auc))
plt.plot([0, 1], [0, 1], '-', color='red')  # diagonal line
plt.legend(loc=8)

# Titled like the HRFLM ROC figure so the plot stands on its own.
plt.title('RF ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space.
#   * None (the object, not the string "None") means unlimited depth.
#   * "sqrt" is what "auto" meant for forest classifiers; recent scikit-learn
#     releases reject "auto".
params = {
    "max_depth": [None, 10, 30, 50, 75, 100],
    "max_features": ["sqrt", 0.3, 0.6],
    "min_samples_leaf": [1, 3, 5, 7],
    "min_samples_split": [2, 4, 8, 12],
    "n_estimators": [30, 50, 100, 200],
    "random_state": [42],
}

# Search on a separate, unfitted forest. Rebinding `clf` here would replace
# the fitted baseline with an unfitted model and break every later cell that
# reads `clf` (e.g. its feature importances).
rf_search_base = RandomForestClassifier()
clf_grid = GridSearchCV(rf_search_base, params, scoring='accuracy', cv=7, n_jobs=-1)
clf_grid.fit(X_train, y_train)
## Output
print("Best parameters:  {}:".format(clf_grid.best_params_))
print("Best score: {}".format(clf_grid.best_score_))

In [None]:
# Class balance of the target column, computed from the data instead of being
# hard-coded, and displayed (the previous groupby result was discarded).
class_counts = df['F'].value_counts()
display(class_counts)

# Kept so the former hard-coded names still exist for any other cell.
# NOTE(review): assumes label 1 means CKD and label 0 means not-CKD -- confirm.
ckd = int(class_counts.get(1, 0))
notckd = int(class_counts.get(0, 0))

# Legend entries and wedge sizes come from the same Series, so they cannot
# drift apart.
chart_labels = [str(label) for label in class_counts.index]
data = class_counts.tolist()

plt.pie(data, autopct='%1.0f%%', pctdistance=0.5, labeldistance=1.1)
plt.title('Distribution of F')
plt.legend(chart_labels, loc='best')  # closing parenthesis was missing

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

# Feature importance dataframe
imp_df = pd.DataFrame({'feature': X_train.columns.values,
                       'importance': clf.feature_importances_})
 
# Reorder by importance
ordered_df = imp_df.sort_values(by='importance')
imp_range=range(1,len(imp_df.index)+1)
 
## Barplot with confidence intervals
height = ordered_df['importance']
bars = ordered_df['feature']
y_pos = np.arange(len(bars))

# Create horizontal bars
plt.barh(y_pos, height)
 
# Create names on the y-axis
plt.yticks(y_pos, bars)

plt.xlabel("Mean reduction in tree impurity in random forest")

plt.tight_layout()
# Show graphic
plt.show()

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with hand-picked hyper-parameters.
xg = XGBClassifier(
    objective='binary:logistic',
    n_estimators=200,
    seed=22,
    learning_rate=0.4,
    gamma=2,
    reg_lambda=2,
    scale_pos_weight=3,
    max_depth=10,
)
xg.fit(X_train, y_train)

# Test-set label predictions; rebinds the notebook-wide `y_pred`.
y_pred = xg.predict(X_test)


In [None]:
# Mean accuracy of the XGBoost model on the held-out split.
accuracy_score(y_test, xg.predict(X_test))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the current predictions (rows: true, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# `cm` comes from sklearn's confusion_matrix: true labels on rows, predictions
# on columns, i.e. [[TN, FP], [FN, TP]] for the label order (0, 1).
# The variable names now match the quantities they hold (they were swapped;
# the printed labels were already correct and are unchanged).
tn, fp = cm[0]
fn, tp = cm[1]

specificity = tn / (tn + fp)   # true-negative rate
print("Specificity is: {}".format(specificity))
sensitivity = tp / (fn + tp)   # true-positive rate (recall)
print("Sensitivity is: {}".format(sensitivity))

In [None]:
from sklearn import metrics

# Predicted probability of the positive class (second predict_proba column).
y_pred_proba = xg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw the curve exactly once. It used to be plotted a second time without a
# label, so the visible line no longer matched the colour shown in the legend.
plt.plot(fpr, tpr, label="Xgb, auc="+str(auc))
# Chance-level reference; dashed so it cannot be mistaken for the ROC curve.
plt.plot([0, 1], [0, 1], '--', color='blue')  # diagonal line
plt.legend(loc=4)

# Titled like the HRFLM ROC figure so the plot stands on its own.
plt.title('Xgb ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

In [None]:
from time import time
from lightgbm import LGBMClassifier

# LightGBM classifier with library-default settings.
lgb = LGBMClassifier()

# Fresh timing record; the fit duration is stored as a time() difference.
results = dict()
start = time()
lgb.fit(X_train, y_train)
end = time()
results['training_time'] = end - start

In [None]:
# Time the LightGBM prediction on the held-out set, then report both durations.
start = time()
y_pred = lgb.predict(X_test)
end = time()
results['testing_time'] = end - start

print(f"Training time: {results['training_time']:.2f}")
print(f"Testing time: {results['testing_time']:.2f}")

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the LightGBM predictions (rows: true, columns: predicted).
cm_lgb = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm_lgb)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
ac_lgb = accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac_lgb)

In [None]:
# `cm_lgb` comes from sklearn's confusion_matrix: true labels on rows,
# predictions on columns, i.e. [[TN, FP], [FN, TP]] for the label order (0, 1).
# The variable names now match the quantities they hold (they were swapped;
# the printed labels were already correct and are unchanged).
tn, fp = cm_lgb[0]
fn, tp = cm_lgb[1]

specificity = tn / (tn + fp)   # true-negative rate
print("Specificity is: {}".format(specificity))
sensitivity = tp / (fn + tp)   # true-positive rate (recall)
print("Sensitivity is: {}".format(sensitivity))

In [None]:
from sklearn import metrics

# Predicted probability of the positive class (second predict_proba column).
y_pred_proba = lgb.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Draw the curve exactly once. It used to be plotted a second time without a
# label, so the visible line no longer matched the colour shown in the legend.
plt.plot(fpr, tpr, label="LGBM, auc="+str(auc))
# Chance-level reference; dashed so it cannot be mistaken for the ROC curve.
plt.plot([0, 1], [0, 1], '--', color='blue')  # diagonal line
plt.legend(loc=8)

# Titled like the HRFLM ROC figure so the plot stands on its own.
plt.title('LGBM ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')


In [None]:
# Parameter dictionary for a gradient-boosting model: learning rate, booster
# type, binary objective, two evaluation metrics and tree-size limits.
# NOTE(review): the keys look like LightGBM training parameters, but no code
# visible here consumes `gbm_params` -- confirm where it is used.
gbm_params = {'learning_rate':0.02, 'boosting_type':'gbdt',
              'objective':'binary', 
              'metric':['auc', 'binary_logloss'],
              'num_leaves':50,
              'max_depth':10}