# Machine Learning Concepts and Principles
## Software Defect Detection

> Lazaros Panitsidis & Konstantinos Kravaritis<br />
> Msc Data Science <br />
> International Hellenic University <br />
> lpanitsidis@ihu.edu.gr & kkravaritis@ihu.edu.gr

## Contents
1. [Useful Python Libraries](#0)
1. [Data Content](#1)
1. [Feature Engineering](#2)
     1. [Data Preprocessing](#3)
     1. [Visualization & Analysis](#4)
1. [Feature Selection and Random Forest Classification](#5)
     1. [Feature Selection by Correlation](#6)
     1. [Univariate feature selection (SelectKbest)](#7)
     1. [Recursive Feature Elimination (RFE)](#8)
     1. [Recursive Feature Elimination with Cross-Validation (RFECV)](#9)
     1. [Feature importances with a forest of trees](#10)
     1. [XGBoost Feature Importances](#11)
     1. [Minimum Redundancy & Maximum Relevance](#12)
1. [Feature extraction with PCA](#11)
1. [Summary](#12)

<a id='0'></a>
## Useful Python Libraries

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualization library  
import scipy.stats as stats
import matplotlib.pyplot as plt
import time
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'
#import warnings library
import warnings
# ignore all warnings
warnings.filterwarnings('ignore')
import sys


# # Any results you write to the current directory are saved as output.
# from sklearn.feature_selection import SelectKBest, f_classif, chi2, RFE, RFECV , mutual_info_classif
# from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV , LeaveOneOut , KFold, StratifiedKFold, RepeatedStratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import f1_score, confusion_matrix, accuracy_score , make_scorer , classification_report
# from sklearn.decomposition import PCA
# from sklearn.pipeline import make_pipeline , Pipeline # https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
# from sklearn.preprocessing import StandardScaler , LabelEncoder
# from xgboost import XGBClassifier , plot_importance
# from sklearn.utils import resample
# import eli5
# from eli5.sklearn import PermutationImportance
# from scipy.stats import spearmanr,pearsonr
# from scipy.cluster import hierarchy
# from scipy.spatial.distance import squareform
# from sklearn.inspection import permutation_importance
# from collections import defaultdict

### Data preprocessing

In [33]:
data_location = "../../Assignment/"

### jm1

In [34]:
jm1 = pd.read_csv(data_location + "jm1.csv")
jm1

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.30,1.30,1.30,1.30,1.30,1.30,1.30,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1.0,1.0,1.0,1.0,1.0,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1,1,1,1,1,1,1,1,1,True
2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,23029.10,0.38,1279.39,51,10,8,1,17,36,112,86,13,True
3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,74202.67,1.45,4122.37,129,29,28,2,17,135,329,271,5,True
4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,10297.30,0.20,572.07,28,1,6,0,11,16,76,50,7,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10880,18.0,4.0,1.0,4.0,52.0,241.48,0.14,7.33,32.93,1770.86,0.08,98.38,13,0,2,0,10,15,30,22,7,False
10881,9.0,2.0,1.0,2.0,30.0,129.66,0.12,8.25,15.72,1069.68,0.04,59.43,5,0,2,0,12,8,19,11,3,False
10882,42.0,4.0,1.0,2.0,103.0,519.57,0.04,26.40,19.68,13716.72,0.17,762.04,29,1,10,0,18,15,59,44,7,False
10883,10.0,1.0,1.0,1.0,36.0,147.15,0.12,8.44,17.44,1241.57,0.05,68.98,6,0,2,0,9,8,21,15,1,False


In [6]:
jm1.isna().sum().sum()

0

In [7]:
jm1.duplicated().sum()

1973

### mc1

In [36]:
mc1 = pd.read_csv(data_location + "mc1.csv")
mc1

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,GLOBAL_DATA_COMPLEXITY,GLOBAL_DATA_DENSITY,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,c
0,0,1,0,0,0,0,1,1.00,0,1,1,1,1,0,0,0,1,1,7.74,1.50,17.41,0.00,5,0.67,0.97,11.61,1,0,0,2,0.50,2,3,2,3,2,0.00,0,False
1,0,1,0,0,0,0,1,1.00,0,1,1,1,1,0,0,0,0,0,7.74,1.50,17.41,0.00,5,0.67,0.97,11.61,1,0,0,2,1.00,2,3,2,3,1,0.00,0,False
2,0,1,1,0,0,0,1,1.00,0,1,1,2,1,0,0,0,0,0,9.51,3.00,85.59,0.01,9,0.33,4.75,28.53,1,0,0,3,0.50,3,6,3,6,2,0.00,0,False
3,8,17,11,32,3,12,13,0.16,6,10,0,43,4,0,48,0,10,0,49.82,25.34,31983.69,0.42,212,0.04,1776.87,1262.29,0,3,6,32,0.14,75,137,37,25,93,42.17,80,False
4,4,7,1,1,8,12,4,0.17,6,1,0,20,1,0,23,0,4,1,25.56,20.00,10223.25,0.17,113,0.05,567.96,511.16,0,3,6,18,0.11,52,61,13,10,38,28.13,24,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9461,0,1,0,0,0,0,1,0.33,0,1,1,1,1,0,3,0,0,0,9.51,3.00,85.59,0.01,9,0.33,4.75,28.53,1,0,0,2,0.20,3,6,3,6,5,0.00,3,False
9462,0,5,1,0,0,0,3,0.43,0,2,0,7,3,1,7,0,0,0,12.73,6.75,580.17,0.03,22,0.15,32.22,85.95,1,0,0,6,0.33,9,13,6,9,9,0.00,7,False
9463,1,5,3,0,0,8,3,0.23,4,3,1,15,1,0,13,0,2,0,17.24,7.50,969.51,0.04,31,0.13,53.86,129.27,0,2,4,14,0.19,12,19,8,10,16,0.00,13,False
9464,2,1,0,0,3,0,1,0.14,0,1,1,5,1,0,7,0,0,0,20.08,5.50,607.27,0.04,29,0.18,33.74,110.41,1,0,0,6,0.07,11,18,7,7,14,30.00,7,False


In [9]:
mc1.isna().sum().sum()

0

In [10]:
mc1.duplicated().sum()

7450

#### pc3

In [35]:
pc3 = pd.read_csv(data_location + "pc3.csv")
pc3

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,DESIGN_COMPLEXITY,DESIGN_DENSITY,EDGE_COUNT,ESSENTIAL_COMPLEXITY,ESSENTIAL_DENSITY,LOC_EXECUTABLE,PARAMETER_COUNT,HALSTEAD_CONTENT,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_ERROR_EST,HALSTEAD_LENGTH,HALSTEAD_LEVEL,HALSTEAD_PROG_TIME,HALSTEAD_VOLUME,MAINTENANCE_SEVERITY,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,c
0,2,1,0,0,0,0,1,0.10,0,0.00,1,1.00,1,1,0.00,10,0,27.90,7.78,1687.93,0.07,57,0.13,93.77,217.02,1.00,0,0,2,0.08,28,29,9,5,13,0.00,10,False
1,1,1,4,0,0,0,1,0.07,0,0.00,1,1.00,5,1,0.00,14,2,40.67,14.00,7972.25,0.19,107,0.07,442.90,569.45,1.00,0,0,6,0.06,52,55,26,14,16,0.00,14,False
2,27,19,1,4,13,26,11,0.26,12,2.17,2,0.18,34,1,0.00,38,0,33.74,23.20,18157.82,0.26,136,0.04,1008.77,782.66,0.09,7,13,25,0.13,58,78,30,24,83,30.91,42,False
3,2,17,2,0,0,24,9,0.47,8,3.00,4,0.44,32,6,0.63,19,2,26.33,31.74,26522.64,0.28,154,0.03,1473.48,835.64,0.67,8,14,25,0.41,73,81,23,20,22,0.00,19,False
4,6,1,1,0,2,0,1,0.11,0,0.00,1,1.00,2,1,0.00,9,0,42.25,4.43,830.35,0.06,42,0.23,46.13,187.30,1.00,0,0,3,0.06,19,23,15,7,18,18.18,9,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558,6,3,3,0,8,4,2,0.40,2,2.00,2,1.00,7,1,0.00,5,1,28.99,5.54,889.14,0.05,36,0.18,49.40,160.54,0.50,1,2,7,0.10,16,20,13,9,20,61.54,5,False
1559,3,5,3,0,0,6,3,0.33,2,3.00,3,1.00,10,1,0.00,9,0,17.03,14.00,3337.85,0.08,52,0.07,185.44,238.42,0.33,2,3,9,0.23,20,32,10,14,13,0.00,9,False
1560,0,1,0,0,0,0,1,1.00,0,0.00,1,1.00,1,1,0.00,0,0,0.00,0.00,0.00,0.00,2,0.00,0.00,2.00,1.00,0,0,2,1.00,0,2,0,2,1,0.00,0,False
1561,0,7,0,0,0,10,4,0.36,4,2.50,1,0.25,11,4,1.00,11,1,17.45,11.69,2385.64,0.07,42,0.09,132.54,204.04,1.00,3,5,9,0.33,19,23,13,16,12,0.00,11,False


In [12]:
pc3.isna().sum().sum()

0

In [13]:
pc3.duplicated().sum()

124

#### find optimal data types for faster computation

In [44]:
import sys

sys.path.append("..")
sys.path.append("../../Deliverable")

from functions import find_optimal_data_types

# Use find_optimal_data_types() function


In [45]:
test = find_optimal_data_types(mc1)

In [47]:
test.dtypes

LOC_BLANK                            uint8
BRANCH_COUNT                        uint16
CALL_PAIRS                           uint8
LOC_CODE_AND_COMMENT                 uint8
LOC_COMMENTS                         uint8
CONDITION_COUNT                     uint16
CYCLOMATIC_COMPLEXITY                uint8
CYCLOMATIC_DENSITY                 float16
DECISION_COUNT                      uint16
DESIGN_COMPLEXITY                    uint8
DESIGN_DENSITY                       uint8
EDGE_COUNT                          uint16
ESSENTIAL_COMPLEXITY                 uint8
ESSENTIAL_DENSITY                    uint8
LOC_EXECUTABLE                      uint16
PARAMETER_COUNT                      uint8
GLOBAL_DATA_COMPLEXITY               uint8
GLOBAL_DATA_DENSITY                  uint8
HALSTEAD_CONTENT                   float16
HALSTEAD_DIFFICULTY                float16
HALSTEAD_EFFORT                    float32
HALSTEAD_ERROR_EST                 float16
HALSTEAD_LENGTH                     uint16
HALSTEAD_LE

In [55]:
from ..functions.find_optimal_data_types import find_optimal_data_types

ImportError: attempted relative import with no known parent package