#Importing required libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans  
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest,mutual_info_classif,chi2
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import naive_bayes
from sklearn import model_selection
import imblearn
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import warnings 
warnings.filterwarnings('ignore')

In [None]:
#!pip install autoviz

#Data Loading 

In [None]:
# Load the raw learning file into a DataFrame using pandas' default CSV settings.
# NOTE(review): the absolute '/content/...' path looks environment-specific (Colab-style) — adjust or make configurable before running elsewhere.
Kdd_data=pd.read_csv('/content/cup98LRN.txt')

In [None]:
Kdd_data.head()  # preview the first rows to sanity-check the load

Unnamed: 0,ODATEDW,OSOURCE,TCODE,STATE,ZIP,MAILCODE,PVASTATE,DOB,NOEXCH,RECINHSE,...,TARGET_D,HPHONE_D,RFA_2R,RFA_2F,RFA_2A,MDMAUD_R,MDMAUD_F,MDMAUD_A,CLUSTER2,GEOCODE2
0,8901,GRI,0,IL,61081,,,3712,0,,...,0.0,0,L,4,E,X,X,X,39.0,C
1,9401,BOA,1,CA,91326,,,5202,0,,...,0.0,0,L,2,G,X,X,X,1.0,A
2,9001,AMH,1,NC,27017,,,0,0,,...,0.0,1,L,4,E,X,X,X,60.0,C
3,8701,BRY,0,CA,95953,,,2801,0,,...,0.0,1,L,4,E,X,X,X,41.0,C
4,8601,,0,FL,33176,,,2001,0,X,...,0.0,1,L,2,F,X,X,X,26.0,A


#Handling categorical data

In [None]:
# Collect the names of every column pandas parsed as dtype 'object' (text-valued columns);
# these are the columns that get label-encoded next.
categorical_col=Kdd_data.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Replace each text column with integer codes, one column at a time.
# The same LabelEncoder object is re-fitted per column, so after the loop it only
# remembers the classes of the last column processed.
label_encoder = preprocessing.LabelEncoder()
for column_name in categorical_col:
  # Cast to str first so missing entries become an ordinary string value and are
  # encoded as their own category instead of breaking the encoder.
  column_as_text = Kdd_data[column_name].astype(str)
  Kdd_data[column_name] = label_encoder.fit_transform(column_as_text)

In [None]:
# Count missing values per column.
# Columns that went through label encoding report 0 here because they were cast to str
# before encoding, so their missing entries are now an encoded category, not NaN.
Kdd_data.isnull().sum()

ODATEDW       0
OSOURCE       0
TCODE         0
STATE         0
ZIP           0
           ... 
MDMAUD_R      0
MDMAUD_F      0
MDMAUD_A      0
CLUSTER2    132
GEOCODE2      0
Length: 481, dtype: int64

#Handling Null values 

In [None]:
# Imputer that replaces every NaN with a constant. No fill_value is passed, so
# scikit-learn's documented default applies (0 for numeric data).
DataImpute = SimpleImputer(missing_values = np.nan,strategy ='constant')

In [None]:
# Fill the remaining NaNs. fit_transform returns the imputed values as an array
# (scikit-learn default output), so from this point on Kdd_data no longer carries
# the original column names.
Kdd_data = DataImpute.fit_transform(Kdd_data)

#Clustering-based analysis for deriving the target value

In [None]:
# Split the rows into two k-means clusters; the cluster index is later used as the target label.
# NOTE(review): cluster ids are arbitrary — nothing in this cell ties cluster 1 to "likely to donate".
# NOTE(review): the clustering is fitted on every column of the imputed data, which still includes
# the dataset's own TARGET_D column shown in the preview, and no feature scaling is applied in the
# visible cells, so large-valued columns will dominate the distance — confirm this is intended.
kmeans = KMeans(n_clusters=2, init='k-means++', random_state= 42)  # seeded for repeatable clusters
kmeans.fit(Kdd_data)  # fit on the full imputed matrix

KMeans(n_clusters=2, random_state=42)

In [None]:
labels=kmeans.labels_ # cluster index assigned by k-means to each row it was fitted on

In [None]:
labels  # show the per-row cluster assignments

array([0, 1, 0, ..., 1, 0, 1], dtype=int32)

#Splitting dataset into training and testing 

In [None]:
# Split into training and testing sets: 33% of the rows are held out for testing and
# the remaining 67% are used for training. The split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(Kdd_data,labels,test_size=0.33,random_state=42)

#Feature selection 

In [None]:
# Univariate selector that keeps the 300 features with the highest chi-squared score.
# NOTE(review): chi2 expects non-negative feature values — confirm this holds for every column.
FeatureSelection=SelectKBest(chi2, k=300)

In [None]:
# Score and select features using the training split only, then keep the same
# selected columns in the test split (the selector is not re-fitted on test data).
TrainingChiData=FeatureSelection.fit_transform(X_train,y_train)
TestingChiData=FeatureSelection.transform(X_test)

#Balancing Unbalanced dataset

In [None]:
# Balance the classes of the training split with SMOTE (library defaults).
# Only the training data is resampled; the test split is left untouched.
# The sampler is seeded (same seed as the clustering and the split) so the synthetic
# samples, and therefore the baseline score computed from them, are repeatable.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(TrainingChiData, y_train)

In [None]:
# Quick baseline: train a Bernoulli naive Bayes model on the balanced training data
# and report its F1 score on the held-out test split.
clf = BernoulliNB()
clf.fit(x_smote, y_smote)
y_pred = clf.predict(TestingChiData)
metrics.f1_score(y_test, y_pred)

0.7162729225177598

#Models training and testing

In [None]:
def EvaluteDataset(data, machine_learning_models):
  """Cross-validate a single classifier on `data` and print its scores.

  The rows are split into 5 consecutive, unshuffled folds. For each fold the
  estimator is re-fitted on the training part and scored on the held-out part.
  The mean and standard deviation of precision, recall and F1 over the folds
  are printed, followed by the classification report and confusion matrix of
  the final fold.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain a 'target' column with the labels; every other column is
      used as a feature. The metric functions are called with their default
      settings, so the labels are expected to be binary.
  machine_learning_models : estimator
      One object exposing `fit(X, y)` and `predict(X)`. It is re-fitted in
      place on every fold.

  Returns
  -------
  None
      All results are printed.
  """
  # Select the features by name rather than by position, so the target column
  # can never end up among the features if it is not the last column.
  feature = data.drop(columns='target')
  target = data['target']
  fold = model_selection.KFold(n_splits=5)
  f1score, precision, recall = [], [], []  # one entry per fold
  for training, testing in fold.split(feature):
    X_train, X_test = feature.iloc[training], feature.iloc[testing]
    y_train, y_test = target.iloc[training], target.iloc[testing]
    machine_learning_models.fit(X_train, y_train)
    y_pred = machine_learning_models.predict(X_test)
    f1score.append(metrics.f1_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
  print("***************{}".format(machine_learning_models.__class__.__name__))
  # Each metric is printed as "<mean> <std>". The precision line previously had a
  # stray '-' before the std, which made the deviation read as a negative number.
  print("Precision={} {}\n    Recall={} {}\n    F1-score={} {}".format(
      np.mean(precision), np.std(precision),
      np.mean(recall), np.std(recall),
      np.mean(f1score), np.std(f1score)))
  # y_test / y_pred still hold the values of the last fold here, so the report and
  # confusion matrix below describe that final fold only, not all five.
  print(metrics.classification_report(y_test, y_pred))
  print(metrics.confusion_matrix(y_test, y_pred))
  print("-"*100)

#Models used for classification

In [None]:
# Classifiers compared in the evaluation below.
# SVC is imported by name here: the previous `svm=svm.SVC()` needed `svm` to still be the
# sklearn module, so running this cell a second time failed once `svm` had been rebound
# to the classifier instance.
from sklearn.svm import SVC

# Every estimator that accepts a seed gets one, so repeated runs give the same scores.
decision_tree = DecisionTreeClassifier(random_state=0)  # decision tree
naive = naive_bayes.GaussianNB()  # Gaussian naive Bayes
logistic = LogisticRegression(random_state=0)  # logistic regression
# Linear model trained with SGD (hinge loss, L2 penalty); max_iter=5 caps it at five passes.
st = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, random_state=0)
rand = RandomForestClassifier(max_depth=2, random_state=0)  # shallow random forest
# The name `svm` is kept because the evaluation cell refers to it; note that it
# shadows the imported `sklearn.svm` module from here on.
svm = SVC()  # support vector machine with default settings

In [None]:
# Wrap the imputed feature matrix in a DataFrame and append the k-means cluster
# labels as the final 'target' column expected by the evaluation helper.
da = pd.DataFrame(Kdd_data).assign(target=labels)

In [None]:
da.head()  # preview: feature columns are labelled by integer position, followed by 'target'

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,472,473,474,475,476,477,478,479,480,target
0,8901.0,342.0,0.0,19.0,9939.0,0.0,0.0,3712.0,1.0,0.0,...,0.0,0.0,4.0,1.0,4.0,3.0,4.0,39.0,3.0,0
1,9401.0,121.0,1.0,8.0,16857.0,0.0,0.0,5202.0,1.0,0.0,...,0.0,0.0,2.0,3.0,4.0,3.0,4.0,1.0,1.0,1
2,9001.0,49.0,1.0,32.0,335.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,4.0,1.0,4.0,3.0,4.0,60.0,3.0,0
3,8701.0,127.0,0.0,8.0,18628.0,0.0,0.0,2801.0,1.0,0.0,...,1.0,0.0,4.0,1.0,4.0,3.0,4.0,41.0,3.0,1
4,8601.0,0.0,0.0,13.0,2936.0,0.0,0.0,2001.0,1.0,1.0,...,1.0,0.0,2.0,2.0,4.0,3.0,4.0,26.0,1.0,0


#Training and evaluation on the dataset

In [None]:
# Run the same 5-fold evaluation for every candidate classifier, in a fixed order.
# (Original note on this cell read "for discard instances" — meaning unclear, TODO confirm.)
for candidate_model in (decision_tree, naive, logistic, st, rand, svm):
  EvaluteDataset(da, candidate_model)

***************DecisionTreeClassifier
Precision=0.999600751325306 -0.00023059635914090976
    Recall=0.9996196428029478 0.00023892160281010612
    F1-score=0.9996101642007529 0.0001495750339570567
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9632
           1       1.00      1.00      1.00      9450

    accuracy                           1.00     19082
   macro avg       1.00      1.00      1.00     19082
weighted avg       1.00      1.00      1.00     19082

[[9629    3]
 [   4 9446]]
----------------------------------------------------------------------------------------------------
***************GaussianNB
Precision=0.846980185247214 -0.005016352590938112
    Recall=0.6819972340341569 0.00866091249825128
    F1-score=0.755571008464408 0.006894783315802827
              precision    recall  f1-score   support

           0       0.73      0.87      0.80      9632
           1       0.84      0.67      0.75      9450

    ac