<a href="https://colab.research.google.com/github/MeenaRuwandi/INCS_870_Project_IDSforMinorAttacks/blob/meena/incs870_team4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install imbalanced-learn
!pip install scikit-learn



In [8]:
!pip install gdown
import gdown



In [9]:
# Required Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Google Drive file IDs of the dataset CSV parts to download.
file_ids = [
    '1zOeCqtGZjAj_nSLe3W5MH5pDQM28bPde',
    '1fagr2rKW8EN-Psc5UlD9BFfa0rRRF6i3',
    '1WL00LFkCA2ylV8_8c6jm52bnEhGLcKsq',
    '1xr_QBU3Ab42nW1ELl0NUydx4Rbx60TVR',
    # Currently disabled parts; uncomment to include them in the download:
    # '1MpqHsC5wQSu9CxlhJimz25EVcYubSMhu',
    # '18mKfJHXinmfwPpaYNU2D6xHwq4dQuEuL',
    # '1h1lMHytamf4Kc66NfAfMfBzDbmwM5Ub5',
    # '11__JhC64_D4ezxjF8d0dPNRDJF-TY6Pq',
]

# Download each file into the working directory as dataset_1.csv, dataset_2.csv, etc.
for i, file_id in enumerate(file_ids):
    url = f'https://drive.google.com/uc?id={file_id}'
    output_path = f'dataset_{i + 1}.csv'
    downloaded = gdown.download(url, output_path, quiet=False)
    # NOTE(review): depending on the gdown version, a failed download may
    # return None instead of raising. Fail here so the problem is reported at
    # its source rather than as a missing-file error when the CSVs are read.
    if downloaded is None:
        raise RuntimeError(
            f'Download failed for Google Drive file {file_id} -> {output_path}'
        )

In [None]:
# Paths of the CSV parts written by the download step. They are relative to
# the working directory, which is where the download loop saves the files
# (presumably /content on Colab, so the same files are read there).
dataset_paths = [
    'dataset_1.csv', 'dataset_2.csv', 'dataset_3.csv', 'dataset_4.csv',
    # Currently disabled parts:
    # 'dataset_5.csv', 'dataset_6.csv', 'dataset_7.csv', 'dataset_8.csv',
]

# Read every CSV part into its own data frame.
dataframes = [pd.read_csv(path) for path in dataset_paths]

# Combine all parts into one data frame with a fresh 0..n-1 index.
combined_dataset = pd.concat(dataframes, ignore_index=True)

In [None]:
# Display summary statistics of the combined dataset.
combined_dataset.describe()

In [None]:
# Display the (rows, columns) dimensions of the combined dataset.
combined_dataset.shape

In [None]:
# Print the column names, non-null counts and dtypes of the combined dataset.
combined_dataset.info()

In [None]:
# Pre-processing: split the combined frame into features (x) and target (y).
# The target header is ' Label' (leading space) until the column names are
# stripped in a later cell, so look the column up by its stripped name. This
# keeps the cell re-runnable after the headers have been cleaned.
label_col = next(col for col in combined_dataset.columns if col.strip() == 'Label')
x = combined_dataset.drop(columns=[label_col])
y = combined_dataset[label_col]

# Handle missing values: impute every numeric feature with its column mean.
# numeric_only=True keeps this working if a non-numeric column is present.
x = x.fillna(x.mean(numeric_only=True))

# Number of rows per value of the label column (the attack types).
class_distribution = y.value_counts()
print(class_distribution)

In [None]:
# Count the NaN entries per column and report only the affected columns.
missing_values = combined_dataset.isna().sum()
columns_with_missing = missing_values[missing_values > 0]
print("missing values in each column :\n", columns_with_missing)

In [None]:
# Normalise the headers: remove leading/trailing whitespace from column names.
combined_dataset.columns = combined_dataset.columns.str.strip()
print(combined_dataset.columns.tolist())

# Mean of 'Flow Bytes/s', computed over finite values only. A plain .mean()
# turns into inf/NaN as soon as the column contains +/-inf, which would make
# the imputation below write a non-finite value into every gap.
flow_bytes = combined_dataset['Flow Bytes/s']
meanVal = flow_bytes.replace([float('inf'), float('-inf')], float('nan')).mean()

# Fill the missing 'Flow Bytes/s' values with that mean. Assign the result back
# to the column: calling fillna(..., inplace=True) on the selected column is
# chained assignment and is not guaranteed to update the frame.
combined_dataset['Flow Bytes/s'] = flow_bytes.fillna(meanVal)

# Verify that all the missing values are handled.
missingValCheck = combined_dataset.isnull().sum()
print("Missing values after handling:\n", missingValCheck[missingValCheck > 0])




In [None]:
# Show the column names of the combined dataset.
print(combined_dataset.columns.tolist())

# Show the dtype of the 'Flow Bytes/s' column.
print(combined_dataset['Flow Bytes/s'].dtype)



In [None]:
# Split the dataset: Y is the 'Label' target column, X is every other column.
Y = combined_dataset['Label']
X = combined_dataset.drop(columns=['Label'])

# Report the dimensions of both parts.
for caption, part in (("features shape", X), ("Target variable shape", Y)):
    print(caption, part.shape)

In [None]:
import numpy as np

# Count the +/-infinity entries per feature and list only the affected columns.
infVal = X.isin([np.inf, -np.inf]).sum()
columns_with_inf = infVal[infVal > 0]
print("Infinity values in each column:\n", columns_with_inf)

# Count the entries above 1e6 per feature ("large" is a judgement call; adjust
# the threshold as needed) and list only the affected columns.
large_values = X.gt(1e6).sum()
columns_with_large = large_values[large_values > 0]
print("Large values in each column: \n", columns_with_large)

In [None]:
# Handle infinity values: turn +/-inf into NaN so they are imputed below.
X = X.replace([np.inf, -np.inf], np.nan)

# Handle large values: cap everything above the threshold (example threshold 1e6).
cap = 1e6
X = X.apply(lambda column: np.where(column > cap, cap, column))

# Fill the missing values (including the former infinities) with the column means.
X = X.fillna(X.mean())

# Check for any remaining missing values.
print(X.isnull().sum())



In [None]:
# Rank the candidate features (78 according to the original note) with a
# Random Forest so the best ones can be selected for the training model.
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Fit a 100-tree forest on the features X and target Y, with a fixed seed.
rfModel = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, Y)

# Importance score of every feature, as computed by the fitted forest.
importantFeatures = rfModel.feature_importances_

In [None]:
# Pair every feature name with its importance score, most important first.
importantFeatures_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importantFeatures}
).sort_values(by='Importance', ascending=False)

# Display the 20 highest-ranked features (change 20 to show more or fewer).
top_rows = importantFeatures_df.head(20)
print(top_rows)

# Horizontal bar chart of the same 20 features.
fig, ax = plt.subplots(figsize=(10, 6))  # adjust the figure size as needed
ax.barh(top_rows['Feature'], top_rows['Importance'], color='skyblue')
ax.set_xlabel('Importance Score')
ax.set_title('Top 20 Important Features')
ax.invert_yaxis()  # put the most important feature on top
plt.show()


In [None]:
# Print the column names, non-null counts and dtypes of the importance table.
importantFeatures_df.info()

In [None]:
# Names of the 20 features with the highest importance scores.
top_features = importantFeatures_df.nlargest(20, 'Importance')['Feature']

# Build the modelling frame from the cleaned feature matrix X (infinities
# replaced, large values capped, gaps imputed), i.e. the same data the
# importances were computed on. combined_dataset still holds the raw values,
# so taking the columns from it would bring uncleaned data back in.
selected_features_df = X[top_features.tolist()].copy()

# Keep the target next to the features as the last column; Y shares X's index.
selected_features_df['Label'] = Y

# Display the new DataFrame to verify.
print(selected_features_df.head())


In [None]:
# Print the column names, non-null counts and dtypes of the selected-feature frame.
selected_features_df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create the label encoder and fit it to the label data.
labelEncoder = LabelEncoder().fit(Y)

# Encode the labels with the fitted encoder.
y_encode = labelEncoder.transform(Y)

# Check the data type of the encoded labels.
y_encode.dtype

In [None]:
from sklearn.model_selection import train_test_split

# selected_features_df also carries the 'Label' target column. Remove it from
# the model inputs: leaving it in leaks the answer into the features and puts
# a non-numeric column in front of the numeric-only steps that follow.
# errors='ignore' keeps this working if the column is already absent.
feature_frame = selected_features_df.drop(columns=['Label'], errors='ignore')

# Stratified 80/20 split of the selected features and the encoded labels.
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame,
    y_encode,
    test_size=0.2,
    random_state=42,
    stratify=y_encode,
)

# Check the shapes of the resulting datasets.
print("Traning features set shape:",x_train.shape)
print("Test features set shape:",x_test.shape)
print("Traning Lable set shape:",y_train.shape)
print("Test Lable set shape:",y_test.shape)

In [None]:
# Show the container type of the training labels.
print(type(y_train))
# List the distinct label values present in the training set.
unique_values = pd.Series(y_train).unique()
print(unique_values)


In [None]:
# Class distribution of the training labels, as a {class: count} dictionary.
unique, counts = np.unique(y_train, return_counts=True)
class_distribution_training = {label: count for label, count in zip(unique, counts)}
print(class_distribution_training)

# Bar chart of the class distribution.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(
    list(class_distribution_training),
    list(class_distribution_training.values()),
    color="blue",
)
ax.set_title('Training Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Frequency')
ax.tick_params(axis='x', labelrotation=0)
plt.show()

In [None]:
# Use SMOTE to handle the class imbalance in the training data.
from imblearn.over_sampling import SMOTE

# Create the SMOTE instance with a fixed seed.
smote = SMOTE(random_state=42)

# Fit SMOTE to the training data; only the training split is resampled.
x_resampled, y_resampled = smote.fit_resample(x_train, y_train)

# Compare the class distribution before and after resampling. The labels are
# wrapped in a Series because value_counts() is a pandas method and the
# encoded labels produced by LabelEncoder are NumPy arrays.
print("Original traning labled distribution:")
print(pd.Series(y_train).value_counts())
print("\n Resampled training lables distribution:")
print(pd.Series(y_resampled).value_counts())
