In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV

In [4]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

In [6]:
train = pd.read_csv('/home/gil/Telecos/3B/APA/LAB/Competition/data/ML-MATT-CompetitionQT2021_train.csv', delimiter=';')
test = pd.read_csv('/home/gil/Telecos/3B/APA/LAB/Competition/data/ML-MATT-CompetitionQT2021_test.xls', delimiter=';')

train_ = pd.read_csv('/home/gil/Telecos/3B/APA/LAB/Competition/ML-MATT-CompetitionQT2021_train.csv', delimiter=';')
test_ = pd.read_csv('/home/gil/Telecos/3B/APA/LAB/Competition/data/ML-MATT-CompetitionQT2021_test.csv', delimiter=';')
test_ = test_.drop(columns='ID')

In [8]:
concatenated_test = pd.concat([test, test_], keys=['test', 'test_'])

different_rows = concatenated_test.reset_index().drop_duplicates(keep=False)

different_rows_count = len(different_rows)

print("Number of rows with at least one difference between test and test_:", different_rows_count)

Number of rows with at least one difference between test and test_: 18316


In [31]:
path = '/home/gil/Telecos/3B/APA/LAB/Competition/data/'
train = pd.read_csv(path + 'ML-MATT-CompetitionQT2021_train.csv', delimiter=';')
test = pd.read_csv(path + 'ML-MATT-CompetitionQT2021_test.csv', delimiter=';')

# from google.colab import drive
# drive.mount('/content/drive')
# file_path_train = '/content/drive/My Drive/APA/Competition/data/ML-MATT-CompetitionQT2021_train.csv'
# file_path_test = '/content/drive/My Drive/APA/Competition/data/ML-MATT-CompetitionQT2021_test.csv'
# train = pd.read_csv(file_path_train, delimiter=';')
# test = pd.read_csv(file_path_test, delimiter=';')

# Drop CellName and prepare Time
train = train.drop(columns=['CellName'])
time_parts = train['Time'].str.split(':', expand=True).astype(int)
seconds_since_midnight = (time_parts[0] * 60) + time_parts[1]
train['Time'] = seconds_since_midnight

# Drop CellName and prepare Time
test = test.drop(columns=['CellName'])
time_parts = test['Time'].str.split(':', expand=True).astype(int)
seconds_since_midnight = (time_parts[0] * 60) + time_parts[1]
test['Time'] = seconds_since_midnight
test = test.drop(columns=['ID'])

# Remove all non-numeric characters
test['maxThr_DL'] = test['maxThr_DL'].replace('[^\d]', '', regex=True)
test['maxThr_DL'] = test['maxThr_DL'].str.replace(r'(\d+\.\d+)(.*)', r'\1', regex=True).astype(float)

# Separate majority and minority classes
normal = train[train['Unusual'] == 0]
normal = normal.drop('Unusual', axis=1)
abnormal = train[train['Unusual'] == 1]

# Undersample majority class to match the desired ratio
# from sklearn.utils import resample
# desired_normal_count = int(len(abnormal) / 0.5 * 0.5)  # Calculate the desired number of samples for the majority class
# undersampled_normal = resample(normal, replace=False, n_samples=desired_normal_count, random_state=42)

# # Combine minority class with undersampled majority class
# train = pd.concat([undersampled_normal, abnormal])

# # Shuffle the dataframe
# train = train.sample(frac=1, random_state=42).reset_index(drop=True)

print("Percentage of 1s in balanced data:", (train['Unusual'] == 1).mean() * 100)
print("Percentage of 0s in balanced data:", (train['Unusual'] == 0).mean() * 100)

X = train.drop('Unusual', axis=1)
y = train['Unusual']

print(X.head())
print(y.head())
print(normal.head())
print(test.head())

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Percentage of 1s in balanced data: 27.593214827660955
Percentage of 0s in balanced data: 72.40678517233904
   Time  PRBUsageUL  PRBUsageDL  meanThr_DL  meanThr_UL  maxThr_DL  maxThr_UL  \
0   645     12.3848      1.4019      0.3927      0.0438    16.6522     0.6806   
1   585     22.0438      2.0016      0.5620      0.2697    10.3994     1.1771   
2   465      0.5105      0.4258      0.0152      0.0106     0.2755     0.1685   
3   165      1.9963      1.1513      0.9908      0.0245    64.7465     0.8747   
4   210      0.3030      0.4040      0.0160      0.0130     0.3480     0.1680   

   meanUE_DL  meanUE_UL  maxUE_DL  maxUE_UL  maxUE_UL+DL  
0     1.1293     1.0491         5         3            8  
1     1.4480     1.1630         6         5           11  
2     1.0379     1.0535         1         2            3  
3     1.0766     1.0526         3         2            5  
4     1.0110     1.0110         2         1            3  
0    1
1    1
2    1
3    1
4    0
Name: Unusual, dt

In [5]:
selected_features = ['meanUE_UL', 'PRBUsageUL', 'PRBUsageDL']

X_train = X_train[selected_features]
X_test = X_test[selected_features]
normal = normal[selected_features]
abnormal = abnormal[selected_features]
test = test[selected_features]

In [24]:
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_samples': [0.5, 0.7, 0.9],
    'contamination': [0.05, 0.1, 0.15],
    'max_features': [0.5, 0.7, 0.9],
    'bootstrap': [True, False],
    'random_state': [42] 
}

# Initialize Decision Tree Classifier
isolationf = IsolationForest(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(isolationf, param_grid, cv=5)

# Fit the grid search to the data
grid_search.fit(normal)

# Get the best estimator from the grid search
best_isf = grid_search.best_estimator_

print(best_isf)

TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator IsolationForest(random_state=42) does not.

In [42]:
# isf = best_isf
isf = IsolationForest(n_estimators=100, max_samples=0.7, max_features=0.9, contamination=0.1, bootstrap=True, random_state=46)

isf.fit(normal)

y_pred = isf.predict(X_test)

y_p = [1 if x == -1 else 0 for x in y_pred]

f1 = f1_score(y_test, y_p)
print(f1)

0.15473191795609934
