In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier

# Import train & test sets

In [None]:
# Read the semicolon-separated training and test CSV files into DataFrames.
df_train = pd.read_csv(
    "../input/anomaly-detection-in-4g-cellular-networks/ML-MATT-CompetitionQT2021_train.csv",
    sep=";",
)
df_test = pd.read_csv(
    "../input/anomaly-detection-in-4g-cellular-networks/ML-MATT-CompetitionQT2021_test.csv",
    sep=";",
)

# Data Visualization

In [None]:
# Display the raw training DataFrame as the cell output.
df_train

In [None]:
# Print the column summary of the training data via DataFrame.info().
df_train.info()

In [None]:
# Show the summary statistics of the training data via DataFrame.describe().
df_train.describe()

In [None]:
# Draw a seaborn pairplot over the training data.
sns.pairplot(data=df_train)

In [None]:
# Scatter PRBUsageUL against PRBUsageDL, coloring points by the "Unusual" column.
sns.scatterplot(
    data=df_train,
    x="PRBUsageUL",
    y="PRBUsageDL",
    hue="Unusual",
)

In [None]:
# Scatter meanThr_UL against meanThr_DL, coloring points by the "Unusual" column.
sns.scatterplot(
    data=df_train,
    x="meanThr_UL",
    y="meanThr_DL",
    hue="Unusual",
)

In [None]:
# Scatter maxThr_UL against maxThr_DL, coloring points by the "Unusual" column.
sns.scatterplot(
    data=df_train,
    x="maxThr_UL",
    y="maxThr_DL",
    hue="Unusual",
)

In [None]:
# Number of rows observed for each maxUE_DL value.
# A bar plot of maxUE_DL against itself only draws bars whose height equals
# their own x label, so a count plot is used to show the distribution instead.
sns.countplot(x="maxUE_DL",data=df_train)

In [None]:
# Number of rows observed for each maxUE_DL value, split by the "Unusual" column.
# A bar plot of maxUE_DL against itself only draws bars whose height equals
# their own x label, so a count plot is used to compare the two groups instead.
sns.countplot(x="maxUE_DL",hue="Unusual",data=df_train)

# Preprocessing Data

In [None]:
# Count the missing values in each column of the training data.
df_train.isna().sum()

In [None]:
# Print the name of every column whose dtype pandas treats as numeric.
for label, content in df_train.items():
    if not pd.api.types.is_numeric_dtype(content):
        continue
    print(label)

In [None]:
# Print the name of every column whose dtype pandas does not treat as numeric.
for label, content in df_train.items():
    if pd.api.types.is_numeric_dtype(content):
        continue
    print(label)

In [None]:
# Working frame for modelling: the training data without the "Time" and
# "CellName" columns. drop() returns a new frame, so df_train is left untouched.
df_train_en = df_train.drop(columns=["Time", "CellName"])

# Building Machine Learning model

In [None]:
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers keyed by the display name used in the score table and
# bar chart. Every estimator is created with its default settings.
models = {
    "Logistic Regression": LogisticRegression(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGB": XGBClassifier()
}
# Create a function to fit and score model
def fit_and_score(models,X_train,X_test,y_train,y_test,seed=42):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. Each estimator is fitted in place.
    X_train, y_train
        Training features and labels, passed to each model's ``fit``.
    X_test, y_test
        Evaluation features and labels, passed to each model's ``score``.
    seed : optional
        Value handed to ``np.random.seed`` before any model is fitted.
        Defaults to 42, the value that used to be hard-coded.

    Returns
    -------
    dict
        ``{name: score}`` with one entry per model, in the iteration order
        of ``models``.

    Notes
    -----
    Calling this function reseeds NumPy's global random number generator.
    """
    # Seed once up front so repeated calls start from the same RNG state.
    np.random.seed(seed)
    # Make a dictionary to keep model score
    model_scores = {}
    # Loop through models
    for name,model in models.items():
        model.fit(X_train,y_train)
        # Evaluate the model and store its score under the model's name
        model_scores[name] = model.score(X_test,y_test)
    return model_scores

In [None]:
# Separate the "Unusual" column (target) from every other column (features).
X = df_train_en.drop(columns=["Unusual"])
y = df_train_en["Unusual"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split, and every score computed from it, is the same on each run.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Fit each candidate model on the training split and collect its score on the
# held-out split, keyed by model name.
scores = fit_and_score(models=models,X_train=X_train,X_test=X_test,y_train=y_train,y_test=y_test)

In [None]:
# One-row frame of scores labelled "Accuracy"; transposed so that each model
# becomes its own bar in the chart.
model_compare = pd.DataFrame(scores, index=["Accuracy"])
model_compare.transpose().plot(kind="bar")

In [None]:
# Display the raw name -> score dictionary.
scores

scores

In [None]:
from sklearn.metrics import classification_report

# Train a fresh default XGBClassifier on the training split, predict the
# held-out split, and print the classification report for those predictions.
ideal_model = XGBClassifier()
ideal_model.fit(X_train, y_train)
preds = ideal_model.predict(X_test)
report = classification_report(y_test, preds)
print(report)

In [None]:
# Re-read the test set from the same location used when the data was first
# loaded, so the notebook does not depend on a second, local "data/" copy.
df_test = pd.read_csv("../input/anomaly-detection-in-4g-cellular-networks/ML-MATT-CompetitionQT2021_test.csv",sep=';')

In [None]:
# Preview the first rows of the test data.
df_test.head()

In [None]:
# Test data without the "Time" and "CellName" columns, matching the columns
# removed from the training frame. drop() returns a new frame, so df_test is
# left untouched.
df_test_en = df_test.drop(columns=["Time", "CellName"])

In [None]:
# Predict a label for every row of the prepared test frame with the fitted model.
predictions = ideal_model.predict(df_test_en)

In [None]:
# Preview the first rows of the prepared test frame.
df_test_en.head()

In [None]:
# Number the test rows from 1; this index becomes the submission "Id" column.
# Assigning an explicit range, rather than adding 1 to the current index,
# keeps the cell safe to re-run without shifting the Ids a second time.
df_test_en.index = pd.RangeIndex(start=1, stop=len(df_test_en) + 1)

In [None]:
# Pair each test-row index ("Id") with its predicted label ("Label"), write the
# table to submission_1.csv without the frame's own index, then display it.
submission_1 = pd.DataFrame(
    {
        "Id": df_test_en.index,
        "Label": predictions,
    }
)
submission_1.to_csv("submission_1.csv", index=False)
submission_1