In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")
df['Invoice_Num_Int'] = df['Invoice_number'].str.replace("INV", "").astype(int)
df = df.sort_values(by='Invoice_Num_Int').reset_index(drop=True)

# Step 3: Create Is_Duplicate flag (check previous/next after sorting)
df['Is_Duplicate'] = (
    (df['Invoice_number'] == df['Invoice_number'].shift(1)) | 
    (df['Invoice_number'] == df['Invoice_number'].shift(-1))
).astype(int)
# --- ROBUST DATA PREPARATION ---
# Separate features (X) and targets (y)
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# 1. Handle potential missing values in target columns first
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown')
y['Leakage'] = y['Leakage'].fillna('Unknown')

# 2. Ensure target columns are of string type before encoding
y['Anomaly_type'] = y['Anomaly_type'].astype(str)
y['Leakage'] = y['Leakage'].astype(str)
# --- END OF PREPARATION ---

# Feature engineering for date columns
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Identify categorical and numerical features after date processing
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Create preprocessing pipelines
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode the target labels
le_anomaly = LabelEncoder()
le_leakage = LabelEncoder()
y['Anomaly_type'] = le_anomaly.fit_transform(y['Anomaly_type'])
y['Leakage'] = le_leakage.fit_transform(y['Leakage'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- DEFINE THE XGBOOST MODEL ---
# XGBoost is a powerful gradient boosting model
base_model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
model = MultiOutputClassifier(base_model)
# --- END OF MODEL DEFINITION ---

# Create and train the full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

pipeline.fit(X_train, y_train)

# Make and evaluate predictions
y_pred = pipeline.predict(X_test)

# Inverse transform labels for clear reporting
y_pred_df = pd.DataFrame(y_pred, columns=['Anomaly_type', 'Leakage'])
y_pred_df['Anomaly_type'] = le_anomaly.inverse_transform(y_pred_df['Anomaly_type'])
y_test_df = pd.DataFrame(y_test.values, columns=['Anomaly_type', 'Leakage'])
y_test_df['Anomaly_type'] = le_anomaly.inverse_transform(y_test_df['Anomaly_type'])
y_pred_df['Leakage'] = le_leakage.inverse_transform(y_pred_df['Leakage'])
y_test_df['Leakage'] = le_leakage.inverse_transform(y_test_df['Leakage'])


# --- DISPLAY RESULTS ---
print("--- Evaluation for 'Anomaly_type' ---")
print(classification_report(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']))
print(f"Accuracy for 'Anomaly_type': {accuracy_score(y_test_df['Anomaly_type'], y_pred_df['Anomaly_type']):.4f}")

print("\n--- Evaluation for 'Leakage' ---")
print(classification_report(y_test_df['Leakage'], y_pred_df['Leakage']))
print(f"Accuracy for 'Leakage': {accuracy_score(y_test_df['Leakage'], y_pred_df['Leakage']):.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Evaluation for 'Anomaly_type' ---
                   precision    recall  f1-score   support

Duplicate entries       1.00      1.00      1.00       125
   Excess payment       1.00      1.00      1.00       137
 Extra data usage       1.00      0.99      1.00       106
  Missing charges       1.00      0.99      1.00       153
       No anomaly       0.99      1.00      0.99      1198
    Under payment       1.00      1.00      1.00       174
   Usage mismatch       1.00      0.88      0.94       107

         accuracy                           0.99      2000
        macro avg       1.00      0.98      0.99      2000
     weighted avg       0.99      0.99      0.99      2000

Accuracy for 'Anomaly_type': 0.9925

--- Evaluation for 'Leakage' ---
              precision    recall  f1-score   support

          No       0.99      1.00      0.99      1198
         Yes       1.00      0.98      0.99       802

    accuracy                           0.99      2000
   macro avg       0.9

In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_billing_dataset.csv")

# Add integer Invoice number for ordering
df['Invoice_Num_Int'] = df['Invoice_number'].str.replace("INV", "").astype(int)
df = df.sort_values(by='Invoice_Num_Int').reset_index(drop=True)

# Create Is_Duplicate flag
df['Is_Duplicate'] = (
    (df['Invoice_number'] == df['Invoice_number'].shift(1)) | 
    (df['Invoice_number'] == df['Invoice_number'].shift(-1))
).astype(int)

# Features and target
X = df.drop(columns=['Anomaly_type', 'Leakage'])
y = df[['Anomaly_type', 'Leakage']].copy()

# Handle missing values in targets
y['Anomaly_type'] = y['Anomaly_type'].fillna('Unknown').astype(str)
y['Leakage'] = y['Leakage'].fillna('Unknown').astype(str)

# Feature engineering for date columns
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    X[col] = pd.to_datetime(X[col], dayfirst=True)
    X[col + '_year'] = X[col].dt.year
    X[col + '_month'] = X[col].dt.month
    X[col + '_day'] = X[col].dt.day
X = X.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Transformers
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Encode targets
le_anomaly = LabelEncoder()
le_leakage = LabelEncoder()
y['Anomaly_type'] = le_anomaly.fit_transform(y['Anomaly_type'])
y['Leakage'] = le_leakage.fit_transform(y['Leakage'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Model
base_model = XGBClassifier(objective='binary:logistic',
                           eval_metric='logloss',
                           use_label_encoder=False,
                           random_state=42)
model = MultiOutputClassifier(base_model)

# Full pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', model)])

# Train
pipeline.fit(X_train, y_train)

# Save the pipeline and encoders
joblib.dump(pipeline, "telecom_pipeline.pkl")
joblib.dump(le_anomaly, "le_anomaly.pkl")
joblib.dump(le_leakage, "le_leakage.pkl")

print("✅ Model and encoders saved successfully!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Model and encoders saved successfully!


In [10]:
import pandas as pd
import joblib

# Load pipeline and encoders
pipeline = joblib.load("telecom_pipeline.pkl")
le_anomaly = joblib.load("le_anomaly.pkl")
le_leakage = joblib.load("le_leakage.pkl")

# Load new data (WITHOUT target columns)
new_df = pd.read_csv(r"C:\Users\ragul\OneDrive\Desktop\hack\AI_Revenue_Leakage_Detection\model\Telecom\dataset\telecom_input.csv")

# Feature engineering for dates (same as training)
for col in ['Billing_date', 'Plan_start_date', 'Plan_end_date']:
    new_df[col] = pd.to_datetime(new_df[col], dayfirst=True)
    new_df[col + '_year'] = new_df[col].dt.year
    new_df[col + '_month'] = new_df[col].dt.month
    new_df[col + '_day'] = new_df[col].dt.day
new_df = new_df.drop(columns=['Billing_date', 'Plan_start_date', 'Plan_end_date'])

# Make predictions
y_pred = pipeline.predict(new_df)

# Convert predictions back to original labels
y_pred_df = pd.DataFrame(y_pred, columns=['Anomaly_type', 'Leakage'])
y_pred_df['Anomaly_type'] = le_anomaly.inverse_transform(y_pred_df['Anomaly_type'])
y_pred_df['Leakage'] = le_leakage.inverse_transform(y_pred_df['Leakage'])

# Combine with original input data
output_df = pd.concat([new_df, y_pred_df], axis=1)

# Save to CSV
output_df.to_csv("telecom_predictions.csv", index=False)
no_leakage_df = output_df[output_df["Leakage"] == "No"]
anomaly_df = output_df[output_df["Leakage"] == "Yes"]

# Save them as separate CSVs
no_leakage_df.to_csv("telecom_no_leakage_data.csv", index=False)
anomaly_df.to_csv("telecom_anomaly_data.csv", index=False)
print("✅ Predictions saved to 'telecom_predictions.csv'")


✅ Predictions saved to 'telecom_predictions.csv'
