In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

# Load and preprocess data
file_path = '/content/contractHistoryComplete-contratsOctroyesComplet.csv'
data = pd.read_csv(file_path, low_memory=False)
features = ['tradeAgreements-accordsCommerciaux-eng', 'contractAwardDate-dateAttributionContrat',
            'contractingEntityName-nomEntitContractante-fra', 'contractStartDate-contratDateDebut', 'contractEndDate-dateFinContrat',
            'procurementCategory-categorieApprovisionnement', 'tenderDescription-descriptionAppelOffres-fra','procurementMethod-methodeApprovisionnement-eng']
target = 'totalContractValue-valeurTotaleContrat'

# Convert date columns to integer timestamps
date_columns = ['contractStartDate-contratDateDebut', 'contractEndDate-dateFinContrat', 'contractAwardDate-dateAttributionContrat']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], format='%d/%m/%Y', dayfirst=True, errors='coerce')
    data[col] = data[col].view(np.int64) // 10**9  # Convert to integer timestamps

# Separate numerical and categorical features
numerical_features = date_columns
categorical_features = [col for col in features if col not in numerical_features]

# Define preprocessing pipelines
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with median
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing values with most frequent
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Discretize the target variable
discretizer = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
y = discretizer.fit_transform(data[[target]])

# Apply preprocessing to features
X_processed = preprocessor.fit_transform(data[features])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_processed, y.ravel(), test_size=0.2, random_state=42)

# Ensure y_train is of integer type
y_train = y_train.astype(int)

# Check class distribution in y_train
unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))
print(f'Class distribution in y_train: {class_distribution}')

# Apply RandomOverSampler to balance the classes
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

# Define and train the model
model = Pipeline(steps=[
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the model
model.fit(X_res, y_res)

# Predict and evaluate
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=0)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Classification Report:\n{report}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Generate 25 predictions with confidence scores
predictions = model.predict(X_test[:25])
pred_probs = model.predict_proba(X_test[:25])

# Display predictions and confidence scores
for i in range(25):
    print(f'Prediction: {predictions[i]}, Confidence: {pred_probs[i].max():.2f}')



Class distribution in y_train: {0: 3057, 1: 1058, 2: 1021, 3: 1026, 4: 1017, 5: 1020, 6: 1005, 7: 1049}
Classification Report:
              precision    recall  f1-score   support

         0.0       0.80      0.82      0.81       753
         1.0       0.58      0.69      0.63       248
         2.0       0.74      0.59      0.66       271
         3.0       0.55      0.63      0.59       256
         4.0       0.61      0.55      0.58       265
         5.0       0.62      0.54      0.58       260
         6.0       0.62      0.45      0.52       273
         7.0       0.54      0.74      0.63       238

    accuracy                           0.66      2564
   macro avg       0.63      0.63      0.62      2564
weighted avg       0.67      0.66      0.66      2564

Confusion Matrix:
[[614  44   6  26  11  20  11  21]
 [  9 172  25  19  10   4   3   6]
 [ 18  36 161  21  18   3   6   8]
 [ 21  19  15 162  15   6   6  12]
 [ 27  11   8  30 147  13  13  16]
 [ 24   8   3  14  26 141  16