<a href="https://colab.research.google.com/github/Mehdi007bond/Predictive_maintenance_Project/blob/main/Predictive_maintenance_Project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# I started by generating the dataset so that I have data to work with


In [53]:
import pandas as pd
import numpy as np
import datetime

# --- 1. Configuration Générale ---
N_LINES = 4  # number of production lines to simulate
DAYS_PER_MACHINE = 60  # length of each machine's simulated history, in days
SAMPLES_PER_HOUR = 4  # one reading every 15 minutes
N_SAMPLES_PER_DAY = SAMPLES_PER_HOUR * 24
TOTAL_SAMPLES_PER_MACHINE = N_SAMPLES_PER_DAY * DAYS_PER_MACHINE

# --- Simulation parameters (UPDATED) ---
# The "failing state" (target = 1) starts 58 hours before the failure point;
# this window length was chosen to reach the targeted 4.02% share of
# positive samples.
FAILURE_STATE_HOURS = 58
FAILURE_STATE_SAMPLES = SAMPLES_PER_HOUR * FAILURE_STATE_HOURS

# --- 2. Profils des Machines ---
def get_machine_parameters(machine_type):
    """Return the sensor and degradation profile for one machine type.

    The result is a freshly built dict with three entries per sensor
    (TEMP, VIB, CUR, TORQUE): ``BASE_<sensor>``, ``<sensor>_NOISE`` and
    ``<sensor>_DEGRADE_RATE``.

    Raises:
        ValueError: if ``machine_type`` is not one of 'Fraiseuse',
            'Convoyeur' or 'Machine_de_finition'.
    """
    sensor_names = ('TEMP', 'VIB', 'CUR', 'TORQUE')

    # One row per sensor, in sensor_names order: (base, noise, degrade rate).
    known_profiles = (
        ('Fraiseuse', (
            (45.0, 2.0, 0.1),
            (1.2, 0.3, 0.1),
            (10.0, 0.5, 0.08),
            (80.0, 3.0, 0.15),
        )),
        ('Convoyeur', (
            (30.0, 1.0, 0.05),
            (0.4, 0.1, 0.03),
            (3.0, 0.1, 0.15),
            (20.0, 1.0, 0.05),
        )),
        ('Machine_de_finition', (
            (35.0, 0.5, 0.03),
            (0.2, 0.05, 0.08),
            (2.0, 0.1, -0.05),
            (10.0, 0.5, -0.02),
        )),
    )

    for profile_name, sensor_rows in known_profiles:
        if machine_type == profile_name:
            profile = {}
            for sensor, (base, noise, rate) in zip(sensor_names, sensor_rows):
                profile[f'BASE_{sensor}'] = base
                profile[f'{sensor}_NOISE'] = noise
                profile[f'{sensor}_DEGRADE_RATE'] = rate
            return profile

    raise ValueError("Type de machine inconnu")

# --- 3. Fonction de Génération (Simplifiée) ---
def generate_machine_data(machine_id, machine_type, production_line, start_date):
    """Simulate the sensor history of one machine and return it as a DataFrame.

    Each sensor (temperature, vibration, current, torque) is drawn as
    Gaussian noise around the baseline of the machine's profile. A failure
    point is placed at a random position between 3 and 15 days before the
    end of the series; during the ``FAILURE_STATE_SAMPLES`` samples that
    precede it the target ``failure`` is set to 1 and a linearly growing
    drift is added to every sensor. Samples from the failure point onwards
    keep label 0 and undrifted readings.

    Random numbers come from NumPy's global random state, in a fixed order
    (time offset, the four sensors, failure position).

    Args:
        machine_id: Value stored in the ``machine_id`` column.
        machine_type: Profile name passed to ``get_machine_parameters``.
        production_line: Value stored in the ``production_line`` column.
        start_date: ``datetime.datetime`` of the first sample, before the
            per-machine random offset of 0-29 seconds is added.

    Returns:
        pandas.DataFrame with ``TOTAL_SAMPLES_PER_MACHINE`` rows and the
        columns timestamp, production_line, machine_id, machine_type,
        temperature, vibration, current, torque, total_working_hours and
        failure.

    Raises:
        ValueError: if ``machine_type`` is unknown (raised by
            ``get_machine_parameters``), or if the configured series is too
            short for the failing window to fit before the failure point.
    """
    print(f"Génération des données pour Machine ID: {machine_id} (Type: {machine_type})...")

    params = get_machine_parameters(machine_type)
    total_samples = TOTAL_SAMPLES_PER_MACHINE

    # Shift every timestamp of this machine by the same 0-29 s offset.
    time_offset = datetime.timedelta(seconds=np.random.randint(0, 30))
    timestamps = [
        start_date + datetime.timedelta(hours=i / SAMPLES_PER_HOUR) + time_offset
        for i in range(total_samples)
    ]

    # Healthy sensor readings: Gaussian noise around each baseline.
    temp = np.random.normal(loc=params['BASE_TEMP'], scale=params['TEMP_NOISE'], size=total_samples)
    vib = np.random.normal(loc=params['BASE_VIB'], scale=params['VIB_NOISE'], size=total_samples)
    curr = np.random.normal(loc=params['BASE_CUR'], scale=params['CUR_NOISE'], size=total_samples)
    torque = np.random.normal(loc=params['BASE_TORQUE'], scale=params['TORQUE_NOISE'], size=total_samples)

    # "Odometer" feature: hours elapsed since the first sample.
    total_working_hours = (np.arange(total_samples) / SAMPLES_PER_HOUR).round(2)

    # --- ML target: 0 = healthy, 1 = failing ---
    failure = np.zeros(total_samples, dtype=int)

    # Exact failure point: 3 to 15 days (upper bound exclusive) before the end.
    failure_sample = total_samples - np.random.randint(N_SAMPLES_PER_DAY * 3, N_SAMPLES_PER_DAY * 15)

    # The failing state starts FAILURE_STATE_SAMPLES samples earlier.
    failing_state_start_sample = failure_sample - FAILURE_STATE_SAMPLES

    # A negative start would silently wrap around through negative indexing
    # and label/degrade the wrong samples, so fail loudly instead.
    if failing_state_start_sample < 0:
        raise ValueError(
            "La fenêtre de panne commence avant le premier échantillon "
            "(série trop courte pour FAILURE_STATE_HOURS)"
        )

    failing_window = slice(failing_state_start_sample, failure_sample)
    failure[failing_window] = 1

    # Degradation ramps linearly from 0 (window start) towards 1 (failure
    # point, excluded); applied to the whole window at once.
    window_length = failure_sample - failing_state_start_sample
    progress = np.arange(window_length) / window_length
    temp[failing_window] += params['TEMP_DEGRADE_RATE'] * progress * 20
    vib[failing_window] += params['VIB_DEGRADE_RATE'] * progress * 15
    curr[failing_window] += params['CUR_DEGRADE_RATE'] * progress * 10
    torque[failing_window] += params['TORQUE_DEGRADE_RATE'] * progress * 10

    # --- DataFrame assembly ---
    df = pd.DataFrame({
        'timestamp': timestamps,
        'production_line': production_line,
        'machine_id': machine_id,
        'machine_type': machine_type,
        'temperature': temp.round(2),
        'vibration': vib.round(4),
        'current': curr.round(3),
        'torque': torque.round(2),
        'total_working_hours': total_working_hours,
        'failure': failure,  # ML target (0 or 1)
    })

    return df

# --- 4. Boucle Principale de Génération ---
# Generate one DataFrame per machine: every production line gets one machine
# of each type, and machine ids are numbered consecutively across lines.
print("Démarrage de la génération du dataset (Cible = failure)...")
start_date = datetime.datetime(2024, 1, 1)
machine_types = ['Fraiseuse', 'Convoyeur', 'Machine_de_finition']
all_data_frames = []
machine_id_counter = 1

for line_number in range(1, N_LINES + 1):
    line_name = f"Line_{line_number}"
    print(f"\n--- Génération des données pour {line_name} ---")
    for machine_type in machine_types:
        all_data_frames.append(
            generate_machine_data(
                machine_id=machine_id_counter,
                machine_type=machine_type,
                production_line=line_name,
                start_date=start_date,
            )
        )
        machine_id_counter += 1

# --- 5. Finalisation: interleave the machines by time, then save ---
print("\nFinalisation du dataset...")
combined_frames = pd.concat(all_data_frames)

# Sorting by timestamp interleaves the machines' rows chronologically.
print("Tri du dataset par timestamp pour simuler la collecte 'temps réel'...")
full_dataset = combined_frames.sort_values(by='timestamp').reset_index(drop=True)

print("\nDataset de CLASSIFICATION (4%) généré et mélangé !")
print(f"Nombre total d'échantillons: {len(full_dataset)}")

# Save to CSV without the index column.
output_filename = "production_line_STATE_BASED_4_PERCENT_data.csv"
full_dataset.to_csv(output_filename, index=False)
print(f"Dataset sauvegardé sous '{output_filename}'")

# Show the first rows.
print("\n--- Aperçu du Dataset ---")
print(full_dataset.head(10))

# Class balance of the target, as counts and as a percentage of positives.
print("\n--- Distribution de la Cible (failure) ---")
failure_labels = full_dataset['failure']
print(failure_labels.value_counts())
target_rate = failure_labels.mean() * 100
print(f"Pourcentage de 'failure' (1): {target_rate:.2f}%")

Démarrage de la génération du dataset (Cible = failure)...

--- Génération des données pour Line_1 ---
Génération des données pour Machine ID: 1 (Type: Fraiseuse)...
Génération des données pour Machine ID: 2 (Type: Convoyeur)...
Génération des données pour Machine ID: 3 (Type: Machine_de_finition)...

--- Génération des données pour Line_2 ---
Génération des données pour Machine ID: 4 (Type: Fraiseuse)...
Génération des données pour Machine ID: 5 (Type: Convoyeur)...
Génération des données pour Machine ID: 6 (Type: Machine_de_finition)...

--- Génération des données pour Line_3 ---
Génération des données pour Machine ID: 7 (Type: Fraiseuse)...
Génération des données pour Machine ID: 8 (Type: Convoyeur)...
Génération des données pour Machine ID: 9 (Type: Machine_de_finition)...

--- Génération des données pour Line_4 ---
Génération des données pour Machine ID: 10 (Type: Fraiseuse)...
Génération des données pour Machine ID: 11 (Type: Convoyeur)...
Génération des données pour Machine ID: 

# Starting with exploratory data analysis

In [54]:
# Work on the generated dataset under a shorter name (same object, not a copy).
df = full_dataset
display(df.head(10))
display(df.shape)
# Missing values per column. Wrapped in display() because a bare expression
# is only rendered when it is the last statement of a notebook cell.
display(df.isna().sum())
display(df.describe())


Unnamed: 0,timestamp,production_line,machine_id,machine_type,temperature,vibration,current,torque,total_working_hours,failure
0,2024-01-01 00:00:02,Line_2,5,Convoyeur,32.14,0.5506,2.897,20.17,0.0,0
1,2024-01-01 00:00:03,Line_2,6,Machine_de_finition,35.13,0.2289,2.018,9.87,0.0,0
2,2024-01-01 00:00:03,Line_3,8,Convoyeur,28.56,0.3221,2.997,20.0,0.0,0
3,2024-01-01 00:00:05,Line_3,9,Machine_de_finition,34.98,0.2514,2.13,9.87,0.0,0
4,2024-01-01 00:00:12,Line_4,10,Fraiseuse,42.82,1.1311,9.384,76.61,0.0,0
5,2024-01-01 00:00:14,Line_3,7,Fraiseuse,40.64,1.1351,9.849,78.63,0.0,0
6,2024-01-01 00:00:14,Line_1,3,Machine_de_finition,35.65,0.244,2.102,10.39,0.0,0
7,2024-01-01 00:00:16,Line_4,11,Convoyeur,29.0,0.409,2.897,18.19,0.0,0
8,2024-01-01 00:00:18,Line_4,12,Machine_de_finition,34.93,0.1768,2.022,10.11,0.0,0
9,2024-01-01 00:00:20,Line_1,1,Fraiseuse,48.66,1.0394,10.647,77.04,0.0,0


(69120, 10)

Unnamed: 0,timestamp,machine_id,temperature,vibration,current,torque,total_working_hours,failure
count,69120,69120.0,69120.0,69120.0,69120.0,69120.0,69120.0,69120.0
mean,2024-01-30 23:52:42.833332736,6.5,36.689679,0.622058,5.012767,36.692973,719.875,0.040278
min,2024-01-01 00:00:02,1.0,26.06,-0.096,1.305,8.0,0.0,0.0
25%,2024-01-15 23:56:22.750000128,3.75,30.7,0.2312,2.066,10.34,359.9375,0.0
50%,2024-01-30 23:52:43.500000,6.5,35.01,0.4114,3.005,20.01,719.875,0.0
75%,2024-02-14 23:49:04.249999872,9.25,43.68,1.0191,9.672,78.05,1079.8125,0.0
max,2024-02-29 23:45:25,12.0,53.42,3.2433,12.087,91.61,1439.75,1.0
std,,3.452078,6.386512,0.492817,3.578927,30.99968,415.695195,0.196611


In [55]:
# Print the per-column dtypes, non-null counts and memory usage.
df.info()
# Last expression of the cell, so the notebook renders the column index.
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69120 entries, 0 to 69119
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   timestamp            69120 non-null  datetime64[ns]
 1   production_line      69120 non-null  object        
 2   machine_id           69120 non-null  int64         
 3   machine_type         69120 non-null  object        
 4   temperature          69120 non-null  float64       
 5   vibration            69120 non-null  float64       
 6   current              69120 non-null  float64       
 7   torque               69120 non-null  float64       
 8   total_working_hours  69120 non-null  float64       
 9   failure              69120 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 5.3+ MB


Index(['timestamp', 'production_line', 'machine_id', 'machine_type',
       'temperature', 'vibration', 'current', 'torque', 'total_working_hours',
       'failure'],
      dtype='object')

In [56]:
import seaborn as sns
import matplotlib.pyplot as plt

# Class balance of the target column.
print(df['failure'].value_counts())

# Groundwork for plotting how each production line evolves: start with the
# failing-state rows of the finishing machine on Line_1.
on_line_1 = df['production_line'] == 'Line_1'
is_finishing_machine = df['machine_type'] == 'Machine_de_finition'
is_failing = df['failure'] == 1
df_line1 = df[on_line_1 & is_finishing_machine & is_failing]
display(df_line1)
df.head()

failure
0    66336
1     2784
Name: count, dtype: int64


Unnamed: 0,timestamp,production_line,machine_id,machine_type,temperature,vibration,current,torque,total_working_hours,failure
55493,2024-02-18 04:00:14,Line_1,3,Machine_de_finition,34.99,0.1520,1.854,9.69,1156.00,1
55505,2024-02-18 04:15:14,Line_1,3,Machine_de_finition,35.02,0.2712,2.084,9.76,1156.25,1
55517,2024-02-18 04:30:14,Line_1,3,Machine_de_finition,35.27,0.2321,1.948,9.96,1156.50,1
55530,2024-02-18 04:45:14,Line_1,3,Machine_de_finition,34.74,0.2025,1.849,9.26,1156.75,1
55541,2024-02-18 05:00:14,Line_1,3,Machine_de_finition,34.88,0.2996,1.966,10.19,1157.00,1
...,...,...,...,...,...,...,...,...,...,...
58218,2024-02-20 12:45:14,Line_1,3,Machine_de_finition,35.35,1.2856,1.388,10.30,1212.75,1
58229,2024-02-20 13:00:14,Line_1,3,Machine_de_finition,35.00,1.4003,1.640,9.10,1213.00,1
58241,2024-02-20 13:15:14,Line_1,3,Machine_de_finition,35.97,1.2973,1.543,9.96,1213.25,1
58254,2024-02-20 13:30:14,Line_1,3,Machine_de_finition,35.55,1.3696,1.508,9.40,1213.50,1


Unnamed: 0,timestamp,production_line,machine_id,machine_type,temperature,vibration,current,torque,total_working_hours,failure
0,2024-01-01 00:00:02,Line_2,5,Convoyeur,32.14,0.5506,2.897,20.17,0.0,0
1,2024-01-01 00:00:03,Line_2,6,Machine_de_finition,35.13,0.2289,2.018,9.87,0.0,0
2,2024-01-01 00:00:03,Line_3,8,Convoyeur,28.56,0.3221,2.997,20.0,0.0,0
3,2024-01-01 00:00:05,Line_3,9,Machine_de_finition,34.98,0.2514,2.13,9.87,0.0,0
4,2024-01-01 00:00:12,Line_4,10,Fraiseuse,42.82,1.1311,9.384,76.61,0.0,0
