In [14]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Sample data from your provided DataFrame
df = pd.read_csv('../pandasLab/dataset/Electric.csv')
print(df)

# Convert Datetime column to datetime type
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Filter data for the specific date (2022-11-13)
date_filter = df['Datetime'].dt.date == pd.to_datetime('2022-11-13').date()
filtered_df = df[date_filter]

# Train an Isolation Forest model to detect anomalies in n_val
X = filtered_df[['n_val']].values 
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(X) 

# Predict anomalies (outliers) using the model
anomalies = model.predict(X)


# Add the anomaly predictions to the filtered DataFrame
filtered_df['anomaly'] = anomalies


# Remove the rows with anomalies (n_id 9:00:00 and 10:00:00)
filtered_df_cleaned = filtered_df[filtered_df['anomaly'] == 1]

# Remove rows with n_val equal to 500 
filtered_df_cleaned = filtered_df_cleaned[~filtered_df_cleaned['n_val'].isin([500])]

# Display the cleaned DataFrame
print(filtered_df_cleaned)


   id    Datetime      n_id  n_val      machine_id
0   1  2022-11-13  10:06:00   2000  Air_Pump_Ozone
1   2  2022-11-13  10:00:00  10000  Air_Pump_Ozone
2   3  2022-11-13   9:15:00   1200  Air_Pump_Ozone
3   4  2022-11-13   9:00:00    500  Air_Pump_Ozone
4   5  2022-11-13   8:30:00   1100  Air_Pump_Ozone
5   6  2022-11-13   8:00:00   1000  Air_Pump_Ozone
   id   Datetime      n_id  n_val      machine_id  anomaly
0   1 2022-11-13  10:06:00   2000  Air_Pump_Ozone        1
2   3 2022-11-13   9:15:00   1200  Air_Pump_Ozone        1
4   5 2022-11-13   8:30:00   1100  Air_Pump_Ozone        1
5   6 2022-11-13   8:00:00   1000  Air_Pump_Ozone        1


In [13]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load and preprocess data
data = pd.read_csv('../pandasLab/dataset/Electric.csv')
X = data[['n_val']].values

# Train Isolation Forest
model = IsolationForest(contamination=0.05, random_state=42) # pen karn check kar phit pok ka ti nai khr moun
model.fit(X)


# Predict anomalies
anomaly_scores = model.decision_function(X) # random khr moun pid pok ka ti
print(anomaly_scores)

threshold = sorted(anomaly_scores)[int(0.05 * len(anomaly_scores))]  # Adjust the contamination rate

# Identify and filter anomalies
data['anomaly'] = anomaly_scores > threshold
filtered_data = data[~data['anomaly']]

print(filtered_data)


[ 0.15418443 -0.05139481  0.31155225  0.18134352  0.32737372  0.31778411]
   id    Datetime      n_id  n_val      machine_id  anomaly
1   2  2022-11-13  10:00:00  10000  Air_Pump_Ozone    False


In [18]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load and preprocess data
data = pd.read_csv('../pandasLab/dataset/Electric.csv')
X = data[['n_val']].values

# Train Isolation Forest
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(X)

# Predict anomalies
anomaly_scores = model.decision_function(X)

# Sort anomaly scores in ascending order
sorted_anomaly_scores = sorted(anomaly_scores)

# Calculate threshold based on contamination rate (e.g., 5%)
threshold_index = int(0.05 * len(sorted_anomaly_scores))
threshold = sorted_anomaly_scores[threshold_index]

# Identify and filter anomalies
data['anomaly'] = anomaly_scores > threshold
filtered_data = data[~data['anomaly']]

print("Original Data:")
print(data)

print("\nFiltered Data (Anomalies Removed):")
print(filtered_data)

# Display rows with anomalies
anomalies = data[data['anomaly']]
print("\nAnomalies Detected:")
print(anomalies)


Original Data:
   id    Datetime      n_id  n_val      machine_id  anomaly
0   1  2022-11-13  10:06:00   2000  Air_Pump_Ozone     True
1   2  2022-11-13  10:00:00  10000  Air_Pump_Ozone    False
2   3  2022-11-13   9:15:00   1200  Air_Pump_Ozone     True
3   4  2022-11-13   9:00:00    500  Air_Pump_Ozone     True
4   5  2022-11-13   8:30:00   1100  Air_Pump_Ozone     True
5   6  2022-11-13   8:00:00   1000  Air_Pump_Ozone     True

Filtered Data (Anomalies Removed):
   id    Datetime      n_id  n_val      machine_id  anomaly
1   2  2022-11-13  10:00:00  10000  Air_Pump_Ozone    False

Anomalies Detected:
   id    Datetime      n_id  n_val      machine_id  anomaly
0   1  2022-11-13  10:06:00   2000  Air_Pump_Ozone     True
2   3  2022-11-13   9:15:00   1200  Air_Pump_Ozone     True
3   4  2022-11-13   9:00:00    500  Air_Pump_Ozone     True
4   5  2022-11-13   8:30:00   1100  Air_Pump_Ozone     True
5   6  2022-11-13   8:00:00   1000  Air_Pump_Ozone     True


In [19]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load and preprocess data
data = pd.read_csv('../pandasLab/dataset/Electric.csv')

# Train Isolation Forest
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(data[['n_val']].values)

# Predict anomalies
anomaly_scores = model.decision_function(data[['n_val']].values)

# Sort anomaly scores in ascending order
sorted_anomaly_scores = sorted(anomaly_scores) 

# Calculate threshold based on contamination rate (e.g., 5%)
threshold_index = int(0.05 * len(sorted_anomaly_scores)) 
threshold = sorted_anomaly_scores[threshold_index]

# Check if specific values are anomalies
anomaly_threshold = threshold  # You can adjust this threshold if needed

specific_values = [500, 10000]  # List of specific values to check
print(anomaly_threshold)
for specific_value in specific_values:
    if specific_value > anomaly_threshold:
        print(f"Value {specific_value} is predicted as an anomaly.")
    else:
        print(f"Value {specific_value} is not predicted as an anomaly.")

# Filter data to exclude rows with specified values
filtered_data = data[~data['n_val'].isin(specific_values)]

print("\nFiltered Data (Anomalies Removed):")
print(filtered_data)


-0.05139480964733467
Value 500 is predicted as an anomaly.
Value 10000 is predicted as an anomaly.

Filtered Data (Anomalies Removed):
   id    Datetime      n_id  n_val      machine_id
0   1  2022-11-13  10:06:00   2000  Air_Pump_Ozone
2   3  2022-11-13   9:15:00   1200  Air_Pump_Ozone
4   5  2022-11-13   8:30:00   1100  Air_Pump_Ozone
5   6  2022-11-13   8:00:00   1000  Air_Pump_Ozone
