In [2]:
import pandas as pd

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
train_df.head() , test_df.head()

(    timestamp  value  is_anomaly  predicted
 0  1425008573     42       False  44.072500
 1  1425008873     41       False  50.709390
 2  1425009173     41       False  81.405120
 3  1425009473     61       False  39.950367
 4  1425009773     44       False  35.350160,
     timestamp  value  predicted
 0  1396332000   20.0       20.0
 1  1396332300   20.0       20.0
 2  1396332600   20.0       20.0
 3  1396332900   20.0       20.0
 4  1396333200   20.0       20.0)

In [6]:
stats = train_df['value'].describe()
stats

count    15830.000000
mean        85.572205
std        321.760918
min          0.000000
25%         29.000000
50%         47.000000
75%         76.000000
max      13479.000000
Name: value, dtype: float64

In [10]:
anomaly_proportion = train_df['is_anomaly'].mean()
print(f'{anomaly_proportion:.2%}')

4.90%


In [12]:
correlation = train_df[['value', 'predicted']].corr().iloc[0, 1]
print(f'{correlation:.2f}')

0.45


In [13]:
from sklearn.model_selection import train_test_split

# Features and target
X_train = train_df[['value', 'predicted']]
y_train = train_df['is_anomaly']

# Splitting the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [14]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report

# Initializing the model
model = IsolationForest(contamination=anomaly_proportion, random_state=42)

# Training the model
model.fit(X_train)

# Predicting on validation set
y_val_pred = model.predict(X_val)
# Converting predictions to binary (1 for anomaly, 0 for normal)
y_val_pred = [1 if x == -1 else 0 for x in y_val_pred]

# Evaluating the model
print(classification_report(y_val, y_val_pred))




              precision    recall  f1-score   support

       False       0.96      0.97      0.97      2997
        True       0.38      0.30      0.33       169

    accuracy                           0.94      3166
   macro avg       0.67      0.63      0.65      3166
weighted avg       0.93      0.94      0.93      3166



In [16]:
# Predicting on the test set
X_test = test_df[['value', 'predicted']]
test_predictions = model.predict(X_test)
# Converting predictions to binary (1 for anomaly, 0 for normal)
test_predictions = [1 if x == -1 else 0 for x in test_predictions]

# Creating submission DataFrame
submission_df = test_df[['timestamp']].copy()
submission_df['is_anomaly'] = test_predictions

# Saving the submission file
submission_df.to_csv('Submission.csv', index=False)
