In [3]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import sklearn as skl
from sklearn.ensemble import IsolationForest

In [4]:
# Load the dataset from ford.csv (the path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='ford.csv')

# Isolation Forest

## Подготовка данных

In [5]:
# Remove exact duplicate rows; the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

# Normalise the column names first, so that the column lists collected below
# hold the same names that `df` carries from here on.
df.columns = df.columns.str.lower()

# Split the columns by dtype. Both frames are snapshots taken before the text cleanup.
df_non_numeric = df.select_dtypes(exclude=[np.number])
non_numeric_cols = df_non_numeric.columns.values

df_numeric = df.select_dtypes(include=[np.number])
numeric_cols = df_numeric.columns.values

# Clean the text columns: trim the listed characters from both ends, then lowercase.
for col in non_numeric_cols:
    df[col] = df[col].str.strip('123.!? \n\t').str.lower()

df

Unnamed: 0,model,year,price,transmission,mileage,fueltype,tax,mpg,enginesize
0,fiesta,2017,12000,automatic,15944,petrol,150,57.7,1.0
1,focus,2018,14000,manual,9083,petrol,150,57.7,1.0
2,focus,2017,13000,manual,12456,petrol,150,57.7,1.0
3,fiesta,2019,17500,manual,10460,petrol,145,40.3,1.5
4,fiesta,2019,16500,automatic,1482,petrol,145,48.7,1.0
...,...,...,...,...,...,...,...,...,...
17961,b-max,2017,8999,manual,16700,petrol,150,47.1,1.4
17962,b-max,2014,7499,manual,40700,petrol,30,57.7,1.0
17963,focus,2015,9999,manual,7010,diesel,20,67.3,1.6
17964,ka,2018,8299,manual,5007,petrol,145,57.7,1.2


## Визуализация набора данных

In [6]:
# Plot every price against its row position to eyeball obvious outliers.
# The positions are passed as `x` explicitly: the first positional argument
# of px.scatter is `data_frame`, not the x values.
fig = px.scatter(
    x=np.arange(len(df)),
    y=df['price'].to_numpy(),
    labels={'x': 'row position', 'y': 'price'},
)
fig.show()

## Обучение модели

In [7]:
# Initialise Isolation Forest; `contamination` is the expected share of outliers.
# A fixed random_state makes the set of flagged rows reproducible across runs.
isolation_model = IsolationForest(contamination=0.003, random_state=42)

# Fit on the numeric columns and label every row in one step
# (the next cell treats -1 as an anomaly).
IF_predictions = isolation_model.fit_predict(df_numeric)


X does not have valid feature names, but IsolationForest was fitted with feature names



In [8]:
#добавление аномалий в датасет
df['anomalies'] = IF_predictions
anomalies = df.query('anomalies == -1')

import plotly.graph_objects as go

#строю график для выбросов
normal = go.Scatter(x=df.index.astype(str),y=df['price'],name="df",mode='markers')
outlier = go.Scatter(x=anomalies.index.astype(str),y=anomalies['price'],name="Anomalies",mode='markers',
                marker=dict(color='red', size=6,
                            line=dict(color='red', width=1)))

# обозначения графика
layout = go.Layout(title="Isolation Forest",yaxis_title='Price',xaxis_title='x-axis',)

#строю график
data = [normal, outlier]
fig = go.Figure(data=data, layout=layout)
fig.show()

## Уменьшение набора признаков

Удаляю лишние столбцы, оставляю только столбцы "price" и "year"

In [9]:
#копирую датасет
data = df.copy()

data.drop('transmission', axis=1, inplace=True)
data.drop('mileage', axis=1, inplace=True)
data.drop('fueltype', axis=1, inplace=True)
data.drop('tax', axis=1, inplace=True)
data.drop('mpg', axis=1, inplace=True)
data.drop('enginesize', axis=1, inplace=True)
data.drop('anomalies', axis=1, inplace=True)
data.drop('model', axis=1, inplace=True)

data.head()

Unnamed: 0,year,price
0,2017,12000
1,2018,14000
2,2017,13000
3,2019,17500
4,2019,16500


## Обучение модели, используя только цену и год

In [10]:
#инициализация Isolation Forest
isolation_model1= IsolationForest(contamination=0.003)

#тренировка модели
isolation_model1.fit(data)

#создание предсказаний
IF_predictions1 = isolation_model1.predict(data)


X does not have valid feature names, but IsolationForest was fitted with feature names



In [11]:
# добавление аномалий в набор данных
data['anomalies'] = IF_predictions1
anomalies1 = data.query('anomalies == -1')

#строю график для выбросов
normal = go.Scatter(x=data.index.astype(str),y=data['price'],name="Normal data", mode='markers')
outlier = go.Scatter(x=anomalies1.index.astype(str), y=anomalies1['price'], name="Anomalies", mode='markers',
                        marker=dict(color='red', size=5,
                            line=dict(color='red', width=1)))

# обозначения графика
layout = go.Layout(title="Isolation Forest", yaxis_title='price',xaxis_title='x-axis',)

#строю график
Data = [normal, outlier]
fig = go.Figure(data=Data, layout=layout)
fig.show()