In [56]:
import joblib
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import pickle

In [3]:
pickle_filename = "pickle-file_30.pickle"

In [27]:
df = pd.read_csv('30.6 homework.csv')

In [28]:
columns_to_drop = [
        'id',
        'url',
        'region',
        'region_url',
        'price',
        'manufacturer',
        'image_url',
        'description',
        'posting_date',
        'lat',
        'long'
    ]

In [29]:
df.shape

(10000, 19)

In [30]:
q25 = df['year'].quantile(0.25)
q75 = df['year'].quantile(0.75)
iqr = q75 - q25
boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
df.loc[df['year'] < boundaries[0], 'year'] = round(boundaries[0])
df.loc[df['year'] > boundaries[1], 'year'] = round(boundaries[1])

In [31]:
df.shape

(10000, 19)

In [32]:
df.year.describe()

count    9964.000000
mean     2011.669410
std         6.516981
min      1994.000000
25%      2008.000000
50%      2013.000000
75%      2017.000000
max      2022.000000
Name: year, dtype: float64

In [33]:
def short_model(x):
    if not pd.isna(x):
        return x.lower().split(' ')[0]
    else:
        return x

In [34]:
df['short_model'] = df['model'].apply(short_model)
df['age_category'] = df['year'].apply(lambda x: 'new' if x > 2013 else ('old' if x < 2006 else 'average'))

In [58]:
df.price_category

0         high
1       medium
2         high
3       medium
4       medium
         ...  
9995       low
9996    medium
9997       low
9998      high
9999      high
Name: price_category, Length: 10000, dtype: object

In [46]:
numerical = df.select_dtypes(include=['int64', 'float64']).columns
categorical = df.select_dtypes(include=['object']).columns

In [47]:
numerical

Index(['id', 'price', 'year', 'odometer', 'lat', 'long'], dtype='object')

In [48]:
categorical

Index(['url', 'region', 'region_url', 'manufacturer', 'model', 'fuel',
       'title_status', 'transmission', 'image_url', 'description', 'state',
       'posting_date', 'price_category', 'short_model', 'age_category'],
      dtype='object')

In [49]:
numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

In [50]:
categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

In [51]:
preprocessor = ColumnTransformer(transformers=[
        ('numerical', numerical_transformer, numerical),
        ('categorical', categorical_transformer, categorical)
    ])

In [59]:
X = df.drop('price_category', axis=1)
y = df['price_category']

In [60]:
models = (
        LogisticRegression(solver='liblinear'),
        RandomForestClassifier(),
        MLPClassifier(activation='logistic', hidden_layer_sizes=(256, 128, 64))
    )

best_score = .0
best_pipe = None
for model in models:
    pipe = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    score = cross_val_score(pipe, X, y, cv=4, scoring='accuracy')
    print(f'model: {type(model).__name__}, acc_mean: {score.mean():.4f}, acc_std: {score.std():.4f}')

    if score.mean() > best_score:
        best_score = score.mean()
        best_pipe = pipe

print(f'best model: {type(best_pipe.named_steps["classifier"]).__name__}, accuracy: {best_score:.4f}')

model: LogisticRegression, acc_mean: nan, acc_std: nan
model: RandomForestClassifier, acc_mean: nan, acc_std: nan
model: MLPClassifier, acc_mean: nan, acc_std: nan


4 fits failed out of a total of 4.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\maubeastK\anaconda3\lib\site-packages\pandas\core\indexes\base.py", line 3621, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas\_libs\index.pyx", line 136, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\index.pyx", line 163, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\_libs\hashtable_class_helper.pxi", line 5198, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\_libs\hashtable_class_helper.pxi", line 5206, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'price_category'

The above ex

AttributeError: 'NoneType' object has no attribute 'named_steps'