In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureMerger(BaseEstimator, TransformerMixin):
  """Stateless transformer that collapses each row into a single string.

  Every cell of a row is converted with ``str`` and the pieces are joined
  with single spaces, so that several text columns can be handed to a
  vectorizer that expects one document per sample. Missing values are
  stringified like any other value (a float NaN becomes the token ``'nan'``).
  """

  def fit(self, X, y=None):
    """Nothing is learned; return ``self`` so the step can sit in a Pipeline."""
    return self

  def transform(self, X):
    """Join the values of every row of ``X`` into one space-separated string.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Table whose rows are merged. Anything that is not already a
        DataFrame (ndarray, list of rows, Series) is wrapped with
        ``pd.DataFrame`` first, so the transformer no longer fails on
        inputs that lack a row-wise ``apply``.

    Returns
    -------
    pandas.Series
        Series named ``'result'`` with one string per input row, carrying
        the row index of the (possibly wrapped) input.
    """
    frame = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    # row.values.astype(str) stringifies each cell before joining.
    merged = frame.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    return merged.rename('result')

In [3]:
# Numeric columns: fill gaps with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Free-text columns: merge each row into one document, then build TF-IDF
# features limited to 2000 terms.
vectorized_transformer = Pipeline([
    ('merge', FeatureMerger()),
    ('vectorize', TfidfVectorizer(max_features=2000)),
])

In [4]:
# Classifier used by process_data() as the final step of its pipeline.
# TODO: try another model
model = LogisticRegression(max_iter=1000)

In [32]:
def process_data(df, target_column, cv=5, max_categories=10):
  """Cross-validate the shared preprocessing + model pipeline on one dataset.

  Feature columns are routed by dtype and cardinality:

  * object columns with fewer than ``max_categories`` distinct values go to
    ``categorical_transformer``;
  * the remaining object columns go to ``vectorized_transformer``;
  * NumPy integer / float columns of any width go to
    ``numerical_transformer``.

  Columns that match none of these rules are not passed to any transformer
  and are therefore left out of the features (ColumnTransformer's default
  ``remainder`` is ``'drop'``).

  Parameters
  ----------
  df : pandas.DataFrame
      Full dataset, including the target column.
  target_column : str
      Name of the column to predict.
  cv : int, default 5
      Forwarded to ``cross_val_score`` as its ``cv`` argument.
  max_categories : int, default 10
      Object columns with fewer distinct values than this are one-hot
      encoded; the others are treated as free text.

  Returns
  -------
  float
      Mean accuracy over the cross-validation folds.
  """
  y = df[target_column]
  X = df.drop(columns=[target_column])

  categorical_cols, vectorized_cols, numerical_cols = [], [], []
  for cname in X.columns:
    dtype = X[cname].dtype
    if dtype == "object":
      # Count distinct values once per column instead of once per rule.
      if X[cname].nunique() < max_categories:
        categorical_cols.append(cname)
      else:
        vectorized_cols.append(cname)
    elif isinstance(dtype, np.dtype) and dtype.kind in "iuf":
      # Any signed/unsigned integer or float width, not just int64/float64,
      # so e.g. int32 or float32 columns are no longer silently dropped.
      numerical_cols.append(cname)

  preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('vect', vectorized_transformer, vectorized_cols)
    ])
  my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)
                               ])
  return cross_val_score(my_pipeline, X, y, cv=cv, scoring='accuracy').mean()


In [6]:
# Score the pipeline on the news data, predicting 'category' with 3 folds.
news = pd.read_csv(filepath_or_buffer='News.csv')
print("Average accuracy:", process_data(df=news, target_column='category', cv=3))

Average accuracy: 0.5925823680812077


In [10]:
# Score the pipeline on the tortilla price data, predicting 'Store type'.
tortilla = pd.read_csv(filepath_or_buffer='tortilla_prices.csv')
print("Average accuracy:", process_data(df=tortilla, target_column='Store type'))

Average accuracy: 0.7084084947764306


In [31]:
# Stack both customer files (indexed by 'ID') into one frame and score the
# pipeline on it, predicting 'Segmentation'.
customers1 = pd.read_csv(filepath_or_buffer='Train.csv', index_col='ID')
customers2 = pd.read_csv(filepath_or_buffer='Test.csv', index_col='ID')
customers = pd.concat(objs=[customers1, customers2])
print("Average accuracy:", process_data(df=customers, target_column='Segmentation'))

['Gender', 'Ever_Married', 'Graduated', 'Profession', 'Spending_Score', 'Var_1'] ['Age', 'Work_Experience', 'Family_Size']
Average accuracy: 0.4670406732117812
