In [13]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class FeatureMerger(BaseEstimator, TransformerMixin):
  """Merge several text columns into one space-separated document per row.

  The merged Series is meant to be fed to a text vectorizer, which expects a
  single iterable of documents rather than a 2-D table.
  """

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns ``self`` unchanged."""
    return self

  def transform(self, X):
    """Join the non-missing cells of every row with a single space.

    Args:
      X: 2-D tabular input (DataFrame, or anything ``pd.DataFrame`` accepts).

    Returns:
      A Series named ``'result'`` that shares the row index of ``X`` and
      holds one merged string per row.
    """
    # Wrapping lets array input work too, not only DataFrames.
    frame = pd.DataFrame(X)
    cells = frame.astype(object).to_numpy()
    present = frame.notna().to_numpy()
    # Missing cells are skipped: stringifying them would add a literal "nan"
    # word to every affected document and pollute the vocabulary.
    documents = [
      ' '.join(str(value) for value, keep in zip(row, mask) if keep)
      for row, mask in zip(cells, present)
    ]
    return pd.Series(documents, index=frame.index, name='result', dtype=object)

In [4]:
# Numeric features: missing values are replaced by the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Text features routed here are imputed with the most frequent value and then
# one-hot encoded; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Text features routed here are merged into one document per row and turned
# into TF-IDF features (vocabulary capped at 2000 terms).
vectorized_transformer = Pipeline([
    ('merge', FeatureMerger()),
    ('vectorize', TfidfVectorizer(max_features=2000)),
])

In [19]:
# Classifier placed at the end of the pipeline built in process_data.
model = LogisticRegression(max_iter=1000)
# Alternative estimator kept for comparison; uncomment to swap it in:
# model = RandomForestClassifier()

In [20]:
def process_data(df, target_column, cv=5, max_categories=10, estimator=None):
  """Return the mean cross-validated accuracy of predicting ``target_column``.

  Feature columns are routed by dtype:
    * numpy integer/float columns of any width -> ``numerical_transformer``
    * text columns with fewer than ``max_categories`` distinct values
      -> ``categorical_transformer``
    * all other text columns -> ``vectorized_transformer``
  Columns matching none of these groups (booleans, datetimes, categoricals,
  ...) are not listed in any transformer, so they are left out by
  ColumnTransformer's default ``remainder='drop'``.

  Args:
    df: DataFrame holding both the features and the target.
    target_column: name of the column to predict.
    cv: cross-validation setting forwarded to ``cross_val_score``.
    max_categories: distinct-value count below which a text column is
      one-hot encoded instead of TF-IDF vectorized.
    estimator: classifier used as the final pipeline step; when omitted the
      notebook-level ``model`` is used.

  Returns:
    The mean of the per-fold accuracy scores.
  """
  y = df[target_column]
  X = df.drop(columns=[target_column])

  types = pd.api.types
  numerical_cols, categorical_cols, vectorized_cols = [], [], []
  for cname in X.columns:
    column = X[cname]
    # Accept pandas' dedicated string dtype as well as plain ``object``;
    # comparing the dtype to "object" alone silently drops such columns.
    if types.is_object_dtype(column) or types.is_string_dtype(column):
      if column.nunique() < max_categories:
        categorical_cols.append(cname)
      else:
        vectorized_cols.append(cname)
    # Any numpy signed/unsigned integer or float width counts as numeric,
    # not just int64/float64. Booleans (kind 'b') stay excluded, as before.
    elif isinstance(column.dtype, np.dtype) and column.dtype.kind in 'iuf':
      numerical_cols.append(cname)

  preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('vect', vectorized_transformer, vectorized_cols)
    ])
  final_estimator = model if estimator is None else estimator
  my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', final_estimator)
                               ])
  return cross_val_score(my_pipeline, X, y, cv=cv, scoring='accuracy').mean()


In [6]:
# News dataset: predict 'category', scored with 3-fold cross-validation.
news = pd.read_csv('News.csv')
print("Average accuracy:", process_data(news, target_column='category', cv=3))

Average accuracy: 0.5925823680812077


In [10]:
# Tortilla prices dataset: predict 'Store type' with the default fold count.
tortilla = pd.read_csv('tortilla_prices.csv')
print("Average accuracy:", process_data(tortilla, target_column='Store type'))

Average accuracy: 0.7084084947764306


In [31]:
# Both customer files are read with 'ID' as the index and stacked into one
# frame before cross-validating on 'Segmentation'.
customers1, customers2 = (
    pd.read_csv(path, index_col='ID') for path in ('Train.csv', 'Test.csv')
)
customers = pd.concat([customers1, customers2])
print("Average accuracy:", process_data(customers, target_column='Segmentation'))

Average accuracy: 0.4670406732117812


In [7]:
salaries = pd.read_csv('ds_salaries.csv')
# Fold the abbreviated title into its long form so both count as one class.
salaries['job_title'] = salaries['job_title'].replace(
    {'ML Engineer': 'Machine Learning Engineer'}
)

In [9]:
# value_counts() sorts by descending frequency, so the first four rows are
# the four most common job titles.
top_titles = salaries['job_title'].value_counts().head(4)
top_titles

Data Engineer                1040
Data Scientist                840
Data Analyst                  612
Machine Learning Engineer     323
Name: job_title, dtype: int64

In [10]:
# Share of all rows covered by the four most common titles, as a percentage.
print(f'Top 4 job titles represent {round(top_titles.sum() / len(salaries) * 100, 2)}% of dataset')

Top 4 job titles represent 74.97% of dataset


Predicting the job title among the 4 most popular titles does not give high accuracy: the score is barely above the majority-class (ZeroR) baseline.

In [22]:
# Restrict the data to rows whose title is one of the top four, then try to
# predict the title from the remaining columns.
filtered_salaries = salaries.loc[salaries['job_title'].isin(top_titles.index)]
print("Average accuracy:", process_data(filtered_salaries, target_column='job_title'))

Average accuracy: 0.38259325044404974


In [11]:
# Majority-class (ZeroR) baseline: share of the most frequent title among the
# top four. `.iloc[0]` selects by position explicitly; a bare integer key on
# this string-indexed Series relies on a deprecated positional fallback.
top_titles.iloc[0] / top_titles.sum()

0.369449378330373

Predicting `experience_level` gives almost the same result as a ZeroR (majority-class) model.

In [23]:
# Same pipeline on the full salaries frame, now predicting 'experience_level'.
print("Average accuracy:", process_data(salaries, target_column='experience_level'))

Average accuracy: 0.6841544607190413


In [82]:
# Majority-class (ZeroR) baseline: share of rows in the most frequent
# experience level. `.iloc[0]` selects by position explicitly; a bare integer
# key on this string-indexed Series relies on a deprecated positional fallback.
salaries['experience_level'].value_counts().iloc[0] / len(salaries)

0.6700399467376831

In [25]:
# URL spam dataset: predict the 'is_spam' flag.
url = pd.read_csv('url_spam_classification.csv')
print("Average accuracy:", process_data(url, target_column='is_spam'))

Average accuracy: 0.9203522408293509
