# Data preparation and ML ops

## Preprocessing
### Fake data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy import stats

np.random.seed(42)
n_samples = 100
data = {
    'income': np.random.normal(50000, 15000, n_samples),
    'credit_score' : np.random.normal(650,50, n_samples),
    'job_title' : np.random.choice(['Engineer', 'Teacher', 'Doctor', 'Artist'], n_samples),
    'target' : np.random.choice([0,1], n_samples)
}

data['income'][np.random.randint(0, n_samples, 5)] = np.nan
data['credit_score'][np.random.randint(0, n_samples, 3)] = np.nan
data['income'][np.random.randint(0, n_samples, 2)] = 150000
df_initial = pd.DataFrame(data)
df = pd.DataFrame(data)

### Handle values - cleaning missing values

In [2]:
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df[column].fillna(df[column].median(), inplace=True)

df.drop_duplicates(inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)


### Standardize variables 
- scaling
- Encoding one hot
- detectin outliers

In [3]:
scaler = StandardScaler()
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_features] = scaler.fit_transform(df[numeric_features])

df = pd.get_dummies(df, drop_first=True)
z_scores = np.abs(stats.zscore(df.select_dtypes(include=['float64', 'int64'])))
df = df[(z_scores<3).all(axis=1)]

In [4]:
#df.head()
df_initial.head()

Unnamed: 0,income,credit_score,job_title,target
0,57450.712295,579.231463,Teacher,0
1,47926.035482,628.967734,Teacher,0
2,59715.328072,632.864274,Engineer,0
3,72845.447846,609.886137,Artist,1
4,46487.699379,641.935714,Engineer,0


#### Analyze data
 - Skewed data
 - splits

In [5]:
df['income_log'] = np.log1p(df['income'])
df = df.sort_values(by='income_log', ascending=True)
# df.dropna(inplace=True)
# df

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
X_train

Unnamed: 0,income,credit_score,job_title_Doctor,job_title_Engineer,job_title_Teacher,income_log
25,0.068457,2.307458,False,False,False,0.066216
75,0.613229,0.847927,False,False,True,0.478238
40,0.549298,0.205858,True,False,False,0.437802
24,-0.433655,0.590632,False,True,False,-0.568551
64,-0.122476,0.993736,False,True,False,-0.130651
...,...,...,...,...,...,...
70,0.260376,-0.989984,False,False,False,0.231410
34,0.613721,-1.022006,True,False,False,0.478543
15,-0.447374,0.285177,False,False,True,-0.593073
38,-1.034225,0.833296,False,False,True,
