## Libraries

General

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm

SKLearn

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA

Regression

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

## Plots

Scatter Plot

In [None]:
# Scatter of feature C1 (x-axis) against the target (y-axis), drawn with red '+' markers.
plt.xlabel('C1')
plt.ylabel('target')

plt.scatter(x=df['C1'], y=df['target'], color='red', marker='+')

Bar Graph

In [None]:
# Cross-tabulate C1 against target and draw the counts as grouped bars (one group per C1 value).
pd.crosstab(index=df['C1'], columns=df['target']).plot(kind='bar')

Histogram

In [None]:
# Default figure size for this and all later plots (width, height in inches).
plt.rcParams.update({'figure.figsize': (10, 6)})

# Histogram of the target column: 20 bins, bars drawn at 80% of the bin width.
plt.hist(x=df['target'], bins=20, rwidth=0.8)
plt.xlabel('target')
plt.ylabel('Count')
# Optional: switch the count axis to a log scale.
# plt.yscale('log')
plt.show()

Histogram with Bell Curve

In [None]:
# Density-normalised histogram of the target with a normal curve drawn on top.
# density=True rescales the bars to unit area, so they share a y-axis with the PDF;
# the axis therefore shows a density, not a raw count.
plt.hist(df.target, bins=20, rwidth=0.8, density=True)
plt.xlabel('target')
plt.ylabel('Density')
# plt.yscale('log')

# Evaluate the normal PDF (sample mean and sample std of target) on an evenly
# spaced grid over the observed range. linspace keeps the number of points fixed
# regardless of the scale of the data and includes the maximum value.
# `norm` is scipy.stats.norm, imported in the General imports cell.
rng = np.linspace(df.target.min(), df.target.max(), 200)
plt.plot(rng, norm.pdf(rng, df.target.mean(), df.target.std()))

In [None]:
# Plot Histogram
def plot_histogram(df, column, bins=20, rwidth=0.8, figsize=(10, 6), log=True):
    """Draw and show a histogram of one DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to plot; also used in the x label and the title.
    bins : int, default 20
        Passed to ``hist`` as the bin specification.
    rwidth : float, default 0.8
        Relative bar width passed to ``hist``.
    figsize : tuple of (float, float), default (10, 6)
        Size of the figure created for this plot, in inches.
    log : bool, default True
        When True the count axis uses a log scale and is labelled accordingly;
        when False a linear count axis is used.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # A dedicated figure keeps the size local to this call instead of
    # rewriting the global rcParams every time the helper runs.
    fig, ax = plt.subplots(figsize=figsize)

    ax.hist(df[column], bins=bins, rwidth=rwidth)
    ax.set_xlabel(f'{column}')
    if log:
        ax.set_ylabel('Count (in log scale)')
        ax.set_yscale('log')
    else:
        ax.set_ylabel('Count')
    ax.set_title(f'Histogram of {column}')
    plt.show()

# Histogram of the target column using the helper's default bins and bar width.
plot_histogram(df, column='target')

## Analysis

General

In [None]:
# Keep only numeric columns, then average each of them within every target group.
df.select_dtypes(include='number').groupby(by='target').mean()

In [None]:
# Distinct values present in the target column.
df.target.unique()

In [None]:
# Number of rows per distinct target value.
df.target.value_counts()

Handling Null / NaN Values

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Replace missing values in C1 and C2 with 0, then re-check the per-column missing counts.
fill_cols = ['C1', 'C2']
df[fill_cols] = df[fill_cols].fillna(value=0)
df.isnull().sum()

In [None]:
# Drop every row that still has at least one missing value, then confirm none remain.
df = df.dropna(axis=0, how='any')
df.isnull().sum()

Encoding

In [None]:
# Label Encoding
# Learn the distinct labels of C1 first, then map each row to its integer code.
le = LabelEncoder().fit(df['C1'])
df['C1_encoded'] = le.transform(df['C1'])
df

In [None]:
# Ordinal Encoding
# The category list fixes the code order: "High" -> 0, "Medium" -> 1, "Low" -> 2.
oe = OrdinalEncoder(
    categories=[["High", "Medium", "Low"]],
    dtype=int,
)
# The encoder expects a 2-D input, hence the double brackets around the column name.
df["C1_encoded"] = oe.fit_transform(df[["C1"]])
df

In [None]:
# One Hot Encoding
# Expand categorical columns into indicator columns (dropping the first level of
# each to avoid a redundant column), then cast the whole frame to float.
df = pd.get_dummies(df, drop_first=True).astype(float)
df

### Outlier

1. **Percentile**

In [None]:
# Percentile cut-offs for target: 1st percentile as the floor, 99.9th as the ceiling.
# NOTE(review): the two tails are trimmed asymmetrically (1% vs 0.1%) - confirm this is intended.
min_threshold, max_threshold = df['target'].quantile(q=[0.01, 0.999])
min_threshold, max_threshold

In [None]:
# Rows whose target lies strictly above the ceiling or strictly below the floor.
outliers = df.loc[df['target'].gt(max_threshold) | df['target'].lt(min_threshold)]
outliers

In [None]:
# Keep every row that is not a percentile outlier. The bounds are inclusive so
# the kept rows are the exact complement of the outlier selection (which uses
# strict > / <); rows sitting exactly on a threshold are no longer dropped.
# Rows with a missing target are excluded, as before.
df = df[df.target.between(min_threshold, max_threshold)]
df.shape

2. **3 Std. Deviation**

In [None]:
# Upper cut-off: mean of target plus three standard deviations.
upper_limit = df['target'].mean() + df['target'].std() * 3
upper_limit

In [None]:
# Lower cut-off: mean of target minus three standard deviations.
lower_limit = df['target'].mean() - df['target'].std() * 3
lower_limit

In [None]:
# Rows whose target is strictly beyond either three-sigma limit.
outliers = df.loc[df['target'].gt(upper_limit) | df['target'].lt(lower_limit)]
outliers

In [None]:
# Rows inside the three-sigma band. Bounds are inclusive so this frame is the
# exact complement of the outlier selection (strict > / <); a value sitting
# exactly on a limit is kept instead of silently disappearing from both sets.
df_no_outlier_std_dev = df[df.target.between(lower_limit, upper_limit)]
df_no_outlier_std_dev

3. **Z Score**

In [None]:
# Z score of target: distance from the column mean, measured in standard deviations.
# Stored as a new 'Zscore' column on df.
df['Zscore'] = df['target'].sub(df['target'].mean()).div(df['target'].std())
df

In [None]:
# Rows more than three standard deviations away from the mean, in either direction.
outliers = df[df['Zscore'].abs() > 3]
outliers

In [None]:
# Rows whose Z score is within [-3, 3]. The comparison is inclusive so this is
# the exact complement of the outlier selection (|Z| > 3); a row with |Z| == 3
# is kept rather than being excluded from both frames.
df_no_outliers = df[df.Zscore.abs() <= 3]
df_no_outliers

4. **IQR**

In [None]:
# First and third quartiles of target (25th and 75th percentiles).
Q1, Q3 = df['target'].quantile(0.25), df['target'].quantile(0.75)
Q1, Q3

In [None]:
# Interquartile range: the spread between the third and first quartiles.
IQR = Q3 - Q1
IQR

In [None]:
# Fences placed 1.5 IQRs below Q1 and 1.5 IQRs above Q3.
lower_limit, upper_limit = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
lower_limit, upper_limit

In [None]:
# Rows whose target falls strictly outside the IQR fences.
outliers = df.loc[df['target'].lt(lower_limit) | df['target'].gt(upper_limit)]
outliers

In [None]:
# Rows on or inside the IQR fences. Bounds are inclusive so this is the exact
# complement of the outlier selection (strict < / >); a value equal to a fence
# is not an outlier and is therefore kept.
df_no_outliers = df[df.target.between(lower_limit, upper_limit)]
df_no_outliers

Scaling

In [None]:
# Standardise the features: learn the scaling statistics from X, then apply them to X.
# NOTE(review): the scaler is fitted on all of X, before the train/test split further down - confirm that is acceptable.
scalar = StandardScaler().fit(X)
X_scaled = scalar.transform(X)
X_scaled

#### PCA

In [None]:
# By percentage
# A fractional n_components keeps as many components as needed to explain 95% of the variance.
# NOTE(review): PCA is fitted on X rather than X_scaled - confirm which input is intended.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
X_pca

In [None]:
# By number of components
# Keep exactly two principal components; this rebinds `pca` from the previous cell.
# NOTE(review): PCA is fitted on X rather than X_scaled - confirm which input is intended.
pca = PCA(2)
X_pca2 = pca.fit_transform(X)
X_pca2

## Model

Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Cross-validation splitter: 5 shuffled splits, each holding out 20% of the rows, seeded for repeatability.
cv = ShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=0,
)

Cross Val Score

In [None]:
# Score a 50-tree random forest on every split produced by `cv`.
# The forest is seeded like the splitter so the printed scores are the same on
# every run; without a seed each execution trains different trees.
scores = cross_val_score(
    RandomForestClassifier(n_estimators=50, random_state=0),
    X,
    y,
    cv=cv,
)
print(scores)
scores.mean()

Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Two-step pipeline: turn the input into token-count features, then fit multinomial Naive Bayes on them.
# NOTE(review): CountVectorizer works on raw text documents - confirm X_train holds strings here.
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

clf.fit(X_train, y_train)

Grid Search CV

In [None]:
# Candidate models for the search cells: each entry pairs an unfitted estimator
# ('model') with the hyper-parameter grid to explore for it ('params').
model_params = {
    'svm': dict(
        model=SVC(),
        params=dict(
            C=[1, 10, 20, 30, 50, 80, 100],
            kernel=['rbf', 'linear'],
            gamma=['auto', 'scale'],
        ),
    ),
    'random_forest': dict(
        model=RandomForestClassifier(),
        params=dict(
            n_estimators=[1, 5, 10, 20, 30, 40, 50, 60, 80, 100],
        ),
    ),
}

In [None]:
# Exhaustive grid search for every entry of model_params, using the shared `cv`
# splitter; one summary row (best score and best parameters) is collected per model.
scores = []

for model_name, spec in model_params.items():
    clf = GridSearchCV(
        spec['model'],
        spec['params'],
        cv=cv,
        return_train_score=False,
    )
    clf.fit(X, y)

    best = dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    )
    scores.append(best)

df_results = pd.DataFrame(data=scores, columns=['model', 'best_score', 'best_params'])
df_results

Randomized Search CV

In [None]:
# Randomized search for every entry of model_params: instead of trying every
# combination, a fixed number of parameter settings is sampled from each grid.
scores = []

for model_name, mp in model_params.items():
    # n_iter=5 samples several candidates per model; with a single draw the
    # reported "best" is just whichever setting happened to be picked.
    # random_state pins the sampling so the table is identical on every run.
    clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=cv,
        return_train_score=False,
        n_iter=5,
        random_state=0,
    )
    clf.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

df_results = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df_results

Grid Search CV for Param Tuning

In [None]:
# Exhaustive 5-fold grid search over the k-nearest-neighbours hyper-parameters.
clf = GridSearchCV(KNeighborsClassifier(), {
    'n_neighbors': [3, 5, 10, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40, 50, 60],
    'p': [1, 2]
}, cv=5, return_train_score=False)

clf.fit(X, y)

# One row per parameter combination. Every searched parameter is shown,
# including leaf_size; leaving it out makes several rows look like duplicates.
df_result = pd.DataFrame(clf.cv_results_)
df_result[['param_n_neighbors', 'param_p', 'param_weights', 'param_algorithm', 'param_leaf_size', 'mean_test_score']]

Bagging

In [None]:
# Bagging ensemble of 100 decision-tree classifiers, each trained on a random 80% sample of the rows.
bag_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)

# Mean score of the ensemble across the splits produced by `cv`, on the scaled features.
scores = cross_val_score(estimator=bag_model, X=X_scaled, y=y, cv=cv)
scores.mean()

In [None]:
# Bagging ensemble of 100 decision-tree regressors, each trained on a random 80% sample of the rows.
# This rebinds `bag_model` from the classifier cell above.
bag_model = BaggingRegressor(
    estimator=DecisionTreeRegressor(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0,
)

# Mean score of the ensemble across the splits produced by `cv`, on the scaled features.
scores = cross_val_score(estimator=bag_model, X=X_scaled, y=y, cv=cv)
scores.mean()

## Result

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predict on the held-out rows and tabulate actual (rows) against predicted (columns) classes.
# NOTE(review): `model` is not defined in any visible cell - bind it to a fitted estimator first.
y_pred = model.predict(X_test)
cf = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the matrix; the empty format string prints each cell value as-is.
ax = sns.heatmap(cf, annot=True, fmt="")
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

Classification Report

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics for the held-out predictions.
# The report is a pre-formatted string, so it is printed rather than displayed.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)