In [1]:
import pandas as pd
df = pd.read_csv("data\\train.data",sep="\t")
target = pd.read_csv("data\\train_churn.labels.txt",  header=None)
target.value_counts()
df.dropna(how='all', axis=1, inplace=True) 
df.shape

(50000, 212)

In [2]:
numeric= df._get_numeric_data().columns
h_cardinal = [(x,df[x].nunique()) for x in df.columns if x not in numeric]
for x,y in h_cardinal:
    if y >1000 :
        print("High Cardinality : "+x)
        df.drop(columns= x,inplace =True)
    if y <=2:
        print("low cardinality : "+ x)
        df[x] = pd.factorize(df[x])[0]

low cardinality : Var191
High Cardinality : Var198
High Cardinality : Var199
High Cardinality : Var200
low cardinality : Var201
High Cardinality : Var202
low cardinality : Var208
low cardinality : Var211
low cardinality : Var213
High Cardinality : Var214
low cardinality : Var215
High Cardinality : Var216
High Cardinality : Var217
low cardinality : Var218
High Cardinality : Var220
High Cardinality : Var222
low cardinality : Var224


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer
from joblib import Parallel, delayed

# Define evaluation metrics
def evaluate_model(y_true, y_pred):
    return classification_report(y_true, y_pred, output_dict=True)


# Find numerical and categorical columns
numeric_features = df.select_dtypes(include=[np.float64]).columns.tolist()
categorical_features = df.select_dtypes(include=[object]).columns.tolist()

# Impute missing values for numerical features
imputer = SimpleImputer(strategy='constant', fill_value=0)
df[numeric_features] = imputer.fit_transform(df[numeric_features])
df[categorical_features] = df[categorical_features].astype(str)


params=dict(
    iterations=1000,
                           depth=5,
                           learning_rate=0.05,
                           loss_function='Logloss',
                           eval_metric='F1',
                           l2_leaf_reg=8,
                           verbose=400,
                           bagging_temperature=8,
                           border_count=32,
                           random_strength=16,
                           class_weights=[1, 22],
                           random_seed=42
)

# Initialize CatBoost classifier
model = CatBoostClassifier(**params)


def cross_validate_fold(model, X, y, train_idx, test_idx, categorical_features):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Fit the model
    model.fit(X_train, y_train, cat_features=categorical_features)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    f1_score = evaluate_model(y_test, y_pred)
    return f1_score

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform parallelized 10-fold cross-validation
f1_scores = Parallel(n_jobs=10)(delayed(cross_validate_fold)(model, df, target, train_idx, test_idx, categorical_features) 
                                for train_idx, test_idx in kfold.split(df, target))


In [9]:
import numpy as np
# Calculate average F1-score
avg_f1_score = np.mean([score['1']['f1-score'] for score in f1_scores])
print("Average F1-score:", avg_f1_score)
avg_precission = np.mean([score['1']['precision'] for score in f1_scores])
print("Average Precission:", avg_precission)
avg_recall = np.mean([score['1']['recall'] for score in f1_scores])
print("Average Recall:", avg_recall)


import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("catboost")
mlflow.log_params(params)
mlflow.log_metrics({"F1-Score":avg_f1_score,"Precision": avg_precission,"Recall": avg_recall})
model.fit(df, target, cat_features=categorical_features)
mlflow.catboost.log_model(model, "catboost")

Average F1-score: 0.20091762311822176
Average Precission: 0.11566310233151264
Average Recall: 0.7644066165146309
0:	learn: 0.7772139	total: 202ms	remaining: 3m 21s
400:	learn: 0.8124548	total: 25.4s	remaining: 37.9s
800:	learn: 0.8511478	total: 53.3s	remaining: 13.3s
999:	learn: 0.8611402	total: 1m 7s	remaining: 0us


<mlflow.models.model.ModelInfo at 0x26e1def3920>

In [7]:
mlflow.catboost.log_model(model, "catboost")

CatBoostError: There is no trained model to use save_model(). Use fit() to train model. Then use this method.

In [124]:
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

numeric= df._get_numeric_data().columns
categoric = [x for x in df.columns if x not in numeric]

for col in df.columns:
    if col not in numeric:
        df[col]=df[col].fillna("nan")
        df[col]=df[col].astype('category')
    if col in numeric:
        df[col]=df[col].fillna(0)

In [125]:
# evaluation of a model fit using mutual information input features
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
import numpy as np


# prepare input data
def prepare_inputs(X_train, X_test,_col):
	oe = OrdinalEncoder()
	oe.fit(np.concatenate([X_train, X_test], axis=0))
	X_train_enc = oe.transform(X_train)
	X_test_enc = oe.transform(X_test)
	return X_train_enc, X_test_enc

#OHE
def one_hot(X_train, X_test,_col):
	### One hot encoding
	ohe = OneHotEncoder(sparse=False)
	ohe.fit(np.concatenate([X_train, X_test], axis=0))
	X_train_ohe = ohe.transform(X_train)
	X_test_ohe = ohe.transform(X_test)
	return X_train_ohe, X_test_ohe

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

# feature selection
def select_features(X_train, y_train, X_test,_col,_k):
	fs = SelectKBest(score_func=mutual_info_classif,k= _k)
	fs.fit(X_train, y_train)
	X_train_fs = fs.transform(X_train)
	X_test_fs = fs.transform(X_test)
	print(fs.get_feature_names_out(_col))
	return X_train_fs, X_test_fs 

# load the dataset
X, y = df.loc[:,categoric],target
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test,X.columns)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# feature selection
X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc,X.columns,5)

 
train_rows = X_train.index
test_rows = X_test.index
X_train_fs_num,X_test_fs_num   = select_features(df.loc[train_rows,numeric],target.iloc[train_rows] ,df.loc[test_rows,numeric],df.loc[:,numeric].columns,30)

X_train_fs, X_test_fs = one_hot(X_train_fs, X_test_fs,X.columns)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


['Var193' 'Var195' 'Var205' 'Var210' 'Var219']


  y = column_or_1d(y, warn=True)


['Var7' 'Var28' 'Var46' 'Var65' 'Var68' 'Var72' 'Var73' 'Var74' 'Var81'
 'Var91' 'Var100' 'Var102' 'Var112' 'Var113' 'Var121' 'Var126' 'Var136'
 'Var137' 'Var138' 'Var152' 'Var160' 'Var164' 'Var175' 'Var185' 'Var191'
 'Var201' 'Var213' 'Var215' 'Var218' 'Var224']




In [126]:
from sklearn.manifold import TSNE
import pandas as pd
import seaborn as sns
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib.pyplot as plt


# We want to get TSNE embedding with 2 dimensions
X, y = np.concatenate([X_train_fs,X_train_fs_num],axis=1), y_train_enc
test_X,test_y =  np.concatenate([X_test_fs,X_test_fs_num],axis=1), y_test 
tsne = TSNE(n_components=2,n_jobs=12,perplexity=45,n_iter=1000)
tsne_result = tsne.fit_transform(X)
tsne_result.shape
# (1000, 2)
# Two dimensions for each of our images
 
# Plot the result of our TSNE with the label color coded
# A lot of the stuff here is about making the plot look pretty and not TSNE
tsne_result_df = pd.DataFrame({'tsne_1': tsne_result[:,0], 'tsne_2': tsne_result[:,1], 'label': y})
fig, ax = plt.subplots(1)
sns.scatterplot(x='tsne_1', y='tsne_2', hue='label', data=tsne_result_df, ax=ax,s=120)
lim = (tsne_result.min()-5, tsne_result.max()+5)
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.set_aspect('equal')
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)


KeyboardInterrupt: 

In [None]:
test_X,test_y =  np.concatenate([X_test_fs,X_test_fs_num],axis=1), y_test 

full_x = np.concatenate([test_X,X])
full_y = target

inliers = full_x[np.array(target==-1).flatten()]
outliers = full_x[np.array(target==1).flatten()]

from sklearn.svm import OneClassSVM
clf = OneClassSVM(gamma='auto').fit(outliers[:3000])

In [101]:
y_hat_out = clf.predict(outliers[3000:])
np.unique(y_hat_out, return_counts=True)

(array([-1,  1], dtype=int64), array([671,   1], dtype=int64))

In [102]:
y_hat_in = clf.predict(inliers)
np.unique(y_hat_in, return_counts=True)

(array([-1,  1], dtype=int64), array([46304,    24], dtype=int64))

In [86]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

full_y = (full_y+1)//2
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(full_x, full_y, test_size=0.24, random_state=42)

# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train XGBoost model
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'eta': 0.1,
    'max_depth': 2,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'gamma': 0.1,
    'seed': 42
}

dtrain = xgb.DMatrix(X_train_resampled, label=y_train_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

num_round = 500
bst = xgb.train(params, dtrain, num_round)

# Make predictions
y_pred = bst.predict(dtest)

# Evaluate model
threshold = 0.5  # Adjust threshold if necessary
y_pred_binary = [1 if p >= threshold else 0 for p in y_pred]

report = classification_report(y_true=y_test,y_pred= y_pred_binary)
print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.93      1.00      0.96     11135
           1       0.25      0.00      0.00       865

    accuracy                           0.93     12000
   macro avg       0.59      0.50      0.48     12000
weighted avg       0.88      0.93      0.89     12000



In [45]:
np.unique(y_pred_binary,return_counts=True),np.unique(y_train_resampled,return_counts=True)

((array([0, 1]), array([17395,  2605], dtype=int64)),
 (array([0, 1], dtype=int64), array([27796, 27796], dtype=int64)))

In [None]:
from sklearn.linear_model import LogisticRegression
knn_model = LogisticRegression(max_iter=3000,fit_intercept=True,C=12) # You can change the value of 'k' as needed.

knn_model.fit(X_train_resampled, y_train_resampled)


y_pred = knn_model.predict(X_test)
y_pred_binary = [1 if p >= 0.2 else 0 for p in y_pred]
report = classification_report(y_true=y_test,y_pred= y_pred_binary)
print("Classification Report:\n", report)

In [9]:
df.loc[:,'Var8'].unique()

array([nan])

In [84]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
from sklearn.impute import SimpleImputer


# Separate features and target
X = df
y = (target+1)//2

# Find numerical and categorical columns
numeric_features = X.select_dtypes(include=[np.float64]).columns.tolist()
categorical_features = X.select_dtypes(include=[object]).columns.tolist()

# Impute missing values for numerical features
imputer = SimpleImputer(strategy='constant',fill_value=0)
X[numeric_features] = imputer.fit_transform(X[numeric_features])

X[categorical_features] = X[categorical_features].astype(str)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=176)

# Initialize CatBoost classifier
model1 = CatBoostClassifier(iterations=1000,
                           depth=5,
                           learning_rate=0.03,
                           loss_function='Logloss',
                           eval_metric='F1',
                           l2_leaf_reg= 8,
                           verbose=400,
                           bagging_temperature=8,
                           border_count=32,
                           random_strength=16,
                           class_weights=[1, 18],
                           random_seed=42)


# Fit the model
model1.fit(X_train, y_train, cat_features=categorical_features)

# Make predictions
y_pred1 = model1.predict(X_test)

# Evaluate the model
report = classification_report(y_test, y_pred1)
print("Classification Report:\n", report)


0:	learn: 0.7408615	total: 85.6ms	remaining: 1m 25s
400:	learn: 0.7623188	total: 24.7s	remaining: 36.9s
800:	learn: 0.7932798	total: 53.9s	remaining: 13.4s
999:	learn: 0.8078068	total: 1m 8s	remaining: 0us
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.55      0.70      4640
           1       0.12      0.81      0.21       360

    accuracy                           0.57      5000
   macro avg       0.55      0.68      0.46      5000
weighted avg       0.91      0.57      0.67      5000



In [85]:
np.unique(y_pred1,return_counts=True)

(array([0, 1], dtype=int64), array([2621, 2379], dtype=int64))

In [68]:
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X[numeric_features], y, test_size=0.24, random_state=42)

# Apply SMOTE to balance the data
smote = SMOTE(random_state=42)
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.compose import ColumnTransformer
column_transformer = ColumnTransformer([('le', OrdinalEncoder(),categorical_features),('encoder', OneHotEncoder(), categorical_features)], remainder='passthrough')
pre_processing=column_transformer.fit(X)

X_train_processed = pre_processing.transform(X_train)
X_test_processed = pre_processing.transform(X_test)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)


from sklearn.linear_model import LogisticRegression
knn_model = LogisticRegression() # You can change the value of 'k' as needed.

knn_model.fit(X_train_resampled, y_train_resampled)

y_pred = knn_model.predict_proba(X_test_processed)
y_pred_binary = [1 if p[1] >= 0.4 else 0 for p in y_pred]
report = classification_report(y_true=y_test,y_pred= y_pred_binary)
print("Classification Report:\n", report)


  y = column_or_1d(y, warn=True)


Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.12      0.21      4636
           1       0.08      0.91      0.14       364

    accuracy                           0.18      5000
   macro avg       0.51      0.52      0.17      5000
weighted avg       0.88      0.18      0.20      5000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [69]:
report = classification_report(y_test, np.multiply(y_pred1,y_pred_binary))
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.36      0.53      4636
           1       0.09      0.85      0.17       364

    accuracy                           0.40      5000
   macro avg       0.53      0.61      0.35      5000
weighted avg       0.90      0.40      0.50      5000

