In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier

from tensorflow import keras

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [2]:
df = pd.read_csv('dataset.csv')
def categorical_column_fetch(df):
    categorical_columns = []
    for i in df.columns:
        if df[i].dtype == 'object':
            categorical_columns.append(i)
            continue
        else:
            pass
    return categorical_columns

categorical_columns = categorical_column_fetch(df)

In [3]:
df = df.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)  
# Strip whitespace from object columns

df.replace('', np.nan, inplace=True)  
# Replace empty strings with NaN

df.dropna(inplace=True)  
# Drop rows with NaN values

df.drop('customerID', axis=1, inplace=True)  
# Drop the 'customerID' column

df['TotalCharges'] = df['TotalCharges'].astype(float)  
# Convert 'TotalCharges' column to float

# Sample equal number of 'No' instances as 'Yes' instances
yes_data = df[df['Churn'] == 'Yes']
no_data = df[df['Churn'] == 'No'].sample(n=len(yes_data), random_state=24)
df1 = pd.concat([yes_data, no_data], ignore_index=True)

df1 = df1.sample(frac=1, random_state=24).reset_index(drop=True)  # Shuffle the dataframe

new_df = df1.copy()
df = df1.copy()

In [4]:
X= df.iloc[:, :-1]

y=df.iloc[:, -1]

le = LabelEncoder()
y = le.fit_transform(y)

cat_columns = [i for i in X.columns if df[i].dtype == 'object']
numerical_columns = ['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 4)


In [5]:
X_train.shape, X_test.shape

((2990, 19), (748, 19))

In [6]:
target_transformer = Pipeline([('label_encoder',LabelEncoder())])

categorical_transformer = Pipeline([('one_hot_encoder',OneHotEncoder(drop='first',
                                                                     handle_unknown='ignore'))
                                    ])

# Create the ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer([('categorical',categorical_transformer,cat_columns)],
                                                remainder='passthrough')

# Create the final pipeline
pipeline = Pipeline([('preprocessor', preprocessor),
                     ('Standard Scalar', StandardScaler()),
                     ('clf', LogisticRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)


In [7]:
y_pred = pipeline.predict(X_test)

In [8]:
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)

# Print the classification report
print(report)


              precision    recall  f1-score   support

           0       0.79      0.73      0.76       388
           1       0.73      0.79      0.76       360

    accuracy                           0.76       748
   macro avg       0.76      0.76      0.76       748
weighted avg       0.76      0.76      0.76       748



In [9]:
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


Cross-Validation Scores: [0.76203209 0.77406417 0.76069519 0.77108434 0.78313253]
Mean CV Score: 0.7702016622640293
Standard Deviation: 0.008246459674372122


In [10]:
from sklearn.model_selection import GridSearchCV

# Assuming you have defined X and y

# Define the hyperparameters and their possible values
param_grid = {
    'clf__C': [0.1, 1.0, 10.0],
    'clf__penalty': ['l2']
}

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('Standard Scalar', StandardScaler()),
    ('clf', LogisticRegression())
])

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)


Best Hyperparameters: {'clf__C': 1.0, 'clf__penalty': 'l2'}
Best Score: 0.7698996655518394


In [11]:
# Define the hyperparameters and their possible values for each model
param_grid_rf = {
    'decision_tree__n_estimators': [100, 200, 300],
    'decision_tree__max_depth': [None, 5, 10]
}

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('Standard Scalar', StandardScaler()),
    ('decision_tree', RandomForestClassifier())
])

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid_rf, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

Best Hyperparameters: {'decision_tree__max_depth': 5, 'decision_tree__n_estimators': 100}
Best Score: 0.7755852842809364


In [12]:
param_grid_svm = {
    'svm__C': [0.1, 1, 10],
    'svm__kernel': ['linear', 'rbf']
}

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('Standard Scalar', StandardScaler()),
    ('svm', SVC())
])

# Perform grid search with cross-validation for SVM
grid_search_svm = GridSearchCV(pipeline, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Get the best model and its hyperparameters for SVM
best_model_svm = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_
best_score_svm = grid_search_svm.best_score_

# Print the best hyperparameters and score for SVM
print("SVM - Best Hyperparameters:", best_params_svm)
print("SVM - Best Score:", best_score_svm)

SVM - Best Hyperparameters: {'svm__C': 1, 'svm__kernel': 'rbf'}
SVM - Best Score: 0.7625418060200669


In [13]:
param_grid_xgb = {
    'xgb__learning_rate': [0.1, 0.01],
    'xgb__max_depth': [3, 5, 7],
    'xgb__n_estimators': [100, 200, 300]
}

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('Standard Scalar', StandardScaler()),
    ('xgb', XGBClassifier())
])

# Perform grid search with cross-validation for SVM
grid_search_xgb = GridSearchCV(pipeline, param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

# Get the best model and its hyperparameters for SVM
best_model_xgb = grid_search_xgb.best_estimator_
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_

# Print the best hyperparameters and score for XGBoost
print("XGBoost - Best Hyperparameters:", best_params_xgb)
print("XGBoost - Best Score:", best_score_xgb)


XGBoost - Best Hyperparameters: {'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 100}
XGBoost - Best Score: 0.7732441471571907


In [14]:
# Create the ANN model
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(30,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Create a preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Preprocess the training data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)

# Preprocess the test data
X_test_preprocessed = preprocessing_pipeline.transform(X_test)


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the preprocessed training data
model.fit(X_train_preprocessed, y_train, epochs=10)

# Evaluate the model on the preprocessed test data
loss, accuracy = model.evaluate(X_test_preprocessed, y_test)

# Print the accuracy of the model
print("ANN - Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
ANN - Accuracy: 0.7165775299072266


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pickle

# Define the hyperparameters and their possible values for each model
best_parameters = {'decision_tree__max_depth': [5], 
                   'decision_tree__n_estimators': [300]}

preprocessor = ColumnTransformer([('categorical',categorical_transformer,cat_columns)],
                                                remainder='passthrough')


# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
     ('Standard Scalar', StandardScaler()),
    ('decision_tree', RandomForestClassifier())
])

# pipeline.set_params(**best_parameters)

grid_search = GridSearchCV(pipeline, best_parameters, scoring='accuracy', cv=5)

grid_search.fit(X_train, y_train)


In [16]:
import pickle
with open('pipeline.pkl', 'wb') as f:
    pickle.dump(grid_search, f)


In [17]:
with open('pipeline.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)


In [18]:
predictions = loaded_pipeline.predict(X_test)

In [19]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
accuracy

0.7459893048128342

In [20]:
df1

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Male,0,Yes,No,32,Yes,Yes,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,78.90,2447.95,Yes
1,Female,1,Yes,No,26,Yes,No,Fiber optic,No,Yes,No,No,No,Yes,Month-to-month,No,Electronic check,84.95,2169.75,Yes
2,Female,0,No,No,13,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,20.20,273.25,No
3,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,No,Yes,No,Yes,Month-to-month,Yes,Bank transfer (automatic),89.05,1169.35,Yes
4,Male,0,No,No,56,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Bank transfer (automatic),25.95,1444.05,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3733,Male,0,Yes,Yes,63,Yes,Yes,DSL,Yes,Yes,Yes,No,Yes,Yes,Two year,No,Credit card (automatic),83.50,5435.00,No
3734,Male,0,Yes,No,47,Yes,Yes,Fiber optic,No,Yes,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,109.55,5124.55,Yes
3735,Female,0,Yes,No,49,No,No phone service,DSL,Yes,No,No,No,Yes,No,Month-to-month,No,Bank transfer (automatic),40.65,2070.75,No
3736,Male,0,Yes,No,24,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,101.05,2391.80,Yes


In [21]:
X_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
1927,Male,0,Yes,No,23,Yes,Yes,Fiber optic,Yes,Yes,Yes,No,Yes,No,Month-to-month,Yes,Electronic check,99.25,2186.40
3625,Female,0,Yes,Yes,21,Yes,No,Fiber optic,No,Yes,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,102.80,2110.15
3349,Male,0,Yes,Yes,1,Yes,No,Fiber optic,No,No,No,No,Yes,Yes,Month-to-month,No,Mailed check,89.15,89.15
1108,Female,0,No,No,60,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.60,1093.00
2415,Male,0,No,No,5,No,No phone service,DSL,No,Yes,No,No,Yes,Yes,Month-to-month,Yes,Mailed check,51.00,305.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1921,Female,0,No,No,25,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),19.80,475.20
709,Female,0,Yes,No,15,Yes,No,Fiber optic,No,No,Yes,No,Yes,No,Month-to-month,Yes,Electronic check,84.30,1308.40
2487,Male,0,Yes,No,8,Yes,Yes,Fiber optic,No,No,No,No,No,Yes,Month-to-month,Yes,Bank transfer (automatic),85.65,659.45
174,Male,0,No,Yes,49,Yes,Yes,Fiber optic,No,No,No,Yes,Yes,Yes,One year,Yes,Bank transfer (automatic),100.60,5069.65


In [32]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Load your data
# df1 = pd.read_csv(".csv")

# Split the data into features and target
X = df1.iloc[:, :-1]
y = df1.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=402)

# Define the hyperparameters and their possible values for the random forest model
param_grid_rf = {
    'decision_tree__n_estimators': [100, 200, 300],
    'decision_tree__max_depth': [None, 5, 10]
}

# Define the columns to be one-hot encoded
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines", "InternetService",
                       "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV",
                       "StreamingMovies", "Contract", "PaperlessBilling", "PaymentMethod"]

# Create the preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ~X.columns.isin(categorical_columns)),
    ('cat', OneHotEncoder(), categorical_columns)
])

# Define the SMOTE function
def smote_func(data):
    X, y = data
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Wrap SMOTE inside FunctionTransformer
smote_transformer = FunctionTransformer(smote_func, validate=False)

# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', smote_transformer),
    ('decision_tree', RandomForestClassifier())
])

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid_rf, cv=2, scoring='accuracy', error_score='raise')
grid_search.fit((X_train, y_train))

# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

# Predict on the test set using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: all features must be in [0, 2615] or [-2616, 0]

In [34]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer
import pandas as pd

# Load your data
# df1 = pd.read_csv(".csv")

# Split the data into features and target
X = df1.iloc[:, :-1]
y = df1.iloc[:, -1]

In [35]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=402)

In [37]:
# Define the hyperparameters and their possible values for the random forest model
param_grid_rf = {
    'decision_tree__n_estimators': [100, 200, 300],
    'decision_tree__max_depth': [None, 5, 10]
}

# Define the columns to be one-hot encoded
categorical_columns = ["gender", "Partner", "Dependents", "PhoneService", "MultipleLines",
                       "InternetService","OnlineSecurity", "OnlineBackup", "DeviceProtection",
                       "TechSupport", "StreamingTV","StreamingMovies", "Contract",
                       "PaperlessBilling", "PaymentMethod"]

# Create the preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ~X.columns.isin(categorical_columns)),
    ('cat', OneHotEncoder(), categorical_columns)
])


In [38]:
# Define the SMOTE function
def smote_func(data):
    X, y = data
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled

# Wrap SMOTE inside FunctionTransformer
smote_transformer = FunctionTransformer(smote_func, validate=False)


In [39]:
# Create the final pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('smote', smote_transformer),
    ('decision_tree', RandomForestClassifier())
])

In [44]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid_rf, cv=2, scoring='accuracy', error_score='raise')
grid_search.fit((X_train, y_train))

ValueError: all features must be in [0, 2989] or [-2990, 0]

In [48]:
X_train.shape

(2990, 19)

In [33]:




# Get the best model and its hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and score
print("Best Hyperparameters:", best_params)
print("Best Score:", best_score)

# Predict on the test set using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: all features must be in [0, 2989] or [-2990, 0]

In [None]:
pipeline