# Import all required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle

# Preprocess X_train and X_test using StandardScaler

In [2]:
def preprocess_data(X_train, X_test):
    """Standardize both splits using statistics learned from the training split.

    Parameters
    ----------
    X_train : array-like
        Training features; the scaler is fitted on these.
    X_test : array-like
        Held-out features; transformed with the already-fitted scaler so that
        no information from the test split influences the scaling.

    Returns
    -------
    tuple
        ``(scaled training features, scaled test features, fitted scaler)``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled, fitted_scaler


# Split the dataset using train_test_split

In [3]:
def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with ``X``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(partitions)

# Feature selection

In [4]:
def select_k_best(X_train, X_test, y_train, k=5):
    """Keep the ``k`` highest-scoring features according to ``f_regression``.

    The selector is fitted on the training split only and then applied to
    both splits.

    Parameters
    ----------
    X_train : array-like
        Training features used to fit the selector.
    X_test : array-like
        Held-out features, reduced with the fitted selector.
    y_train : array-like
        Training target used to score the features.
    k : int, default 5
        Number of features to keep.

    Returns
    -------
    tuple
        ``(reduced training features, reduced test features, fitted selector)``.
    """
    # f_regression is the univariate scoring function for regression targets.
    fitted_selector = SelectKBest(score_func=f_regression, k=k).fit(X_train, y_train)
    train_reduced = fitted_selector.transform(X_train)
    test_reduced = fitted_selector.transform(X_test)
    return train_reduced, test_reduced, fitted_selector

# Model 

In [5]:
# Tune every candidate regressor with a cross-validated grid search and keep
# the best estimator found for each one.
def train_models(X_train, y_train, cv=5, random_state=42):
    """Grid-search each candidate regressor and return the best fit per model.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training target.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    random_state : int or None, default 42
        Seed for the tree-based estimators so repeated runs select the same
        models. Pass ``None` to leave them unseeded.

    Returns
    -------
    dict
        Maps each model name to the ``best_estimator_`` of its grid search
        (refitted on the full training data by ``GridSearchCV``).
    """
    # Estimator and search grid are kept side by side, and built once instead
    # of on every loop iteration.
    #
    # Criterion names use the 'squared_error' / 'absolute_error' spelling for
    # both tree models (the Random Forest grid already required it); the older
    # 'mse' / 'mae' aliases are rejected by newer scikit-learn releases.
    #
    # max_features=None means "use all features", which is what the removed
    # 'auto' option meant for regressors.
    candidates = {
        'Linear Regression': (
            LinearRegression(),
            [{}],  # nothing to tune: a single fit with default parameters
        ),
        'SVR': (
            SVR(),
            {
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
                'C': [10, 100],
                'gamma': ['auto', 'scale'],
            },
        ),
        'Decision Tree': (
            DecisionTreeRegressor(random_state=random_state),
            {
                'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
                'max_features': [None, 'sqrt', 'log2'],
                'splitter': ['best', 'random'],
            },
        ),
        'Random Forest': (
            RandomForestRegressor(random_state=random_state),
            {
                'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
                'max_features': [None, 'sqrt', 'log2'],
                'n_estimators': [10, 100],
            },
        ),
    }

    best_models = {}
    for name, (estimator, grid) in candidates.items():
        search = GridSearchCV(estimator, grid, cv=cv, scoring='r2')
        search.fit(X_train, y_train)
        best_models[name] = search.best_estimator_

    return best_models

# Which model is best, according to r2_score

In [6]:
# Predict with every model and tabulate its R2 score in a DataFrame.
def evaluate_models(models, X_test, y_test):
    """Score each fitted model on the test split.

    Parameters
    ----------
    models : dict
        Maps a model name to a fitted estimator exposing ``predict``.
    X_test : array-like
        Test features passed to each model's ``predict``.
    y_test : array-like
        True target values compared against the predictions.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``'Model'`` and ``'R2 Score'``, in the
        iteration order of ``models``.
    """
    model_names = list(models.keys())
    scores = [
        r2_score(y_test, fitted.predict(X_test))
        for fitted in models.values()
    ]
    return pd.DataFrame({'Model': model_names, 'R2 Score': scores})

# Save model

In [24]:
def save_model(model, filename, mode='wb'):
    """Pickle ``model`` to ``filename``.

    Parameters
    ----------
    model : object
        Any picklable object (an estimator, a scaler, a dict of models, ...).
    filename : str or path-like
        Destination file.
    mode : str, default 'wb'
        Mode passed to ``open``; it must be a binary write mode for pickling
        to succeed.
    """
    with open(filename, mode) as sink:
        pickle.Pickler(sink).dump(model)

In [8]:
# Load the pre-processed e-commerce dataset from CSV.
data = pd.read_csv("pre-E_comm.csv", index_col=None)
df2 = data
# One-hot encode the categorical columns; drop_first drops one dummy per
# category.
df2 = pd.get_dummies(df2, drop_first=True)
# X is the set of independent variables. The `columns=` keyword is used
# because passing the axis positionally (`drop('Churn', 1)`) is rejected by
# recent pandas releases.
X = df2.drop(columns='Churn')
# y is the dependent variable.
y = df2['Churn']
# Split into X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = split_data(X, y)
# Standardize the features (scaler fitted on the training split only).
X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)
# Keep the 5 best features according to f_regression.
X_train_selected, X_test_selected, selector = select_k_best(X_train_scaled, X_test_scaled, y_train, k=5)
# Grid-search every candidate model and keep the best estimator of each.
best_models = train_models(X_train_selected, y_train)

# Compare the tuned models on the held-out split.
results_df = evaluate_models(best_models, X_test_selected, y_test)
print(results_df)






               Model  R2 Score
0  Linear Regression  0.198120
1                SVR  0.262687
2      Decision Tree  0.599522
3      Random Forest  0.604271


In [9]:
# Names of the columns kept by the feature selector (positions are looked up
# in X.columns, whose order matches the matrix the selector was fitted on).
selected_features = X.columns[selector.get_support(indices=True)]
print('Selected Features:', selected_features)

Selected Features: Index(['Tenure', 'Complain', 'CashbackAmount', 'PreferedOrderCat_Mobile Phone',
       'MaritalStatus_Single'],
      dtype='object')


In [25]:
# Persist every fitted artifact needed at prediction time: the tuned models,
# the scaler and the feature selector.
save_model(model=best_models, filename='best_models.pkl')
save_model(model=scaler, filename='scaler.pkl')
save_model(model=selector, filename='selector.pkl', mode='wb')

In [26]:
# Reload the persisted artifacts.
# NOTE: unpickling executes arbitrary code, so only load files you produced
# yourself or otherwise trust.

with open('best_models.pkl', 'rb') as file:
    loaded_best_models = pickle.loads(file.read())

with open('scaler.pkl', 'rb') as file:
    loaded_scaler = pickle.loads(file.read())

with open('selector.pkl', 'rb') as file:
    loaded_selector = pickle.loads(file.read())
    

In [27]:
# Deployment phase: align the new observation(s) with the training columns.
# reindex orders the columns like X.columns and fills any column missing from
# X_new with 0.
# NOTE(review): X_new is not defined anywhere in the visible notebook, and this
# line reads X_new before assigning it, so it only works with leftover kernel
# state. Presumably X_new is a DataFrame built in a cell that was removed --
# TODO add a cell that defines it before this one.

X_new = X_new.reindex(columns=X.columns, fill_value=0)

In [28]:
# Run the new data through the *loaded* artifacts only, in the same order as
# during training: scale, select features, then predict.
X_new_scaled = loaded_scaler.transform(X_new)
# Use the selector restored from disk (not the in-memory `selector`) so this
# cell also works in a fresh kernel where only the pickles are available.
X_new_selected = loaded_selector.transform(X_new_scaled)
# Predict with the tuned Random Forest.
predictions = loaded_best_models['Random Forest'].predict(X_new_selected)


print("Predicted Churn Status:", predictions)


Predicted Churn Status: [0.44259524]
